aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.github/CODE_OF_CONDUCT.md7
-rw-r--r--.github/actions/docker-build/action.yml47
-rw-r--r--.github/actions/docker-clean-dangling/action.yml18
-rw-r--r--.github/actions/docker-clean-image/action.yml19
-rw-r--r--.github/workflows/cancel-redundant-workflows.yml23
-rw-r--r--.github/workflows/compliance.yml47
-rw-r--r--.github/workflows/docker-images/README.md21
-rw-r--r--.github/workflows/docker-images/dco-check/Dockerfile13
-rw-r--r--.github/workflows/docker-images/dco-check/README.md16
-rwxr-xr-x.github/workflows/docker-images/dco-check/entrypoint.sh29
-rw-r--r--.github/workflows/docker-images/utils.sh28
-rw-r--r--.github/workflows/docker-images/yocto-builder/Dockerfile39
-rw-r--r--.github/workflows/docker-images/yocto-builder/README.md16
-rwxr-xr-x.github/workflows/docker-images/yocto-builder/entrypoint-build.sh64
-rwxr-xr-x.github/workflows/docker-images/yocto-builder/entrypoint-yocto-check-layer.sh33
-rw-r--r--.github/workflows/mirror.yml22
-rw-r--r--.github/workflows/yocto-builds.yml89
-rw-r--r--.github/workflows/yocto-layer.yml57
-rw-r--r--.readthedocs.yaml9
-rw-r--r--README.md51
-rw-r--r--classes/sdcard_image-rpi.bbclass48
-rw-r--r--conf/layer.conf6
-rw-r--r--conf/machine/include/rpi-base.inc72
-rw-r--r--conf/machine/include/rpi-default-providers.inc4
-rw-r--r--conf/machine/include/rpi-default-settings.inc4
-rw-r--r--conf/machine/include/rpi-default-versions.inc3
-rw-r--r--conf/machine/include/tune-arm1176jzf-s.inc12
-rw-r--r--conf/machine/raspberrypi-armv7.conf39
-rw-r--r--conf/machine/raspberrypi-armv8.conf45
-rw-r--r--conf/machine/raspberrypi-cm.conf2
-rw-r--r--conf/machine/raspberrypi-cm3.conf2
-rw-r--r--conf/machine/raspberrypi.conf4
-rw-r--r--conf/machine/raspberrypi0-2w-64.conf17
-rw-r--r--conf/machine/raspberrypi0-2w.conf13
-rw-r--r--conf/machine/raspberrypi0.conf2
-rw-r--r--conf/machine/raspberrypi2.conf4
-rw-r--r--conf/machine/raspberrypi3-64.conf11
-rw-r--r--conf/machine/raspberrypi3.conf2
-rw-r--r--conf/machine/raspberrypi4-64.conf9
-rw-r--r--conf/machine/raspberrypi4.conf6
-rw-r--r--conf/machine/raspberrypi5.conf26
-rw-r--r--docs/conf.py14
-rw-r--r--docs/contributing.md10
-rw-r--r--docs/extra-apps.md2
-rw-r--r--docs/extra-build-config.md191
-rw-r--r--docs/index.rst1
-rw-r--r--docs/ipcompliance.md23
-rw-r--r--docs/layer-contents.md26
-rw-r--r--docs/requirements.txt2
-rw-r--r--dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch175
-rw-r--r--dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend5
-rw-r--r--dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend6
-rw-r--r--dynamic-layers/meta-python/recipes-core/packagegroups/packagegroup-rpi-test.bbappend4
-rw-r--r--dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb10
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch42
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch271
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb44
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend2
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend2
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch25
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch27
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch83
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch13826
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch19
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch19
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch53
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch26
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch30
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch236
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch33
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch124
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch61
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch43
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch60
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch149
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch43
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch36
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch97
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch59
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch59
-rw-r--r--dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb165
-rw-r--r--dynamic-layers/networking-layer/recipes-support/drbd/drbd_%.bbappend4
-rw-r--r--dynamic-layers/openembedded-layer/recipes-core/packagegroups/packagegroup-meta-oe.bbappend2
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb (renamed from recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb)11
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb)5
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb)5
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb)5
-rw-r--r--dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb)5
-rw-r--r--dynamic-layers/qt5-layer/recipes-qt/qt5/qtbase_%.bbappend20
-rw-r--r--img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.pngbin0 -> 277209 bytes
-rw-r--r--img/balena.pngbin7307 -> 5841 bytes
-rw-r--r--kas-poky-rpi.yml12
-rw-r--r--lib/oeqa/runtime/cases/parselogs_rpi.py6
-rw-r--r--recipes-bsp/bootfiles/rpi-bootfiles.bb11
-rw-r--r--recipes-bsp/bootfiles/rpi-cmdline.bb33
-rw-r--r--recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch55
-rw-r--r--recipes-bsp/bootfiles/rpi-config_git.bb99
-rw-r--r--recipes-bsp/common/raspberrypi-firmware.inc10
-rw-r--r--recipes-bsp/common/raspberrypi-tools.inc12
-rw-r--r--recipes-bsp/formfactor/formfactor_%.bbappend2
-rw-r--r--recipes-bsp/gpio-shutdown/files/bind_gpio_shutdown.tab2
-rw-r--r--recipes-bsp/gpio-shutdown/files/gpio-shutdown-keymap.sh13
-rw-r--r--recipes-bsp/gpio-shutdown/gpio-shutdown.bb31
-rw-r--r--recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb62
-rw-r--r--recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in4
-rw-r--r--recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb3
-rw-r--r--recipes-bsp/u-boot/files/0001-dm-core-Move-ofdata_to_platdata-call-earlier.patch51
-rw-r--r--recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch52
-rw-r--r--recipes-bsp/u-boot/u-boot_%.bbappend14
-rw-r--r--recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch13
-rw-r--r--recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch16
-rw-r--r--recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch13
-rw-r--r--recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch25
-rw-r--r--recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch31
-rw-r--r--recipes-connectivity/bluez5/bluez5_%.bbappend8
-rw-r--r--recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch2
-rw-r--r--recipes-connectivity/pi-bluetooth/pi-bluetooth_0.1.17.bb20
-rw-r--r--recipes-core/images/rpi-basic-image.bb15
-rw-r--r--recipes-core/images/rpi-hwup-image.bb11
-rw-r--r--recipes-core/images/rpi-test-image.bb2
-rw-r--r--recipes-core/packagegroups/packagegroup-core-tools-testapps.bbappend2
-rw-r--r--recipes-core/packagegroups/packagegroup-rpi-test.bb8
-rw-r--r--recipes-core/psplash/files/framebuf.conf4
-rw-r--r--recipes-core/psplash/psplash_%.bbappend14
-rw-r--r--recipes-core/udev/udev-rules-rpi.bb7
-rw-r--r--recipes-core/udev/udev-rules-rpi/99-com.rules21
-rw-r--r--recipes-core/udev/udev-rules-udisks-rpi_1.0.bb2
-rw-r--r--recipes-devtools/bcm2835/bcm2835_1.52.bb42
-rw-r--r--recipes-devtools/bcm2835/bcm2835_1.73.bb49
-rw-r--r--recipes-devtools/pi-blaster/pi-blaster_git.bb10
-rw-r--r--recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb (renamed from recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.4.bb)7
-rw-r--r--recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb (renamed from recipes-devtools/python/python3-adafruit-platformdetect_3.1.1.bb)7
-rw-r--r--recipes-devtools/python/python3-adafruit-pureio_1.1.9.bb (renamed from recipes-devtools/python/python3-adafruit-pureio_1.1.8.bb)6
-rw-r--r--recipes-devtools/python/python3-rtimu/0001-setup.py-Port-to-use-setuptools.patch29
-rw-r--r--recipes-devtools/python/python3-rtimu_7.2.1.bb (renamed from recipes-devtools/python/python3-rtimu_git.bb)4
-rw-r--r--recipes-devtools/python/rpi-gpio_0.7.0.bb18
-rw-r--r--recipes-devtools/python/rpi-gpio_0.7.1.bb15
-rw-r--r--recipes-devtools/python/rpio/0001-include-sys-types.h-explicitly-for-getting-caddr_t-d.patch30
-rw-r--r--recipes-devtools/python/rpio_0.10.1.bb (renamed from recipes-devtools/python/rpio_0.10.0.bb)15
-rw-r--r--recipes-devtools/raspi-gpio/raspi-gpio_git.bb2
-rw-r--r--recipes-graphics/cairo/cairo_%.bbappend2
-rw-r--r--recipes-graphics/kmscube/kmscube_%.bbappend2
-rw-r--r--recipes-graphics/libsdl2/libsdl2_%.bbappend4
-rw-r--r--recipes-graphics/libva/libva_%.bbappend2
-rw-r--r--recipes-graphics/mesa/libglu_%.bbappend2
-rw-r--r--recipes-graphics/mesa/mesa-demos_%.bbappend5
-rw-r--r--recipes-graphics/mesa/mesa-gl_%.bbappend8
-rw-r--r--recipes-graphics/mesa/mesa_%.bbappend4
-rw-r--r--recipes-graphics/piglit/piglit_%.bbappend4
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch2
-rw-r--r--recipes-graphics/raspidmx/raspidmx_git.bb4
-rw-r--r--recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch2
-rw-r--r--recipes-graphics/userland/files/0001-mmal-Do-not-use-Werror.patch33
-rw-r--r--recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch2
-rw-r--r--recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch2
-rw-r--r--recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch2
-rw-r--r--recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch2
-rw-r--r--recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch2
-rw-r--r--recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch2
-rw-r--r--recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch2
-rw-r--r--recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch2
-rw-r--r--recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch2
-rw-r--r--recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch2
-rw-r--r--recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch2
-rw-r--r--recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch2
-rw-r--r--recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch2
-rw-r--r--recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch2
-rw-r--r--recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch2
-rw-r--r--recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch2
-rw-r--r--recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch2
-rw-r--r--recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch2
-rw-r--r--recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch2
-rw-r--r--recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch725
-rw-r--r--recipes-graphics/userland/userland_git.bb36
-rw-r--r--recipes-graphics/vc-graphics/vc-graphics.inc8
-rw-r--r--recipes-graphics/wayland/wayland_%.bbappend2
-rw-r--r--recipes-graphics/wayland/weston-init.bbappend10
-rw-r--r--recipes-graphics/wayland/weston_%.bbappend4
-rw-r--r--recipes-graphics/xorg-xserver/xserver-xf86-config_%.bbappend8
-rw-r--r--recipes-graphics/xorg-xserver/xserver-xorg_%.bbappend4
-rw-r--r--recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb66
-rw-r--r--recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch28
-rw-r--r--recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb156
-rw-r--r--recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch35
-rw-r--r--recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch50
-rw-r--r--recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch94
-rw-r--r--recipes-kernel/linux/files/default-cpu-governor.cfg9
-rw-r--r--recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg1
-rw-r--r--recipes-kernel/linux/files/rpi.scc1
-rw-r--r--recipes-kernel/linux/files/wm8960.cfg2
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-dev.bb2
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7.inc13
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb6
-rw-r--r--recipes-kernel/linux/linux-raspberrypi.inc34
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_5.10.bb19
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_5.15.bb32
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_5.4.bb23
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_6.1.bb31
-rw-r--r--recipes-kernel/linux/linux-raspberrypi_6.6.bb31
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch2
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-omx_%.bbappend10
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend4
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-base_%.bbappend6
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend1
-rw-r--r--recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.18.%.bbappend1
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch16
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch2
-rw-r--r--recipes-multimedia/omxplayer/omxplayer_git.bb26
-rw-r--r--recipes-multimedia/picamera-libs/picamera-libs.bb26
-rw-r--r--recipes-multimedia/python3-picamera/python3-picamera_git.bb22
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch292
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch34
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch30
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch68341
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff22
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch82
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch111
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch45
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch35
-rw-r--r--recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb198
-rw-r--r--recipes-multimedia/x264/x264_%.bbappend4
-rw-r--r--recipes-sato/libwpe_%.bbappend2
-rw-r--r--wic/sdimage-raspberrypi.wks2
240 files changed, 87896 insertions, 1061 deletions
diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..f3e3d70
--- /dev/null
+++ b/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,7 @@
+## Code of Conduct
+
+This project has adopted the [Contributor
+Covenant](https://www.contributor-covenant.org/). For details, see the full
+text [here](https://www.contributor-covenant.org/version/2/1/code_of_conduct/).
+For more information, additional questions or comments contact the project's
+maintainers.
diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml
new file mode 100644
index 0000000..b91668e
--- /dev/null
+++ b/.github/actions/docker-build/action.yml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: "Build a docker image"
+
+inputs:
+ docker_image:
+ required: true
+ description: "The name of the docker image"
+ id:
+ required: true
+ description: "Namespace for the image"
+
+runs:
+ using: "composite"
+ steps:
+ - name: Build the ${{ inputs.docker_image }} docker image
+ shell: bash
+ # We run this unconditionally even if the change doesn't touch the
+ # relevant docker files because there is a chance that another PR (or
+ # something else) rebuilt the local image. For example if the first
+ # version of the PR included change for the relevant docker image but a
+ # subsequent push to the PR branch dropped them. In this way we rebuild
+ # the image to avoid using the changes from the previous push.
+ run: |
+ cd .github/workflows/docker-images/
+ # We build a temporary image namespaced by the PR number so we can
+ # handle multiple runners on the same host using the same docker
+ # storage.
+ tries=3
+ n=1
+ until [ "$n" -gt "$tries" ]; do
+ echo "Building the docker image ${{ inputs.docker_image }}-${{ inputs.id }}... try $n..."
+ if docker build . -f "${{ inputs.docker_image }}/Dockerfile" -t "${{ inputs.docker_image }}-${{ inputs.id }}"; then
+ # This can fail if a dangling-image cleanup job runs in
+ # parallel, so we try a few times to minimize conflicts.
+ # While building, docker first creates an untagged (dangling)
+ # image and only tags it at the end. If a dangling cleanup
+ # happens between these two operations, the build fails.
+ break
+ fi
+ n=$((n+1))
+ done
+ # n exceeds tries only when every attempt failed; a success on the last try leaves n == tries.
+ [ "$n" -le "$tries" ] || { echo "Failed to build the docker image after $tries tries."; exit 1; }
+ echo "Temporary image built in ${{ inputs.docker_image }}."
diff --git a/.github/actions/docker-clean-dangling/action.yml b/.github/actions/docker-clean-dangling/action.yml
new file mode 100644
index 0000000..90595c8
--- /dev/null
+++ b/.github/actions/docker-clean-dangling/action.yml
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: "Cleanup dangling docker images"
+
+runs:
+ using: "composite"
+ steps:
+ - name: Cleanup dangling images
+ shell: bash
+ run: |
+ echo -n "Cleanup dangling images... "
+ if ! docker rmi $(docker images --filter "dangling=true" -q --no-trunc) > /dev/null 2>&1; then
+ echo "no dangling images found."
+ else
+ echo "done."
+ fi
diff --git a/.github/actions/docker-clean-image/action.yml b/.github/actions/docker-clean-image/action.yml
new file mode 100644
index 0000000..dfc18d9
--- /dev/null
+++ b/.github/actions/docker-clean-image/action.yml
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: "Cleanup docker storage"
+
+inputs:
+ docker_image:
+ required: true
+ description: "The name of the docker image"
+
+runs:
+ using: "composite"
+ steps:
+ - name: Cleanup temporary image
+ shell: bash
+ run: |
+ echo "Cleanup ${{ inputs.docker_image }} image..."
+ docker rmi "${{ inputs.docker_image }}" || true
diff --git a/.github/workflows/cancel-redundant-workflows.yml b/.github/workflows/cancel-redundant-workflows.yml
new file mode 100644
index 0000000..556317d
--- /dev/null
+++ b/.github/workflows/cancel-redundant-workflows.yml
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: Cancel redundant workflows
+
+on:
+ workflow_run:
+ workflows:
+ - "Builds"
+ - "Compliance"
+ - "Yocto Compatible"
+ types:
+ - requested
+
+jobs:
+ cancel-redundant-workflows:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: styfle/cancel-workflow-action@0.10.0
+ with:
+ all_but_latest: true
+ workflow_id: ${{ github.event.workflow.id }}
diff --git a/.github/workflows/compliance.yml b/.github/workflows/compliance.yml
new file mode 100644
index 0000000..ec489f0
--- /dev/null
+++ b/.github/workflows/compliance.yml
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: Compliance
+
+on:
+ pull_request:
+
+jobs:
+ dco:
+ name: DCO
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout the code
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Build a temporary DCO image
+ uses: ./.github/actions/docker-build
+ with:
+ docker_image: dco-check
+ id: ${{ github.event.number }}
+ - name: Do DCO check
+ run: |
+ docker run --rm -v "$GITHUB_WORKSPACE:/work:ro" \
+ --env "BASE_REF=$GITHUB_BASE_REF" \
+ "dco-check-${{ github.event.number }}"
+ - name: Cleanup temporary docker image
+ uses: ./.github/actions/docker-clean-image
+ with:
+ docker_image: dco-check-${{ github.event.number }}
+ if: always()
+ - name: Cleanup dangling docker images
+ uses: ./.github/actions/docker-clean-dangling
+ if: always()
+ reuse:
+ name: reuse
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout the code
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Do reuse check
+ continue-on-error: true
+ uses: fsfe/reuse-action@v1
diff --git a/.github/workflows/docker-images/README.md b/.github/workflows/docker-images/README.md
new file mode 100644
index 0000000..86cfddc
--- /dev/null
+++ b/.github/workflows/docker-images/README.md
@@ -0,0 +1,21 @@
+<!--
+SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+# Docker images for CI
+
+Each directory contains the files for a docker image.
+
+## Building an image
+
+When building a docker image, the build context is expected to be where this
+`README.md` file resides. This means that building the images will require
+passing the appropriate `-f` argument.
+
+Here is an example for building the `dco-check` image:
+
+```
+docker build . -f dco-check/Dockerfile -t dco-check
+```
diff --git a/.github/workflows/docker-images/dco-check/Dockerfile b/.github/workflows/docker-images/dco-check/Dockerfile
new file mode 100644
index 0000000..89901ae
--- /dev/null
+++ b/.github/workflows/docker-images/dco-check/Dockerfile
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+FROM christophebedard/dco-check:latest
+
+# Run under normal user called 'ci'
+RUN useradd --create-home --uid 1000 --shell /usr/bin/bash ci
+USER ci
+
+COPY ./dco-check/entrypoint.sh /
+COPY ./utils.sh /
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/.github/workflows/docker-images/dco-check/README.md b/.github/workflows/docker-images/dco-check/README.md
new file mode 100644
index 0000000..bf53241
--- /dev/null
+++ b/.github/workflows/docker-images/dco-check/README.md
@@ -0,0 +1,16 @@
+<!--
+SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+# Docker image for DCO checks
+
+This image provides the environment and the logic of running a DCO check
+against a repository.
+
+## Configuration
+
+The `entrypoint.sh` script assumes at runtime that the repository to be checked
+is available under `/work`. This path is to be populated via bind mounts when
+running the container.
diff --git a/.github/workflows/docker-images/dco-check/entrypoint.sh b/.github/workflows/docker-images/dco-check/entrypoint.sh
new file mode 100755
index 0000000..af2c507
--- /dev/null
+++ b/.github/workflows/docker-images/dco-check/entrypoint.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+set -e
+
+# shellcheck disable=SC1091
+. /utils.sh
+
+GIT_REPO_PATH="/work"
+
+[ -n "$BASE_REF" ] ||
+ error "DCO checks needs to know the target branch. Make sure that is set in BASE_REF."
+[ -d "$GIT_REPO_PATH/.git" ] ||
+ error "Can't find a git checkout under $GIT_REPO_PATH ."
+cd "$GIT_REPO_PATH"
+
+# The GitHub runner user and the container user might differ making git error
+# out with:
+# error: fatal: detected dubious ownership in repository at '/work'
+# Avoid this as the security risk is minimum here while guarding the git hooks
+# via PRs.
+git config --global --add safe.directory /work
+
+dco-check \
+ --verbose \
+ --default-branch "origin/$BASE_REF"
diff --git a/.github/workflows/docker-images/utils.sh b/.github/workflows/docker-images/utils.sh
new file mode 100644
index 0000000..66bdb09
--- /dev/null
+++ b/.github/workflows/docker-images/utils.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+_log() {
+ _level="$1"
+ _msg="$2"
+ echo "[$_level] $_msg"
+}
+
+error() {
+ _msg="$1"
+ _log "ERR" "$1"
+ exit 1
+}
+
+warn() {
+ _msg="$1"
+ _log "WRN" "$1"
+ exit 1
+}
+
+log() {
+ _msg="$1"
+ _log "LOG" "$1"
+}
diff --git a/.github/workflows/docker-images/yocto-builder/Dockerfile b/.github/workflows/docker-images/yocto-builder/Dockerfile
new file mode 100644
index 0000000..87221b9
--- /dev/null
+++ b/.github/workflows/docker-images/yocto-builder/Dockerfile
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+FROM ubuntu:20.04
+
+ARG DEBIAN_FRONTEND="noninteractive"
+RUN apt-get update -qq
+RUN apt-get install -y eatmydata
+
+# Yocto/OE build host dependencies
+# Keep this in sync with
+# https://git.yoctoproject.org/poky/tree/documentation/poky.yaml
+RUN eatmydata apt-get install -qq -y \
+ gawk wget git diffstat unzip texinfo gcc build-essential chrpath \
+ socat cpio python3 python3-pip python3-pexpect xz-utils debianutils \
+ iputils-ping python3-git python3-jinja2 libegl1-mesa libsdl1.2-dev \
+ pylint3 xterm python3-subunit mesa-common-dev zstd liblz4-tool
+
+# en_US.UTF-8 is required by the build system
+RUN eatmydata apt-get install -qq -y locales \
+ && echo "en_US.UTF-8 UTF-8" > /etc/locale.gen \
+ && locale-gen
+ENV LANG en_US.utf8
+
+RUN eatmydata apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Have bash as shell
+RUN echo "dash dash/sh boolean false" | debconf-set-selections \
+ && dpkg-reconfigure dash
+
+# Run under normal user called 'ci'
+RUN useradd --create-home --uid 1000 --shell /usr/bin/bash ci
+USER ci
+WORKDIR /home/ci
+
+COPY ./yocto-builder/entrypoint-yocto-check-layer.sh /
+COPY ./yocto-builder/entrypoint-build.sh /
+COPY ./utils.sh /
diff --git a/.github/workflows/docker-images/yocto-builder/README.md b/.github/workflows/docker-images/yocto-builder/README.md
new file mode 100644
index 0000000..6336fb8
--- /dev/null
+++ b/.github/workflows/docker-images/yocto-builder/README.md
@@ -0,0 +1,16 @@
+<!--
+SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+# Docker image for builds
+
+This defines the docker image for running Yocto/OE based operations/builds. It
+provides multiple scripts for driving different operations.
+
+## Configuration
+
+The `entrypoint` scripts assume at runtime that the repository to drive the
+operation against is available under `/work`. This path is to be populated via
+bind mounts when running the container.
diff --git a/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh b/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh
new file mode 100755
index 0000000..65999d0
--- /dev/null
+++ b/.github/workflows/docker-images/yocto-builder/entrypoint-build.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+set -ex # -e: exit on errors, -x: trace every command in the job log
+
+# shellcheck disable=SC1091
+. /utils.sh # presumably defines the error/log helpers called below - they are not defined in this script
+
+META_RASPBERRYPI_PATH="/work" # where the layer checkout is expected; added as a layer further down
+
+[ -n "$BASE_REF" ] ||
+ error "Target branch is needed. Make sure that is set in BASE_REF."
+[ -d "$META_RASPBERRYPI_PATH/.git" ] ||
+ error "Can't find a git checkout under $META_RASPBERRYPI_PATH ."
+[ -n "$MACHINE" ] ||
+ error "Machine to be used for build not provided."
+[ -n "$IMAGE" ] ||
+ error "Image to build not provided."
+
+TEMP_DIR="$(mktemp -d)" # fresh scratch directory; the clone below lands in it
+cd "$TEMP_DIR"
+
+REPOS=" \
+ git://git.yoctoproject.org/poky.git \
+"
+for repo in $REPOS; do # REPOS is a whitespace-separated list, hence unquoted
+ log "Cloning $repo on branch $BASE_REF..."
+ git clone --depth 1 --branch "$BASE_REF" "$repo"
+done
+
+# shellcheck disable=SC1091,SC2240
+. ./poky/oe-init-build-env build
+
+# Build configuration: append CI settings to conf/local.conf (SSTATE_DIR, DL_DIR and DISTRO only when set)
+printf "\n# ------ ci ------\n" >> conf/local.conf
+[ -z "$SSTATE_DIR" ] || echo SSTATE_DIR = \""$SSTATE_DIR"\" >> conf/local.conf
+[ -z "$DL_DIR" ] || echo DL_DIR = \""$DL_DIR"\" >> conf/local.conf
+[ -z "$DISTRO" ] || echo DISTRO = \""$DISTRO"\" >> conf/local.conf
+cat <<EOCONF >>conf/local.conf
+BB_NUMBER_THREADS = "6"
+PARALLEL_MAKE = "-j 6"
+# unmerged-usr is deprecated
+# https://lore.kernel.org/all/3f2f03085301d22854e5429019fb010f27d98bc7.camel@linuxfoundation.org/t/
+DISTRO_FEATURES:append = " systemd usrmerge"
+VIRTUAL-RUNTIME_init_manager = "systemd"
+DISTRO_FEATURES_BACKFILL_CONSIDERED:append = " sysvinit"
+VIRTUAL-RUNTIME_initscripts = "systemd-compat-units"
+LICENSE_FLAGS_ACCEPTED = "synaptics-killswitch"
+EOCONF
+
+# Add the BSP layer (the checkout under $META_RASPBERRYPI_PATH) to the build
+bitbake-layers add-layer "$META_RASPBERRYPI_PATH"
+
+# Log configs for debugging purposes
+for f in 'conf/local.conf' 'conf/bblayers.conf'; do
+ printf "\n------ %s ------\n" "$f"
+ cat "$f"
+done
+
+# Build $IMAGE for $MACHINE
+MACHINE="$MACHINE" bitbake "$IMAGE"
diff --git a/.github/workflows/docker-images/yocto-builder/entrypoint-yocto-check-layer.sh b/.github/workflows/docker-images/yocto-builder/entrypoint-yocto-check-layer.sh
new file mode 100755
index 0000000..474a24e
--- /dev/null
+++ b/.github/workflows/docker-images/yocto-builder/entrypoint-yocto-check-layer.sh
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+set -ex # -e: exit on errors, -x: trace every command in the job log
+
+# shellcheck disable=SC1091
+. /utils.sh # presumably defines the error/log helpers called below - they are not defined in this script
+
+GIT_REPO_PATH="/work" # where the layer checkout is expected; passed to yocto-check-layer below
+
+[ -n "$BASE_REF" ] ||
+ error "Target branch is needed. Make sure that is set in BASE_REF."
+[ -d "$GIT_REPO_PATH/.git" ] ||
+ error "Can't find a git checkout under $GIT_REPO_PATH ."
+
+TEMP_DIR="$(mktemp -d)" # fresh scratch directory; the clone below lands in it
+cd "$TEMP_DIR"
+
+REPOS=" \
+ git://git.yoctoproject.org/poky.git \
+"
+for repo in $REPOS; do # REPOS is a whitespace-separated list, hence unquoted
+ log "Cloning $repo on branch $BASE_REF..."
+ git clone --depth 1 --branch "$BASE_REF" "$repo"
+done
+
+# shellcheck disable=SC1091,SC2240
+. ./poky/oe-init-build-env build
+yocto-check-layer --with-software-layer-signature-check --debug \
+ "$GIT_REPO_PATH"
diff --git a/.github/workflows/mirror.yml b/.github/workflows/mirror.yml
new file mode 100644
index 0000000..d9e3cde
--- /dev/null
+++ b/.github/workflows/mirror.yml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+name: Mirrors
+
+on: [ push, delete, create ] # run on pushes and on ref creation/deletion
+
+concurrency:
+ group: git-mirror-me # every run shares this one concurrency group
+
+jobs:
+ yocto-mirror:
+ name: Yocto Git Mirror
+ runs-on: ubuntu-latest
+ steps:
+ - uses: agherzan/git-mirror-me-action@11f54c7186724daafbe5303b5075954f1a19a63e # pinned to a full commit SHA
+ env:
+ GMM_SSH_PRIVATE_KEY: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_PRIVATE_KEY }}
+ GMM_SSH_KNOWN_HOSTS: ${{ secrets.YOCTO_META_RASPBERRYPI_SSH_KNOWN_HOSTS }}
+ GMM_DST_REPO: "ssh://git@push.yoctoproject.org/meta-raspberrypi" # presumably the mirror destination - confirm against the action's docs
+ GMM_DEBUG: "1"
diff --git a/.github/workflows/yocto-builds.yml b/.github/workflows/yocto-builds.yml
new file mode 100644
index 0000000..408d25e
--- /dev/null
+++ b/.github/workflows/yocto-builds.yml
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+---
+
+name: Builds
+
+on:
+ pull_request:
+
+jobs:
+ build:
+ strategy:
+ fail-fast: true
+ matrix:
+ machine:
+ - raspberrypi
+ - raspberrypi0-2w-64
+ - raspberrypi0-2w
+ - raspberrypi0
+ - raspberrypi0-wifi
+ - raspberrypi2
+ - raspberrypi3-64
+ - raspberrypi3
+ - raspberrypi4-64
+ - raspberrypi4
+ - raspberrypi5
+ - raspberrypi-cm3
+ - raspberrypi-cm
+ - raspberrypi-armv7
+ - raspberrypi-armv8
+ image: [rpi-test-image]
+ distro: [poky]
+ runs-on: [self-hosted, Linux]
+ name: ${{ matrix.machine }}/${{ matrix.image }}/${{ matrix.distro }}/systemd # distro taken from the matrix, matching the DISTRO handed to the build
+ env:
+ DL_DIR: /var/lib/ci/yocto/downloads
+ SSTATE_DIR: /var/lib/ci/yocto/sstate
+ steps:
+ - name: Checkout the code
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Define Yocto build files
+ id: changed-files-specific # referenced by the `if:` conditions of the later steps
+ uses: tj-actions/changed-files@v24 # NOTE(review): mutable tag; mirror.yml pins its action to a commit SHA - consider the same here
+ with:
+ files: |
+ .github/actions/**
+ .github/workflows/docker-images/yocto-builder/**
+ .github/workflows/docker-images/*.sh
+ .github/workflows/yocto-builds.yml
+ classes/**
+ conf/**
+ dynamic-layers/**
+ files/**
+ lib/**
+ recipes-**
+ wic/**
+ - name: Build a temporary yocto-builder image
+ uses: ./.github/actions/docker-build
+ with:
+ docker_image: yocto-builder
+ id: ${{ github.event.number }} # the run step below addresses the image as yocto-builder-<this id>
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
+ - name: Build the image # runs /entrypoint-build.sh inside the temporary image
+ run: |
+ docker run --rm \
+ -v "$GITHUB_WORKSPACE:/work:ro" \
+ -v "$DL_DIR:$DL_DIR:rw" \
+ -v "$SSTATE_DIR:$SSTATE_DIR:rw" \
+ --env "BASE_REF=$GITHUB_BASE_REF" \
+ --env "MACHINE=${{ matrix.machine }}" \
+ --env "DISTRO=${{ matrix.distro }}" \
+ --env "IMAGE=${{ matrix.image }}" \
+ --env "DL_DIR=$DL_DIR" \
+ --env "SSTATE_DIR=$SSTATE_DIR" \
+ "yocto-builder-${{ github.event.number }}" \
+ /entrypoint-build.sh
+ if: steps.changed-files-specific.outputs.any_changed == 'true' # skipped when none of the filtered paths changed
+ - name: Cleanup temporary docker image
+ uses: ./.github/actions/docker-clean-image
+ with:
+ docker_image: yocto-builder-${{ github.event.number }}
+ if: always() # clean up even when an earlier step failed or was skipped
+ - name: Cleanup dangling docker images
+ uses: ./.github/actions/docker-clean-dangling
+ if: always() # clean up even when an earlier step failed or was skipped
diff --git a/.github/workflows/yocto-layer.yml b/.github/workflows/yocto-layer.yml
new file mode 100644
index 0000000..fa11815
--- /dev/null
+++ b/.github/workflows/yocto-layer.yml
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+---
+
+name: Yocto Compatible
+
+on:
+ pull_request:
+
+jobs:
+ yocto-check-layer:
+ name: Validate with yocto-check-layer
+ runs-on: [self-hosted, Linux]
+ steps:
+ - name: Checkout the code
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+ - name: Define Yocto build files
+ id: changed-files-specific # referenced by the `if:` conditions of the later steps
+ uses: tj-actions/changed-files@v24 # NOTE(review): mutable tag; consider pinning to a full commit SHA
+ with:
+ files: |
+ .github/actions/**
+ .github/workflows/docker-images/yocto-builder/**
+ .github/workflows/docker-images/*.sh
+ .github/workflows/yocto-layer.yml
+ classes/**
+ conf/**
+ dynamic-layers/**
+ files/**
+ lib/**
+ recipes-**
+ wic/**
+ - name: Build a temporary yocto-builder image
+ uses: ./.github/actions/docker-build
+ with:
+ docker_image: yocto-builder
+ id: ${{ github.event.number }} # the run step below addresses the image as yocto-builder-<this id>
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
+ - name: Run yocto-check-layer # runs /entrypoint-yocto-check-layer.sh inside the temporary image
+ run: |
+ docker run --rm -v "$GITHUB_WORKSPACE:/work:ro" \
+ --env "BASE_REF=$GITHUB_BASE_REF" \
+ "yocto-builder-${{ github.event.number }}" \
+ /entrypoint-yocto-check-layer.sh
+ if: steps.changed-files-specific.outputs.any_changed == 'true' # skipped when none of the filtered paths changed
+ - name: Cleanup temporary docker image
+ uses: ./.github/actions/docker-clean-image
+ with:
+ docker_image: yocto-builder-${{ github.event.number }}
+ if: always() # clean up even when an earlier step failed or was skipped
+ - name: Cleanup dangling docker images
+ uses: ./.github/actions/docker-clean-dangling
+ if: always() # clean up even when an earlier step failed or was skipped
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..5e8dc20
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,9 @@
+version: 2 # Read the Docs configuration file format version
+
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.7" # NOTE(review): Python 3.7 is end-of-life - confirm the docs build still needs it
+python:
+ install:
+ - requirements: docs/requirements.txt # requirements file installed for the docs build
diff --git a/README.md b/README.md
index 7e95cc4..f00d151 100644
--- a/README.md
+++ b/README.md
@@ -2,22 +2,31 @@
Yocto BSP layer for the Raspberry Pi boards - <http://www.raspberrypi.org/>.
-[![Build Status](https://yocto-ci.resin.io/job/meta-raspberrypi1/badge/icon)](https://yocto-ci.resin.io/job/meta-raspberrypi1)
-[![Build Status](https://yocto-ci.resin.io/job/meta-raspberrypi2/badge/icon)](https://yocto-ci.resin.io/job/meta-raspberrypi2)
-[![Build Status](https://yocto-ci.resin.io/job/meta-raspberrypi3/badge/icon)](https://yocto-ci.resin.io/job/meta-raspberrypi3)
-[![Build Status](https://yocto-ci.resin.io/job/meta-raspberrypi4/badge/icon)](https://yocto-ci.resin.io/job/meta-raspberrypi4)
[![Documentation Status](https://readthedocs.org/projects/meta-raspberrypi/badge/?version=latest)](https://meta-raspberrypi.readthedocs.io/en/latest/?badge=latest)
-[![Matrix](https://img.shields.io/matrix/meta-raspberrypi:cub.icu.svg?server_fqdn=matrix.cub.icu)](https://matrix.to/#/#meta-raspberrypi:cub.icu)
-
-| | |
-|:-: | :-: |
-| Build server sponsored by | [![balena.io](./img/balena.png)](https://www.balena.io/). |
+[![Matrix](https://img.shields.io/badge/chat-meta--raspberrypi-brightgreen)](https://matrix.to/#/#meta-raspberrypi:matrix.org)
+
+<table border="0" rules="none">
+<tr border="0">
+<td width="140" height="100" align="center">
+ <br />
+ <a href="https://www.yoctoproject.org/ecosystem/branding/">
+ <img alt="Yocto Project Layer Compatible" src="img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png">
+ </a>
+</td>
+<td width="150" height="100" align="center">
+ Sponsored by:<br />
+ <a href="https://balena.io">
+ <img alt="balena.io" src="img/balena.png">
+ </a>
+</td>
+</tr>
+</table>
## Quick links
* Git repository web frontend:
<https://github.com/agherzan/meta-raspberrypi>
-* Mailing list (yocto mailing list): <yocto@yoctoproject.org>
+* Mailing list (yocto mailing list): <yocto@lists.yoctoproject.org>
* Issues management (Github Issues):
<https://github.com/agherzan/meta-raspberrypi/issues>
* Documentation: <http://meta-raspberrypi.readthedocs.io/en/latest/>
@@ -35,6 +44,12 @@ OpenEmbedded/Yocto distributions and layer stacks, such as:
* Yoe Disto (Video and Camera Products).
* Yocto/Poky (main focus of testing).
+## Yocto Project Compatible Layer
+
+This layer is officially approved as part of the `Yocto Project Compatible
+Layers Program`. You can find details of that on the official Yocto Project
+[website](https://www.yoctoproject.org/software-overview/layers/?searchTerm=meta-raspberrypi).
+
## Dependencies
This layer depends on:
@@ -43,11 +58,6 @@ This layer depends on:
* branch: master
* revision: HEAD
-* URI: git://git.openembedded.org/meta-openembedded
- * layers: meta-oe, meta-multimedia, meta-networking, meta-python
- * branch: master
- * revision: HEAD
-
## Quick Start
1. source poky/oe-init-build-env rpi-build
@@ -77,8 +87,15 @@ local_conf_header:
To configure the machine, you have to update the `machine` variable.
And the same for the `distro`.
-For further information, you can read more at <https://kas.readthedocs.io/en/1.0/index.html>
+For further information, you can read more at <https://kas.readthedocs.io/en/latest/index.html>
+
+## Contributing
+
+You can send patches using the GitHub pull request process and/or through the
+Yocto mailing list. Refer to the
+[documentation](https://meta-raspberrypi.readthedocs.io/en/latest/contributing.html)
+for more information.
## Maintainers
-* Andrei Gherzan `<andrei at gherzan.ro>`
+* Andrei Gherzan `<andrei at gherzan.com>`
diff --git a/classes/sdcard_image-rpi.bbclass b/classes/sdcard_image-rpi.bbclass
index e803a0d..ddcd69d 100644
--- a/classes/sdcard_image-rpi.bbclass
+++ b/classes/sdcard_image-rpi.bbclass
@@ -23,12 +23,7 @@ inherit image_types
# 0 4MiB 4MiB + 48MiB 4MiB + 48Mib + SDIMG_ROOTFS
# This image depends on the rootfs image
-IMAGE_TYPEDEP_rpi-sdimg = "${SDIMG_ROOTFS_TYPE}"
-
-# Kernel image name
-SDIMG_KERNELIMAGE_raspberrypi ?= "kernel.img"
-SDIMG_KERNELIMAGE_raspberrypi2 ?= "kernel7.img"
-SDIMG_KERNELIMAGE_raspberrypi3-64 ?= "kernel8.img"
+IMAGE_TYPEDEP:rpi-sdimg = "${SDIMG_ROOTFS_TYPE}"
# Boot partition volume id
# Shorten raspberrypi to just rpi to keep it under 11 characters
@@ -65,7 +60,7 @@ do_image_rpi_sdimg[depends] = " \
do_image_rpi_sdimg[recrdeps] = "do_build"
# SD card image name
-SDIMG = "${IMGDEPLOYDIR}/${IMAGE_NAME}${IMAGE_NAME_SUFFIX}.rpi-sdimg"
+SDIMG = "${IMGDEPLOYDIR}/${IMAGE_NAME}.rpi-sdimg"
# Additional files and/or directories to be copied into the vfat partition from the IMAGE_ROOTFS.
FATPAYLOAD ?= ""
@@ -75,19 +70,7 @@ SDIMG_VFAT_DEPLOY ?= "${RPI_USE_U_BOOT}"
SDIMG_VFAT = "${IMAGE_NAME}.vfat"
SDIMG_LINK_VFAT = "${IMGDEPLOYDIR}/${IMAGE_LINK_NAME}.vfat"
-def split_overlays(d, out, ver=None):
- dts = d.getVar("KERNEL_DEVICETREE")
- # Device Tree Overlays are assumed to be suffixed by '-overlay.dtb' (4.1.x) or by '.dtbo' (4.4.9+) string and will be put in a dedicated folder
- if out:
- overlays = oe.utils.str_filter_out('\S+\-overlay\.dtb$', dts, d)
- overlays = oe.utils.str_filter_out('\S+\.dtbo$', overlays, d)
- else:
- overlays = oe.utils.str_filter('\S+\-overlay\.dtb$', dts, d) + \
- " " + oe.utils.str_filter('\S+\.dtbo$', dts, d)
-
- return overlays
-
-IMAGE_CMD_rpi-sdimg () {
+IMAGE_CMD:rpi-sdimg () {
# Align partitions
BOOT_SPACE_ALIGNED=$(expr ${BOOT_SPACE} + ${IMAGE_ROOTFS_ALIGNMENT} - 1)
@@ -97,7 +80,7 @@ IMAGE_CMD_rpi-sdimg () {
echo "Creating filesystem with Boot partition ${BOOT_SPACE_ALIGNED} KiB and RootFS $ROOTFS_SIZE KiB"
# Check if we are building with device tree support
- DTS="${KERNEL_DEVICETREE}"
+ DTS="${@make_dtb_boot_files(d)}"
# Initialize sdcard image file
dd if=/dev/zero of=${SDIMG} bs=1024 count=0 seek=${SDIMG_SIZE}
@@ -120,17 +103,20 @@ IMAGE_CMD_rpi-sdimg () {
mcopy -v -i ${WORKDIR}/boot.img -s ${DEPLOY_DIR_IMAGE}/armstubs/${ARMSTUB} ::/ || bbfatal "mcopy cannot copy ${DEPLOY_DIR_IMAGE}/armstubs/${ARMSTUB} into boot.img"
fi
if test -n "${DTS}"; then
- # Copy board device trees to root folder
- for dtbf in ${@split_overlays(d, True)}; do
- dtb=`basename $dtbf`
- mcopy -v -i ${WORKDIR}/boot.img -s ${DEPLOY_DIR_IMAGE}/$dtb ::$dtb || bbfatal "mcopy cannot copy ${DEPLOY_DIR_IMAGE}/$dtb into boot.img"
- done
-
- # Copy device tree overlays to dedicated folder
+ # Copy board device trees (including overlays)
+ # There is an assumption here - no DTB in other directories than root
+ # and root/overlays. mmd/mcopy are not very flexible tools.
mmd -i ${WORKDIR}/boot.img overlays
- for dtbf in ${@split_overlays(d, False)}; do
- dtb=`basename $dtbf`
- mcopy -v -i ${WORKDIR}/boot.img -s ${DEPLOY_DIR_IMAGE}/$dtb ::overlays/$dtb || bbfatal "mcopy cannot copy ${DEPLOY_DIR_IMAGE}/$dtb into boot.img"
+ for entry in ${DTS} ; do
+ # Split entry at optional ';'
+ if [ $(echo "$entry" | grep -c \;) = "0" ] ; then
+ DEPLOY_FILE="$entry"
+ DEST_FILENAME="$entry"
+ else
+ DEPLOY_FILE="$(echo "$entry" | cut -f1 -d\;)"
+ DEST_FILENAME="$(echo "$entry" | cut -f2- -d\;)"
+ fi
+ mcopy -v -i ${WORKDIR}/boot.img -s ${DEPLOY_DIR_IMAGE}/${DEPLOY_FILE} ::${DEST_FILENAME} || bbfatal "mcopy cannot copy ${DEPLOY_DIR_IMAGE}/${DEPLOY_FILE} into boot.img"
done
fi
if [ "${RPI_USE_U_BOOT}" = "1" ]; then
diff --git a/conf/layer.conf b/conf/layer.conf
index 2518379..9488ac9 100644
--- a/conf/layer.conf
+++ b/conf/layer.conf
@@ -9,7 +9,7 @@ BBFILE_COLLECTIONS += "raspberrypi"
BBFILE_PATTERN_raspberrypi := "^${LAYERDIR}/"
BBFILE_PRIORITY_raspberrypi = "9"
-LAYERSERIES_COMPAT_raspberrypi = "hardknott honister"
+LAYERSERIES_COMPAT_raspberrypi = "nanbield scarthgap"
LAYERDEPENDS_raspberrypi = "core"
# Additional license directories.
@@ -36,5 +36,5 @@ BBFILES_DYNAMIC += " \
multimedia-layer:${LAYERDIR}/dynamic-layers/multimedia-layer/*/*/*.bbappend \
"
-DEFAULT_TEST_SUITES_remove_rpi = "parselogs"
-DEFAULT_TEST_SUITES_append_rpi = " parselogs_rpi"
+DEFAULT_TEST_SUITES:remove:rpi = "parselogs"
+DEFAULT_TEST_SUITES:append:rpi = " parselogs_rpi"
diff --git a/conf/machine/include/rpi-base.inc b/conf/machine/include/rpi-base.inc
index a800078..a5fd1a4 100644
--- a/conf/machine/include/rpi-base.inc
+++ b/conf/machine/include/rpi-base.inc
@@ -18,24 +18,38 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/overlay_map.dtb \
overlays/at86rf233.dtbo \
overlays/disable-bt.dtbo \
+ overlays/disable-wifi.dtbo \
overlays/dwc2.dtbo \
overlays/gpio-ir.dtbo \
overlays/gpio-ir-tx.dtbo \
overlays/gpio-key.dtbo \
+ overlays/gpio-poweroff.dtbo \
+ overlays/gpio-shutdown.dtbo \
overlays/hifiberry-amp.dtbo \
+ overlays/hifiberry-amp100.dtbo \
+ overlays/hifiberry-amp3.dtbo \
+ overlays/hifiberry-amp4pro.dtbo \
overlays/hifiberry-dac.dtbo \
overlays/hifiberry-dacplus.dtbo \
+ overlays/hifiberry-dacplusadc.dtbo \
+ overlays/hifiberry-dacplusadcpro.dtbo \
+ overlays/hifiberry-dacplusdsp.dtbo \
+ overlays/hifiberry-dacplushd.dtbo \
+ overlays/hifiberry-digi-pro.dtbo \
overlays/hifiberry-digi.dtbo \
overlays/justboom-both.dtbo \
overlays/justboom-dac.dtbo \
overlays/justboom-digi.dtbo \
+ overlays/i2c-gpio.dtbo \
overlays/i2c-rtc.dtbo \
overlays/imx219.dtbo \
overlays/imx477.dtbo \
+ overlays/imx708.dtbo \
overlays/iqaudio-dac.dtbo \
overlays/iqaudio-dacplus.dtbo \
overlays/mcp2515-can0.dtbo \
overlays/mcp2515-can1.dtbo \
+ overlays/mcp3008.dtbo \
overlays/miniuart-bt.dtbo \
overlays/pitft22.dtbo \
overlays/pitft28-capacitive.dtbo \
@@ -45,28 +59,34 @@ RPI_KERNEL_DEVICETREE_OVERLAYS ?= " \
overlays/rpi-ft5406.dtbo \
overlays/rpi-poe.dtbo \
overlays/vc4-fkms-v3d.dtbo \
+ overlays/vc4-fkms-v3d-pi4.dtbo \
overlays/vc4-kms-v3d.dtbo \
overlays/vc4-kms-v3d-pi4.dtbo \
+ overlays/vc4-kms-v3d-pi5.dtbo \
overlays/vc4-kms-dsi-7inch.dtbo \
overlays/w1-gpio.dtbo \
overlays/w1-gpio-pullup.dtbo \
+ overlays/wm8960-soundcard.dtbo \
"
RPI_KERNEL_DEVICETREE ?= " \
- bcm2708-rpi-zero.dtb \
- bcm2708-rpi-zero-w.dtb \
- bcm2708-rpi-b.dtb \
- bcm2708-rpi-b-rev1.dtb \
- bcm2708-rpi-b-plus.dtb \
- bcm2709-rpi-2-b.dtb \
- bcm2710-rpi-2-b.dtb \
- bcm2710-rpi-3-b.dtb \
- bcm2710-rpi-3-b-plus.dtb \
- bcm2711-rpi-4-b.dtb \
- bcm2711-rpi-400.dtb \
- bcm2708-rpi-cm.dtb \
- bcm2710-rpi-cm3.dtb \
- bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2708-rpi-zero.dtb \
+ broadcom/bcm2708-rpi-zero-w.dtb \
+ broadcom/bcm2708-rpi-b.dtb \
+ broadcom/bcm2708-rpi-b-rev1.dtb \
+ broadcom/bcm2708-rpi-b-plus.dtb \
+ broadcom/bcm2709-rpi-2-b.dtb \
+ broadcom/bcm2710-rpi-2-b.dtb \
+ broadcom/bcm2710-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-3-b-plus.dtb \
+ broadcom/bcm2710-rpi-zero-2.dtb \
+ broadcom/bcm2711-rpi-4-b.dtb \
+ broadcom/bcm2711-rpi-400.dtb \
+ broadcom/bcm2708-rpi-cm.dtb \
+ broadcom/bcm2710-rpi-cm3.dtb \
+ broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
+ broadcom/bcm2712-rpi-5-b.dtb \
"
KERNEL_DEVICETREE ??= " \
@@ -95,6 +115,7 @@ MACHINE_FEATURES_BACKFILL_CONSIDERED = "rtc"
MACHINE_EXTRA_RRECOMMENDS += "kernel-modules udev-rules-rpi"
MACHINE_ESSENTIAL_EXTRA_RRECOMMENDS += "${@oe.utils.conditional('ENABLE_I2C', '1', 'kernel-module-i2c-dev kernel-module-i2c-bcm2708', '', d)}"
MACHINE_ESSENTIAL_EXTRA_RRECOMMENDS += "${@oe.utils.conditional('ENABLE_IR', '1', 'kernel-module-gpio-ir kernel-module-gpio-ir-tx', '', d)}"
+MACHINE_ESSENTIAL_EXTRA_RRECOMMENDS += "${@oe.utils.conditional('ENABLE_GPIO_SHUTDOWN', '1', 'gpio-shutdown kernel-module-gpio-keys', '', d)}"
SERIAL_CONSOLES_CHECK ??= "${SERIAL_CONSOLES}"
@@ -102,14 +123,14 @@ SERIAL_CONSOLES_CHECK ??= "${SERIAL_CONSOLES}"
# This variable is referred to by recipes fetching / generating the files.
BOOTFILES_DIR_NAME ?= "bootfiles"
-# Set Raspberrypi splash image
-SPLASH ?= "psplash-raspberrypi"
-
def make_dtb_boot_files(d):
# Generate IMAGE_BOOT_FILES entries for device tree files listed in
# KERNEL_DEVICETREE.
alldtbs = d.getVar('KERNEL_DEVICETREE')
- imgtyp = d.getVar('KERNEL_IMAGETYPE')
+
+ # DTBs may be built out of kernel with devicetree.bbclass
+ if not alldtbs:
+ return ''
def transform(dtb):
base = os.path.basename(dtb)
@@ -127,14 +148,21 @@ def make_dtb_boot_files(d):
return ' '.join([transform(dtb) for dtb in alldtbs.split(' ') if dtb])
+RPI_EXTRA_IMAGE_BOOT_FILES ?= " \
+ ${@bb.utils.contains('RPI_USE_U_BOOT', '1', \
+ '${KERNEL_IMAGETYPE} u-boot.bin;${SDIMG_KERNELIMAGE} boot.scr', \
+ '${KERNEL_IMAGETYPE};${SDIMG_KERNELIMAGE}', d)} \
+ "
IMAGE_BOOT_FILES ?= "${BOOTFILES_DIR_NAME}/* \
${@make_dtb_boot_files(d)} \
- ${@bb.utils.contains('RPI_USE_U_BOOT', '1', \
- '${KERNEL_IMAGETYPE} u-boot.bin;${SDIMG_KERNELIMAGE} boot.scr', \
- '${KERNEL_IMAGETYPE};${SDIMG_KERNELIMAGE}', d)} \
+ ${RPI_EXTRA_IMAGE_BOOT_FILES} \
"
+
+EXTRA_IMAGEDEPENDS += "rpi-bootfiles"
+
do_image_wic[depends] += " \
+ virtual/kernel:do_deploy \
rpi-bootfiles:do_deploy \
${@bb.utils.contains('RPI_USE_U_BOOT', '1', 'u-boot:do_deploy', '',d)} \
"
@@ -143,4 +171,4 @@ do_image_wic[recrdeps] = "do_build"
# The kernel image is installed into the FAT32 boot partition and does not need
# to also be installed into the rootfs.
-RDEPENDS_${KERNEL_PACKAGE_NAME}-base = ""
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base = ""
diff --git a/conf/machine/include/rpi-default-providers.inc b/conf/machine/include/rpi-default-providers.inc
index c02d248..3f81026 100644
--- a/conf/machine/include/rpi-default-providers.inc
+++ b/conf/machine/include/rpi-default-providers.inc
@@ -7,6 +7,10 @@ PREFERRED_PROVIDER_virtual/libgles2 ?= "${@bb.utils.contains("MACHINE_FEATURES",
PREFERRED_PROVIDER_virtual/libgl ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
PREFERRED_PROVIDER_virtual/mesa ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
PREFERRED_PROVIDER_virtual/libgbm ?= "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "mesa", "mesa-gl", d)}"
+PREFERRED_PROVIDER_vlc ?= "rpidistro-vlc"
+PREFERRED_PROVIDER_ffmpeg ?= "rpidistro-ffmpeg"
+PREFERRED_PROVIDER_libav ?= "rpidistro-ffmpeg"
+PREFERRED_PROVIDER_libpostproc ?= "rpidistro-ffmpeg"
PREFERRED_PROVIDER_jpeg ?= "jpeg"
PREFERRED_PROVIDER_virtual/libomxil ?= "userland"
diff --git a/conf/machine/include/rpi-default-settings.inc b/conf/machine/include/rpi-default-settings.inc
index bb18496..b788f14 100644
--- a/conf/machine/include/rpi-default-settings.inc
+++ b/conf/machine/include/rpi-default-settings.inc
@@ -5,5 +5,5 @@ IMAGE_CLASSES += "sdcard_image-rpi"
# RPI kernel has errors of its own which should be filtered
# therefore use parselogs_rpi test instead of parselogs from oe-core
#
-DEFAULT_TEST_SUITES_append_rpi = " parselogs_rpi"
-DEFAULT_TEST_SUITES_remove_rpi = " parselogs"
+DEFAULT_TEST_SUITES:append:rpi = " parselogs_rpi"
+DEFAULT_TEST_SUITES:remove:rpi = " parselogs"
diff --git a/conf/machine/include/rpi-default-versions.inc b/conf/machine/include/rpi-default-versions.inc
index 17d5bd6..6def274 100644
--- a/conf/machine/include/rpi-default-versions.inc
+++ b/conf/machine/include/rpi-default-versions.inc
@@ -1,3 +1,4 @@
# RaspberryPi BSP default versions
-PREFERRED_VERSION_linux-raspberrypi ??= "5.10.%"
+PREFERRED_VERSION_linux-raspberrypi ??= "6.6.%"
+PREFERRED_VERSION_linux-raspberrypi-v7 ??= "${PREFERRED_VERSION_linux-raspberrypi}"
diff --git a/conf/machine/include/tune-arm1176jzf-s.inc b/conf/machine/include/tune-arm1176jzf-s.inc
index b9e0377..ce5f08e 100644
--- a/conf/machine/include/tune-arm1176jzf-s.inc
+++ b/conf/machine/include/tune-arm1176jzf-s.inc
@@ -7,11 +7,11 @@ TUNE_CCARGS .= "${@bb.utils.contains('TUNE_FEATURES', 'arm1176jzfs', ' -mcpu=arm
MACHINEOVERRIDES =. "${@bb.utils.contains('TUNE_FEATURES', 'arm1176jzfs', 'armv6:', '', d)}"
AVAILTUNES += "arm1176jzfs"
-ARMPKGARCH_tune-arm1176jzfs = "arm1176jzfs"
-TUNE_FEATURES_tune-arm1176jzfs = "arm thumb vfp arm1176jzfs"
-PACKAGE_EXTRA_ARCHS_tune-arm1176jzfs = "${PACKAGE_EXTRA_ARCHS_tune-armv6} arm1176jzfs"
+ARMPKGARCH:tune-arm1176jzfs = "arm1176jzfs"
+TUNE_FEATURES:tune-arm1176jzfs = "arm thumb vfp arm1176jzfs"
+PACKAGE_EXTRA_ARCHS:tune-arm1176jzfs = "${PACKAGE_EXTRA_ARCHS:tune-armv6} arm1176jzfs"
AVAILTUNES += "arm1176jzfshf"
-ARMPKGARCH_tune-arm1176jzfshf = "${ARMPKGARCH_tune-arm1176jzfs}"
-TUNE_FEATURES_tune-arm1176jzfshf = "${TUNE_FEATURES_tune-arm1176jzfs} callconvention-hard"
-PACKAGE_EXTRA_ARCHS_tune-arm1176jzfshf = "${PACKAGE_EXTRA_ARCHS_tune-armv6thf} arm1176jzfshf-vfp"
+ARMPKGARCH:tune-arm1176jzfshf = "${ARMPKGARCH:tune-arm1176jzfs}"
+TUNE_FEATURES:tune-arm1176jzfshf = "${TUNE_FEATURES:tune-arm1176jzfs} callconvention-hard"
+PACKAGE_EXTRA_ARCHS:tune-arm1176jzfshf = "${PACKAGE_EXTRA_ARCHS:tune-armv6thf} arm1176jzfshf-vfp"
diff --git a/conf/machine/raspberrypi-armv7.conf b/conf/machine/raspberrypi-armv7.conf
new file mode 100644
index 0000000..cb2e5a2
--- /dev/null
+++ b/conf/machine/raspberrypi-armv7.conf
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+#@TYPE: Machine
+#@NAME: RaspberryPi Development Boards (32bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi boards in 32 bit mode
+
+DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
+include conf/machine/include/rpi-base.inc
+
+# This machine includes by default the kernel for v7l. We hook in support for
+# v7.
+RASPBERRYPI_v7_KERNEL = "linux-raspberrypi-v7"
+RASPBERRYPI_v7_KERNEL_PACKAGE_NAME = "kernel-v7"
+RASPBERRYPI_v7_KERNEL_FILE ?= "kernel7.img"
+# We don't need a lot for v7l because it is the default provider,
+# virtual/kernel.
+RASPBERRYPI_v7l_KERNEL_FILE ?= "kernel7l.img"
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43430 \
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ linux-firmware-rpidistro-bcm43455 \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm43430a1-hcd \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+# FIXME: This machine doesn't support u-boot (yet)
+RPI_EXTRA_IMAGE_BOOT_FILES = " \
+ ${KERNEL_IMAGETYPE};${RASPBERRYPI_v7l_KERNEL_FILE} \
+ ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}/${KERNEL_IMAGETYPE};${RASPBERRYPI_v7_KERNEL_FILE} \
+"
diff --git a/conf/machine/raspberrypi-armv8.conf b/conf/machine/raspberrypi-armv8.conf
new file mode 100644
index 0000000..0128bdc
--- /dev/null
+++ b/conf/machine/raspberrypi-armv8.conf
@@ -0,0 +1,45 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+#@TYPE: Machine
+#@NAME: RaspberryPi Development Boards (64bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi boards in 64 bit mode
+
+require conf/machine/include/arm/armv8a/tune-cortexa53.inc
+include conf/machine/include/rpi-base.inc
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43430 \
+ linux-firmware-rpidistro-bcm43455 \
+ linux-firmware-rpidistro-bcm43456 \
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ bluez-firmware-rpidistro-bcm43430a1-hcd \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+RPI_KERNEL_DEVICETREE = " \
+ broadcom/bcm2710-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-3-b-plus.dtb \
+ broadcom/bcm2837-rpi-3-b.dtb \
+ broadcom/bcm2710-rpi-cm3.dtb \
+ broadcom/bcm2710-rpi-zero-2.dtb \
+ broadcom/bcm2711-rpi-4-b.dtb \
+ broadcom/bcm2711-rpi-400.dtb \
+ broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
+ broadcom/bcm2712-rpi-5-b.dtb \
+"
+
+SDIMG_KERNELIMAGE ?= "kernel8.img"
+KERNEL_IMAGETYPE_UBOOT ?= "Image"
+KERNEL_IMAGETYPE_DIRECT ?= "Image"
+KERNEL_BOOTCMD ?= "booti"
+UBOOT_MACHINE = "rpi_arm64_config"
+SERIAL_CONSOLES ?= "115200;ttyS0"
+
+VC4DTBO ?= "vc4-fkms-v3d"
diff --git a/conf/machine/raspberrypi-cm.conf b/conf/machine/raspberrypi-cm.conf
index f9371df..365d030 100644
--- a/conf/machine/raspberrypi-cm.conf
+++ b/conf/machine/raspberrypi-cm.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi Compute Module (CM1)
#@DESCRIPTION: Machine configuration for the RaspberryPi Compute Module (CM1)
-MACHINEOVERRIDES = "raspberrypi:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi:"
include conf/machine/raspberrypi.conf
ARMSTUB ?= "armstub.bin"
diff --git a/conf/machine/raspberrypi-cm3.conf b/conf/machine/raspberrypi-cm3.conf
index f1b8151..2ffdfaf 100644
--- a/conf/machine/raspberrypi-cm3.conf
+++ b/conf/machine/raspberrypi-cm3.conf
@@ -3,7 +3,7 @@
#@DESCRIPTION: Machine configuration for the RaspberryPi Compute Module 3 (CM3)
DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
-require conf/machine/include/tune-cortexa7.inc
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
include conf/machine/include/rpi-base.inc
SDIMG_KERNELIMAGE ?= "kernel7.img"
diff --git a/conf/machine/raspberrypi.conf b/conf/machine/raspberrypi.conf
index b23687b..05263d7 100644
--- a/conf/machine/raspberrypi.conf
+++ b/conf/machine/raspberrypi.conf
@@ -7,8 +7,8 @@ DEFAULTTUNE ?= "arm1176jzfshf"
require conf/machine/include/tune-arm1176jzf-s.inc
include conf/machine/include/rpi-base.inc
-SERIAL_CONSOLES ?= "115200;ttyAMA0"
-
+SDIMG_KERNELIMAGE ?= "kernel.img"
UBOOT_MACHINE = "rpi_config"
+SERIAL_CONSOLES ?= "115200;ttyAMA0"
ARMSTUB ?= "armstub.bin"
diff --git a/conf/machine/raspberrypi0-2w-64.conf b/conf/machine/raspberrypi0-2w-64.conf
new file mode 100644
index 0000000..0264107
--- /dev/null
+++ b/conf/machine/raspberrypi0-2w-64.conf
@@ -0,0 +1,17 @@
+#@TYPE: Machine
+#@NAME: RaspberryPi0 2 Wifi Development Board
+#@DESCRIPTION: Machine configuration for the RaspberryPi0 2 Wifi in 64 bits mode
+
+MACHINEOVERRIDES =. "raspberrypi3-64:"
+
+include conf/machine/raspberrypi3-64.conf
+
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+"
+
+RPI_KERNEL_DEVICETREE = " \
+ broadcom/bcm2710-rpi-zero-2.dtb \
+ "
diff --git a/conf/machine/raspberrypi0-2w.conf b/conf/machine/raspberrypi0-2w.conf
new file mode 100644
index 0000000..f3a4c4d
--- /dev/null
+++ b/conf/machine/raspberrypi0-2w.conf
@@ -0,0 +1,13 @@
+#@TYPE: Machine
+#@NAME: RaspberryPi0 2 Wifi Development Board
+#@DESCRIPTION: Machine configuration for the RaspberryPi0 2 Wifi in 32 bits mode
+
+MACHINEOVERRIDES =. "raspberrypi3:"
+
+include conf/machine/raspberrypi3.conf
+
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43436 \
+ linux-firmware-rpidistro-bcm43436s \
+ bluez-firmware-rpidistro-bcm43430b0-hcd \
+"
diff --git a/conf/machine/raspberrypi0.conf b/conf/machine/raspberrypi0.conf
index 80297b5..597918a 100644
--- a/conf/machine/raspberrypi0.conf
+++ b/conf/machine/raspberrypi0.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi Zero Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi Zero board (https://www.raspberrypi.org/blog/raspberry-pi-zero)
-MACHINEOVERRIDES = "raspberrypi:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi:"
include conf/machine/raspberrypi.conf
SERIAL_CONSOLES ?= "115200;ttyAMA0"
diff --git a/conf/machine/raspberrypi2.conf b/conf/machine/raspberrypi2.conf
index 505c6f2..8cb859e 100644
--- a/conf/machine/raspberrypi2.conf
+++ b/conf/machine/raspberrypi2.conf
@@ -4,11 +4,11 @@
DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
-require conf/machine/include/tune-cortexa7.inc
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
include conf/machine/include/rpi-base.inc
+SDIMG_KERNELIMAGE ?= "kernel7.img"
SERIAL_CONSOLES ?= "115200;ttyAMA0"
-
UBOOT_MACHINE = "rpi_2_config"
ARMSTUB ?= "armstub7.bin"
diff --git a/conf/machine/raspberrypi3-64.conf b/conf/machine/raspberrypi3-64.conf
index 5394132..50dd533 100644
--- a/conf/machine/raspberrypi3-64.conf
+++ b/conf/machine/raspberrypi3-64.conf
@@ -2,7 +2,7 @@
#@NAME: RaspberryPi 3 Development Board
#@DESCRIPTION: Machine configuration for the RaspberryPi 3 in 64 bits mode
-MACHINEOVERRIDES = "raspberrypi3:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi3:"
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43430 \
@@ -11,7 +11,7 @@ MACHINE_EXTRA_RRECOMMENDS += "\
bluez-firmware-rpidistro-bcm4345c0-hcd \
"
-require conf/machine/include/tune-cortexa53.inc
+require conf/machine/include/arm/armv8a/tune-cortexa53.inc
include conf/machine/include/rpi-base.inc
RPI_KERNEL_DEVICETREE = " \
@@ -21,16 +21,15 @@ RPI_KERNEL_DEVICETREE = " \
broadcom/bcm2710-rpi-cm3.dtb \
"
-SERIAL_CONSOLES ?= "115200;ttyS0"
-
-UBOOT_MACHINE = "rpi_arm64_config"
-
+SDIMG_KERNELIMAGE ?= "kernel8.img"
# When u-boot is enabled we need to use the "Image" format and the "booti"
# command to load the kernel
KERNEL_IMAGETYPE_UBOOT ?= "Image"
# "zImage" not supported on arm64 and ".gz" images not supported by bootloader yet
KERNEL_IMAGETYPE_DIRECT ?= "Image"
KERNEL_BOOTCMD ?= "booti"
+UBOOT_MACHINE = "rpi_arm64_config"
+SERIAL_CONSOLES ?= "115200;ttyS0"
VC4DTBO ?= "vc4-fkms-v3d"
ARMSTUB ?= "armstub8.bin"
diff --git a/conf/machine/raspberrypi3.conf b/conf/machine/raspberrypi3.conf
index dafb66e..1212498 100644
--- a/conf/machine/raspberrypi3.conf
+++ b/conf/machine/raspberrypi3.conf
@@ -3,7 +3,7 @@
#@DESCRIPTION: Machine configuration for the RaspberryPi 3 in 32 bits mode
DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
-require conf/machine/include/tune-cortexa7.inc
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
include conf/machine/include/rpi-base.inc
MACHINE_EXTRA_RRECOMMENDS += "\
diff --git a/conf/machine/raspberrypi4-64.conf b/conf/machine/raspberrypi4-64.conf
index 12c8954..42ed4be 100644
--- a/conf/machine/raspberrypi4-64.conf
+++ b/conf/machine/raspberrypi4-64.conf
@@ -2,21 +2,24 @@
#@NAME: RaspberryPi 4 Development Board (64bit)
#@DESCRIPTION: Machine configuration for the RaspberryPi 4 in 64 bits mode
-MACHINEOVERRIDES = "raspberrypi4:${MACHINE}"
+MACHINEOVERRIDES =. "raspberrypi4:"
MACHINE_FEATURES += "pci"
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43455 \
bluez-firmware-rpidistro-bcm4345c0-hcd \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
"
-require conf/machine/include/tune-cortexa72.inc
+require conf/machine/include/arm/armv8a/tune-cortexa72.inc
include conf/machine/include/rpi-base.inc
RPI_KERNEL_DEVICETREE = " \
broadcom/bcm2711-rpi-4-b.dtb \
broadcom/bcm2711-rpi-400.dtb \
broadcom/bcm2711-rpi-cm4.dtb \
+ broadcom/bcm2711-rpi-cm4s.dtb \
"
SDIMG_KERNELIMAGE ?= "kernel8.img"
@@ -24,7 +27,7 @@ SERIAL_CONSOLES ?= "115200;ttyS0"
UBOOT_MACHINE = "rpi_arm64_config"
-VC4DTBO ?= "vc4-fkms-v3d"
+VC4DTBO ?= "vc4-kms-v3d"
# When u-boot is enabled we need to use the "Image" format and the "booti"
# command to load the kernel
diff --git a/conf/machine/raspberrypi4.conf b/conf/machine/raspberrypi4.conf
index d6b1d1b..86c57ed 100644
--- a/conf/machine/raspberrypi4.conf
+++ b/conf/machine/raspberrypi4.conf
@@ -3,13 +3,15 @@
#@DESCRIPTION: Machine configuration for the RaspberryPi 4 in 32 bit mode
DEFAULTTUNE ?= "cortexa7thf-neon-vfpv4"
-require conf/machine/include/tune-cortexa7.inc
+require conf/machine/include/arm/armv7a/tune-cortexa7.inc
include conf/machine/include/rpi-base.inc
MACHINE_FEATURES += "pci"
MACHINE_EXTRA_RRECOMMENDS += "\
linux-firmware-rpidistro-bcm43455 \
bluez-firmware-rpidistro-bcm4345c0-hcd \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
"
# 'l' stands for LPAE
@@ -17,5 +19,5 @@ SDIMG_KERNELIMAGE ?= "kernel7l.img"
UBOOT_MACHINE = "rpi_4_32b_config"
SERIAL_CONSOLES ?= "115200;ttyS0"
-VC4DTBO ?= "vc4-fkms-v3d"
+VC4DTBO ?= "vc4-kms-v3d"
ARMSTUB ?= "armstub7.bin"
diff --git a/conf/machine/raspberrypi5.conf b/conf/machine/raspberrypi5.conf
new file mode 100644
index 0000000..8c38637
--- /dev/null
+++ b/conf/machine/raspberrypi5.conf
@@ -0,0 +1,26 @@
+#@TYPE: Machine
+#@NAME: RaspberryPi 5 Development Board (64bit)
+#@DESCRIPTION: Machine configuration for the RaspberryPi 5 in 64 bits mode
+
+require conf/machine/include/arm/armv8-2a/tune-cortexa76.inc
+include conf/machine/include/rpi-base.inc
+
+MACHINE_FEATURES += "pci"
+MACHINE_EXTRA_RRECOMMENDS += "\
+ linux-firmware-rpidistro-bcm43455 \
+ bluez-firmware-rpidistro-bcm4345c0-hcd \
+ linux-firmware-rpidistro-bcm43456 \
+ bluez-firmware-rpidistro-bcm4345c5-hcd \
+"
+
+RPI_KERNEL_DEVICETREE = " \
+ broadcom/bcm2712-rpi-5-b.dtb \
+"
+
+SDIMG_KERNELIMAGE ?= "kernel_2712.img"
+SERIAL_CONSOLES ?= "115200;ttyAMA10"
+
+VC4DTBO ?= "vc4-kms-v3d"
+
+# "zImage" not supported on arm64 and ".gz" images not supported by bootloader yet
+KERNEL_IMAGETYPE_DIRECT ?= "Image"
diff --git a/docs/conf.py b/docs/conf.py
index fcebbf1..39e7223 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -30,7 +30,10 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
-extensions = []
+extensions = [
+ 'myst_parser',
+ 'sphinx_rtd_theme'
+]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -50,7 +53,7 @@ master_doc = 'index'
# General information about the project.
project = 'meta-raspberrypi'
-copyright = '2017, meta-raspberrypi contributors'
+copyright = '2022, meta-raspberrypi contributors'
author = 'meta-raspberrypi contributors'
# The version info for the project you're documenting, acts as replacement for
@@ -121,6 +124,7 @@ todo_include_todos = False
# a list of builtin themes.
#
# html_theme = 'alabaster'
+html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -154,7 +158,7 @@ todo_include_todos = False
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+#html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
@@ -337,7 +341,3 @@ texinfo_documents = [
# If true, do not generate a @detailmenu in the "Top" node's menu.
#
# texinfo_no_detailmenu = False
-
-source_parsers = {
- '.md': 'recommonmark.parser.CommonMarkParser',
-}
diff --git a/docs/contributing.md b/docs/contributing.md
index 7d18400..9ecb6cb 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -4,8 +4,8 @@
The main communication tool in use is the Yocto Project mailing list:
-* <yocto@yoctoproject.org>
-* <https://lists.yoctoproject.org/listinfo/yocto>
+* <yocto@lists.yoctoproject.org>
+* <https://lists.yoctoproject.org/g/yocto>
Feel free to ask any kind of questions but please always prepend your email
subject with `[meta-raspberrypi]` as this is the global *Yocto* mailing
@@ -84,7 +84,11 @@ sure to use the following formatting for the message subject:
Then, for sending patches to the mailing list, you may use this command:
- git send-email --to yocto@yoctoproject.org <generated patch>
+ git send-email --to yocto@lists.yoctoproject.org <generated patch>
+
+When patches are sent through the mailing list, the maintainer will include
+them in a GitHub pull request that will take the patches through the CI
+workflows. This process happens periodically.
## GitHub issues
diff --git a/docs/extra-apps.md b/docs/extra-apps.md
index 53007dc..dbbfb8d 100644
--- a/docs/extra-apps.md
+++ b/docs/extra-apps.md
@@ -6,4 +6,4 @@ omxplayer depends on libav which has a commercial license. So in order to be
able to compile omxplayer you will need to whiteflag the commercial
license in your local.conf:
- LICENSE_FLAGS_WHITELIST = "commercial"
+ LICENSE_FLAGS_ACCEPTED = "commercial"
diff --git a/docs/extra-build-config.md b/docs/extra-build-config.md
index 1caf904..dfb86f1 100644
--- a/docs/extra-build-config.md
+++ b/docs/extra-build-config.md
@@ -29,7 +29,7 @@ Accommodate the values above to your own needs (ex: ext3 / ext4).
* `GPU_MEM_1024`: GPU memory in megabyte for the 1024MB Raspberry Pi. Ignored by
the 256MB/512MB RP. Overrides gpu_mem. Max 944. Default not set.
-See: <https://www.raspberrypi.org/documentation/configuration/config-txt/memory.md>
+See: <https://www.raspberrypi.com/documentation/computers/config_txt.html#memory-options>
## VC4
@@ -47,7 +47,7 @@ You can supply more licenses separated by comma. Example:
KEY_DECODE_WVC1 = "0x12345678,0xabcdabcd,0x87654321"
-See: <https://www.raspberrypi.org/documentation/configuration/config-txt/codeclicence.md>
+See: <https://www.raspberrypi.com/documentation/computers/config_txt.html#licence-key-and-codec-options>
## Disable overscan
@@ -74,6 +74,22 @@ To remove (or adjust) this delay set these variables in local.conf:
BOOT_DELAY = "0"
BOOT_DELAY_MS = "0"
+## Boot media
+
+The Raspberry Pi 4 board can load the boot image files from SD card and USB memory.
+By default SD card media is used as boot media.
+
+To switch the boot media from SD card to USB memory, the following variables are supported
+in local.conf: `CMDLINE_ROOT_PARTITION` and `BOOT_MEDIA`.
+The default value of `CMDLINE_ROOT_PARTITION` is "/dev/mmcblk0p2" to mount the SD card. If you want to mount a USB memory partition, set `CMDLINE_ROOT_PARTITION` to "/dev/sda2".
+`BOOT_MEDIA` accepts `mmc` and `usb`. The value "mmc" is required to load an image from the SD card, following the u-boot specification. Similarly, if you want to load a boot image file from USB memory, set `BOOT_MEDIA` to "usb".
+
+For example, if you want to use USB boot, please define
+the following parameters in your local.conf file.
+
+ CMDLINE_ROOT_PARTITION = "/dev/sda2"
+ BOOT_MEDIA = "usb"
+
## Set overclocking options
The Raspberry Pi can be overclocked. As of now overclocking up to the "Turbo
@@ -89,7 +105,7 @@ Example official settings for Turbo Mode in Raspberry Pi 2:
SDRAM_FREQ = "500"
OVER_VOLTAGE = "6"
-See: <https://www.raspberrypi.org/documentation/configuration/config-txt/overclocking.md>
+See: <https://www.raspberrypi.com/documentation/computers/config_txt.html#overclocking-options>
## HDMI and composite video options
@@ -99,14 +115,14 @@ selected according to the connected monitor's EDID information and the composite
mode is defaulted to NTSC using a 4:3 aspect ratio. Check the config.txt for a
detailed description of options and modes. The following variables are supported in
local.conf: `HDMI_FORCE_HOTPLUG`, `HDMI_DRIVE`, `HDMI_GROUP`, `HDMI_MODE`,
-`CONFIG_HDMI_BOOST`, `SDTV_MODE`, `SDTV_ASPECT` and `DISPLAY_ROTATE`.
+`HDMI_CVT`, `CONFIG_HDMI_BOOST`, `SDTV_MODE`, `SDTV_ASPECT` and `DISPLAY_ROTATE`.
Example to force HDMI output to 720p in CEA mode:
HDMI_GROUP = "1"
HDMI_MODE = "4"
-See: <https://www.raspberrypi.org/documentation/configuration/config-txt/video.md>
+See: <https://www.raspberrypi.com/documentation/computers/configuration.html#hdmi-configuration>
## Video camera support with V4L2 drivers
@@ -155,11 +171,21 @@ For further customisation the KERNEL_IMAGETYPE and KERNEL_BOOTCMD variables can
be overridden to select the exact kernel image type (eg. zImage) and u-boot
command (eg. bootz) to be used.
+To operate correctly, U-Boot requires `enable_uart=1` in `config.txt` file for
+the following boards:
+* Raspberry Pi Zero W
+* Raspberry Pi 3 32-bit
+* Raspberry Pi 3 64-bit
+* Raspberry Pi 4 32-bit
+* Raspberry Pi 4 64-bit
+This means that, for those boards, `RPI_USE_U_BOOT = "1"` is not compatible with
+`ENABLE_UART = "0"`.
+
## Image with Initramfs
To build an initramfs image:
-* Set this 3 kernel variables (in kernel's do_configure_prepend in linux-raspberrypi.inc after the line kernel_configure_variable LOCALVERSION "\"\""
+* Set these 3 kernel variables (in kernel's do_configure:prepend in linux-raspberrypi.inc after the line kernel_configure_variable LOCALVERSION "\"\""
)
- kernel_configure_variable BLK_DEV_INITRD y
- kernel_configure_variable INITRAMFS_SOURCE ""
@@ -170,7 +196,7 @@ To build an initramfs image:
- `INITRAMFS_IMAGE_BUNDLE = "1"`
- `BOOT_SPACE = "1073741"`
- `INITRAMFS_MAXSIZE = "315400"`
- - `IMAGE_FSTYPES_pn-${INITRAMFS_IMAGE} = "${INITRAMFS_FSTYPES}"`
+ - `IMAGE_FSTYPES:pn-${INITRAMFS_IMAGE} = "${INITRAMFS_FSTYPES}"`
## Including additional files in the SD card image boot partition
@@ -195,7 +221,7 @@ by tasks that image building task must depend upon, to ensure that the
files are available when they are needed, so these component deploy
tasks must be added to: RPI_SDIMG_EXTRA_DEPENDS.
- RPI_SDIMG_EXTRA_DEPENDS_append = " example:do_deploy"
+ RPI_SDIMG_EXTRA_DEPENDS:append = " example:do_deploy"
## Enable SPI bus
@@ -211,7 +237,7 @@ When using device tree kernels, set this variable to enable I2C:
Furthermore, to auto-load I2C kernel modules set:
- KERNEL_MODULE_AUTOLOAD_rpi += "i2c-dev i2c-bcm2708"
+ KERNEL_MODULE_AUTOLOAD:rpi += "i2c-dev i2c-bcm2708"
## Enable PiTFT support
@@ -278,6 +304,18 @@ the header extension should set the following in local.conf:
ENABLE_DWC2_HOST = "1"
+## Set CPUs to be isolated from the standard Linux scheduler
+
+By default Linux will use all available CPUs for scheduling tasks. For real time
+purposes there can be an advantage to isolating one or more CPUs from the
+standard scheduler. It should be noted that CPU 0 is special, it is the only CPU
+available during the early stages of the boot process and cannot be isolated.
+
+The string assigned to this variable may be a single CPU number, a comma
+separated list ("1,2"), a range ("1-3"), or a mixture of these ("1,3-5").
+
+ ISOLATED_CPUS = "1-2"
+
## Enable Openlabs 802.15.4 radio module
When using device tree kernels, set this variable to enable the 802.15.4 hat:
@@ -290,18 +328,25 @@ See: <https://openlabs.co/OSHW/Raspberry-Pi-802.15.4-radio>
In order to use CAN with an MCP2515-based module, set the following variables:
- ENABLE_SPI_BUS = "1"
- ENABLE_CAN = "1"
+ ENABLE_SPI_BUS = "1"
+ ENABLE_CAN = "1"
In case of dual CAN module (e.g. PiCAN2 Duo), set following variables instead:
ENABLE_SPI_BUS = "1"
- ENABLE_DUAL_CAN = "1"
+ ENABLE_DUAL_CAN = "1"
Some modules may require setting the frequency of the crystal oscillator used on the particular board. The frequency is usually marked on the package of the crystal. By default, it is set to 16 MHz. To change that to 8 MHz, the following variable also has to be set:
CAN_OSCILLATOR="8000000"
+Configure the interrupt pin to the one connected to the CAN module. By default,
+the pins are set to 25 for can0 and 24 for can1. To change them to 12 and 16,
+the following variables also have to be set:
+
+ CAN0_INTERRUPT_PIN = "12"
+ CAN1_INTERRUPT_PIN = "16"
+
Tested modules:
* PiCAN2 (16 MHz crystal): <http://skpang.co.uk/catalog/pican2-canbus-board-for-raspberry-pi-23-p-1475.html>
@@ -320,6 +365,38 @@ Appropriate kernel modules will be also included in the image. By default the
GPIO pin for gpio-ir is set to 18 and the pin for gpio-ir-tx is 17. Both pins
can be easily changed by modifying variables `GPIO_IR` and `GPIO_IR_TX`.
+## Enable gpio-shutdown
+
+When using device tree kernels, set this variable to enable gpio-shutdown:
+
+ ENABLE_GPIO_SHUTDOWN = "1"
+
+This will add the corresponding device tree overlay to config.txt and include
+the gpio-keys kernel module in the image. If System V init is used, additional
+mapping is applied to bind the button event to shutdown command. Systemd init
+should handle the event out of the box.
+
+By default the feature uses gpio pin 3 (except RPi 1 Model B rev 1 enumerates
+the pin as gpio 1). This conflicts with the I2C bus. If you set `ENABLE_I2C`
+to `1` or enabled `PiTFT` support, or otherwise want to use another pin, use
+`GPIO_SHUTDOWN_PIN` to assign another pin. Example using gpio pin 25:
+
+ GPIO_SHUTDOWN_PIN = "25"
+
+## Enable One-Wire Interface
+
+One-wire is a single-wire communication bus typically used to connect sensors
+to the Raspberry Pi. The Raspberry Pi supports one-wire on any GPIO pin, but
+the default is GPIO 4. To enable the one-wire interface, explicitly set it in
+`local.conf`:
+
+ ENABLE_W1 = "1"
+
+Once discovery is complete, you can list the devices that your Raspberry Pi has
+discovered on all 1-Wire buses with this command:
+
+`ls /sys/bus/w1/devices/`
+
## Manual additions to config.txt
The `RPI_EXTRA_CONFIG` variable can be used to manually add additional lines to
@@ -335,17 +412,93 @@ option:
# Raspberry Pi 7\" display/touch screen \n \
lcd_rotate=2 \n \
'
-## Enable Raspberrypi Camera V2
+## Enable Raspberry Pi Camera Module
-RaspberryPi does not have the unicam device ( RaspberryPi Camera ) enabled by default.
+Raspberry Pi does not have the unicam device ( Raspberry Pi Camera ) enabled by default.
Because this unicam device ( bcm2835-unicam ) as of now is used by libcamera opensource.
-So we have to explicitly set in local.conf.
+So we have to explicitly enable it in local.conf.
RASPBERRYPI_CAMERA_V2 = "1"
-This will add the device tree overlays imx219 ( RaspberryPi Camera sensor V2 driver ) to config.txt.
-Also, this will enable adding Contiguous Memory Allocation value in the cmdline.txt.
+This will add the device tree overlay imx219 ( Raspberry Pi Camera Module V2 sensor driver
+) to config.txt. Also, this will enable adding Contiguous Memory Allocation value in the
+cmdline.txt.
-Ref.:
-* <https://github.com/raspberrypi/documentation/blob/master/linux/software/libcamera/README.md>
+Similarly, the Raspberry Pi Camera Module v3 also has to be explicitly enabled in local.conf.
+
+ RASPBERRYPI_CAMERA_V3 = "1"
+
+This will add the device tree overlay imx708 ( Raspberry Pi Camera Module V3 sensor driver )
+to config.txt.
+
+See:
+* <https://www.raspberrypi.com/documentation/computers/camera_software.html>
* <https://www.raspberrypi.org/blog/an-open-source-camera-stack-for-raspberry-pi-using-libcamera/>
+
+## WM8960 soundcard support
+
+Support for WM8960 based sound cards such as the WM8960 Hi-Fi Sound Card HAT for Raspberry Pi from Waveshare, and ReSpeaker 2 / 4 / 6 Mics Pi HAT from Seeed Studio, can be enabled in `local.conf`
+
+ MACHINE_FEATURES += "wm8960"
+
+You may need to adjust volume and toggle switches that are off by default
+
+ amixer -c1 sset 'Headphone',0 80%,80%
+ amixer -c1 sset 'Speaker',0 80%,80%
+ amixer -c1 sset 'Left Input Mixer Boost' toggle
+ amixer -c1 sset 'Left Output Mixer PCM' toggle
+ amixer -c1 sset 'Right Input Mixer Boost' toggle
+ amixer -c1 sset 'Right Output Mixer PCM' toggle
+
+Audio capture on ReSpeaker 2 / 4 / 6 Mics Pi HAT from Seeed Studio is very noisy.
+
+## Support for RTC devices
+
+The RaspberryPi boards don't feature an RTC module and the machine
+configurations provided in this BSP layer have this assumption (until, if at
+all, some later boards will come with one).
+
+`rtc` is handled as a `MACHINE_FEATURES` in the context of the build system
+which means that if an attached device is provided for which support is needed,
+the recommended way forward is to write a new machine configuration based on an
+existing one. Check the documentation for
+`MACHINE_FEATURES_BACKFILL_CONSIDERED` for how this is disabled for the
+relevant machines.
+
+Even when `MACHINE_FEATURES` is tweaked to include the needed `rtc` string,
+make sure that your kernel configuration is supporting the attached device and
+the device tree is properly tweaked. Also, mind the runtime components that
+take advantage of your RTC device. You can do that by checking what is
+included/configured in the build system based on the inclusion of `rtc` in
+`MACHINE_FEATURES`.
+
+## Raspberry Pi Distro VLC
+
+To enable Raspberry Pi Distro VLC, the `meta-openembedded/meta-multimedia` layer must be
+included in your `bblayers.conf`.
+
+VLC does not support HW accelerated video decode through MMAL on a 64-bit OS.
+
+See:
+* <https://forums.raspberrypi.com/viewtopic.php?t=275370>
+* <https://forums.raspberrypi.com/viewtopic.php?t=325218#p1946169>
+
+MMAL is not enabled by default. To enable it add
+
+ DISABLE_VC4GRAPHICS = "1"
+
+to `local.conf`. Adding `vlc` to `IMAGE_INSTALL` will then default to building the Raspberry
+Pi's Distro implementation of VLC with HW accelerated video decode through MMAL into the system
+image. It also defaults to building VLC with Raspberry Pi's Distro implementation of ffmpeg. The
+oe-core implementation of ffmpeg and the meta-openembedded/meta-multimedia implementation of VLC
+can however be selected via:
+
+ PREFERRED_PROVIDER_ffmpeg = "ffmpeg"
+ PREFERRED_PROVIDER_vlc = "vlc"
+
+Usage example: Start VLC with mmal_vout plugin and without an active display server.
+
+ DISPLAYNUM=$(tvservice -l | tail -c 2)
+ MMAL_DISPLAY=$(expr $DISPLAYNUM + 1)
+ VLC_SETTINGS="-I dummy --vout=mmal_vout --mmal-resize --mmal-display hdmi-$MMAL_DISPLAY --no-dbus"
+ cvlc $VLC_SETTINGS <video/playlist>
diff --git a/docs/index.rst b/docs/index.rst
index 0d7ee07..3f8a088 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,6 +15,7 @@ Contents:
layer-contents
extra-build-config
extra-apps
+ ipcompliance
contributing
Indices and tables
diff --git a/docs/ipcompliance.md b/docs/ipcompliance.md
new file mode 100644
index 0000000..01540a8
--- /dev/null
+++ b/docs/ipcompliance.md
@@ -0,0 +1,23 @@
+# IP Compliance
+
+## linux-firmware-rpidistro
+
+By default, some of the machine configurations recommend packages for the
+WiFi/BT firmware, provided by
+[linux-firmware-rpidistro](https://github.com/RPi-Distro/firmware-nonfree).
+This package includes some firmware blobs under the `Synaptics` license which
+could carry a legal risk: one of the clauses can be (at least theoretically)
+used as a `killswitch`. This was
+[reported](https://github.com/RPi-Distro/firmware-nonfree/issues/29) in the
+upstream repository.
+
+You can find the full license text body in the content of the above mentioned
+package.
+
+Due to the above, the build system will only allow this recipe to be built if
+the user acknowledges this risk by adding the following configuration:
+
+ LICENSE_FLAGS_ACCEPTED = "synaptics-killswitch"
+
+You can provide this configuration as part of your `local.conf`, `distro.conf`,
+etc.
diff --git a/docs/layer-contents.md b/docs/layer-contents.md
index 5483708..3882339 100644
--- a/docs/layer-contents.md
+++ b/docs/layer-contents.md
@@ -5,6 +5,7 @@
* raspberrypi
* raspberrypi0
* raspberrypi0-wifi
+* raspberrypi0-2w-64
* raspberrypi2
* raspberrypi3
* raspberrypi3-64 (64 bit kernel & userspace)
@@ -15,6 +16,28 @@
Note: The raspberrypi3 machines include support for Raspberry Pi 3B+.
+## Multi-board Machines
+
+This layer generally provides support for machines that are targeting a single
+Raspberry Pi board (or a small subset of them). This is so that the build
+infrastructure can tune and tweak the configuration with the flexibility to
+optimise for both runtime performance and disk storage.
+
+For use cases where compatibility with more boards is required, the layer provides
+machines that are targeting a wider range of Raspberry Pi boards.
+
+### raspberrypi-armv7
+
+This machine targets support for all the ARMv7-based Raspberry Pi boards. It
+will pull in the firmware and deploy the kernel image and kernel modules for
+all the relevant boards.
+
+### raspberrypi-armv8
+
+This machine targets support for all the ARMv8-based Raspberry Pi boards. It
+will pull in the firmware and deploy the kernel image and kernel modules for
+all the relevant boards.
+
## Images
* rpi-test-image
@@ -22,8 +45,7 @@ Note: The raspberrypi3 machines include support for Raspberry Pi 3B+.
layer and some media samples.
For other uses it's recommended to base images on `core-image-minimal` or
-`core-image-base` as appropriate. The old image names (`rpi-hwup-image` and
-`rpi-basic-image`) are deprecated.
+`core-image-base` as appropriate.
## WiFi and Bluetooth Firmware
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..51eebd0
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+myst_parser
+sphinx_rtd_theme
diff --git a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch
deleted file mode 100644
index c0fdd18..0000000
--- a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc/lirc-gpio-ir-0.10.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff -ruN lirc-0.10.1.orig/lib/config_file.c lirc-0.10.1/lib/config_file.c
---- lirc-0.10.1.orig/lib/config_file.c 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/config_file.c 2019-06-26 00:39:45.734320696 +0900
-@@ -71,7 +71,7 @@
- typedef void* (*array_guest_func)(void* item, void* arg);
-
-
--#define LINE_LEN 1024
-+#define LINE_LEN 4096
- #define MAX_INCLUDES 10
-
- const char* whitespace = " \t";
-diff -ruN lirc-0.10.1.orig/lib/ir_remote.h lirc-0.10.1/lib/ir_remote.h
---- lirc-0.10.1.orig/lib/ir_remote.h 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/ir_remote.h 2019-06-26 00:39:45.714321224 +0900
-@@ -110,12 +110,17 @@
-
- static inline int is_pulse(lirc_t data)
- {
-- return data & PULSE_BIT ? 1 : 0;
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_PULSE) ? 1 : 0;
- }
-
- static inline int is_space(lirc_t data)
- {
-- return !is_pulse(data);
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_SPACE) ? 1 : 0;
-+}
-+
-+static inline int is_timeout(lirc_t data)
-+{
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_TIMEOUT) ? 1 : 0;
- }
-
- static inline int has_repeat(const struct ir_remote* remote)
-diff -ruN lirc-0.10.1.orig/lib/irrecord.c lirc-0.10.1/lib/irrecord.c
---- lirc-0.10.1.orig/lib/irrecord.c 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/lib/irrecord.c 2019-06-26 00:39:45.724320960 +0900
-@@ -1398,9 +1398,16 @@
- state->retval = 0;
- return STS_LEN_TIMEOUT;
- }
-+ if (is_timeout(state->data)) {
-+ return STS_LEN_AGAIN;
-+ }
- state->count++;
- if (state->mode == MODE_GET_GAP) {
-- state->sum += state->data & PULSE_MASK;
-+ if (state->sum != 0 || is_pulse(state->data)) {
-+ state->sum += state->data & PULSE_MASK;
-+ }else{
-+ return STS_LEN_AGAIN;
-+ }
- if (state->average == 0 && is_space(state->data)) {
- if (state->data > 100000) {
- state->sum = 0;
-@@ -1472,6 +1479,10 @@
- state->keypresses = lastmaxcount;
- return STS_LEN_AGAIN;
- } else if (state->mode == MODE_HAVE_GAP) {
-+ if (state->count==1 && is_space(state->data)) {
-+ state->count = 0;
-+ return STS_LEN_AGAIN;
-+ }
- if (state->count <= MAX_SIGNALS) {
- signals[state->count - 1] = state->data & PULSE_MASK;
- } else {
-@@ -1510,7 +1521,7 @@
- /* such long pulses may appear with
- * crappy hardware (receiver? / remote?)
- */
-- else {
-+ else if(is_pulse(state->data)) {
- remote->gap = 0;
- return STS_LEN_NO_GAP_FOUND;
- }
-@@ -1811,22 +1822,24 @@
-
- static int raw_data_ok(struct button_state* btn_state)
- {
-- int r;
-+ int r = 0;
- int ref;
-
-- if (!is_space(btn_state->data)) {
-+ if (is_pulse(btn_state->data)) {
- r = 0;
-- } else if (is_const(&remote)) {
-- if (remote.gap > btn_state->sum) {
-- ref = (remote.gap - btn_state->sum);
-- ref *= (100 - remote.eps);
-- ref /= 100;
-+ } else if (is_space(btn_state->data)) {
-+ if (is_const(&remote)) {
-+ if (remote.gap > btn_state->sum) {
-+ ref = (remote.gap - btn_state->sum);
-+ ref *= (100 - remote.eps);
-+ ref /= 100;
-+ } else {
-+ ref = 0;
-+ }
-+ r = btn_state->data > ref;
- } else {
-- ref = 0;
-+ r = btn_state->data > (remote.gap * (100 - remote.eps)) / 100;
- }
-- r = btn_state->data > ref;
-- } else {
-- r = btn_state->data > (remote.gap * (100 - remote.eps)) / 100;
- }
- return r;
- }
-@@ -1970,7 +1983,7 @@
- btn_state->data = remote.gap;
- }
- if (btn_state->count == 0) {
-- if (!is_space(btn_state->data)
-+ if (is_pulse(btn_state->data)
- || btn_state->data <
- remote.gap - remote.gap * remote.eps /
- 100) {
-diff -ruN lirc-0.10.1.orig/lib/lirc/ir_remote.h lirc-0.10.1/lib/lirc/ir_remote.h
---- lirc-0.10.1.orig/lib/lirc/ir_remote.h 2017-09-10 17:52:58.000000000 +0900
-+++ lirc-0.10.1/lib/lirc/ir_remote.h 2019-06-26 00:39:45.724320960 +0900
-@@ -110,12 +110,17 @@
-
- static inline int is_pulse(lirc_t data)
- {
-- return data & PULSE_BIT ? 1 : 0;
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_PULSE) ? 1 : 0;
- }
-
- static inline int is_space(lirc_t data)
- {
-- return !is_pulse(data);
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_SPACE) ? 1 : 0;
-+}
-+
-+static inline int is_timeout(lirc_t data)
-+{
-+ return ((data & LIRC_MODE2_MASK)==LIRC_MODE2_TIMEOUT) ? 1 : 0;
- }
-
- static inline int has_repeat(const struct ir_remote* remote)
-diff -ruN lirc-0.10.1.orig/tools/mode2.cpp lirc-0.10.1/tools/mode2.cpp
---- lirc-0.10.1.orig/tools/mode2.cpp 2017-09-10 17:52:19.000000000 +0900
-+++ lirc-0.10.1/tools/mode2.cpp 2019-06-26 00:45:38.840404976 +0900
-@@ -326,12 +326,24 @@
- void print_mode2_data(unsigned int data)
- {
- static int bitno = 1;
-+ static bool leading_space = true;
-+ unsigned int msg = data & LIRC_MODE2_MASK;
-
- switch (opt_dmode) {
- case 0:
-- printf("%s %u\n", (
-- data & PULSE_BIT) ? "pulse" : "space",
-- (uint32_t)(data & PULSE_MASK));
-+ if (leading_space && msg == LIRC_MODE2_SPACE ) {
-+ break;
-+ } else {
-+ leading_space = false;
-+ }
-+ if (msg == LIRC_MODE2_PULSE) {
-+ printf("pulse %u\n", (__u32)(data & PULSE_MASK));
-+ } else if (msg == LIRC_MODE2_SPACE) {
-+ printf("space %u\n", (__u32)(data & PULSE_MASK));
-+ } else if (msg == LIRC_MODE2_TIMEOUT) {
-+ printf("timeout %u\n", (__u32)(data & PULSE_MASK));
-+ leading_space = true;
-+ }
- break;
- case 1: {
- /* print output like irrecord raw config file data */
diff --git a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend
new file mode 100644
index 0000000..0ccd4f7
--- /dev/null
+++ b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.%.bbappend
@@ -0,0 +1,5 @@
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
+
+SRC_URI:append:rpi = " \
+ file://lircd.service \
+"
diff --git a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend b/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend
deleted file mode 100644
index 5d3ab4d..0000000
--- a/dynamic-layers/meta-python/recipes-connectivity/lirc/lirc_0.10.1.bbappend
+++ /dev/null
@@ -1,6 +0,0 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
-
-SRC_URI_append_rpi = " \
- file://lirc-gpio-ir-0.10.patch \
- file://lircd.service \
-"
diff --git a/dynamic-layers/meta-python/recipes-core/packagegroups/packagegroup-rpi-test.bbappend b/dynamic-layers/meta-python/recipes-core/packagegroups/packagegroup-rpi-test.bbappend
index 63fedbb..7776531 100644
--- a/dynamic-layers/meta-python/recipes-core/packagegroups/packagegroup-rpi-test.bbappend
+++ b/dynamic-layers/meta-python/recipes-core/packagegroups/packagegroup-rpi-test.bbappend
@@ -1,4 +1,4 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
-RDEPENDS_${PN} += "python3-sense-hat"
+RDEPENDS:${PN} += "python3-sense-hat"
diff --git a/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb b/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
index 6195d28..cf745fc 100644
--- a/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
+++ b/dynamic-layers/meta-python/recipes-devtools/python/python3-sense-hat_2.2.0.bb
@@ -1,7 +1,7 @@
SUMMARY = "Python module to control the Raspberry Pi Sense HAT used in the Astro Pi mission"
HOMEPAGE = "https://github.com/RPi-Distro/python-sense-hat"
SECTION = "devel/python"
-LICENSE = "BSD"
+LICENSE = "BSD-3-Clause"
LIC_FILES_CHKSUM = "file://LICENCE.txt;md5=d80fe312e1ff5fbd97369b093bf21cda"
inherit setuptools3 pypi
@@ -17,8 +17,8 @@ DEPENDS += " \
freetype \
"
-RDEPENDS_${PN} += " \
- ${PYTHON_PN}-numpy \
- ${PYTHON_PN}-rtimu \
- ${PYTHON_PN}-pillow \
+RDEPENDS:${PN} += " \
+ python3-numpy \
+ python3-rtimu \
+ python3-pillow \
"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch
new file mode 100644
index 0000000..15f6bf4
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0001-utils-version.py-use-usr-bin-env-in-shebang.patch
@@ -0,0 +1,42 @@
+From bbc1ea3e4119c665723cfd1c5a364bc8c7cbb464 Mon Sep 17 00:00:00 2001
+From: Martin Jansa <Martin.Jansa@gmail.com>
+Date: Thu, 4 May 2023 18:07:16 +0000
+Subject: [PATCH] utils/version.py: use /usr/bin/env in shebang
+
+* it uses subprocess text=True which is available only since python-3.7
+ when running on host with python-3.6 it fails with:
+Traceback (most recent call last):
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 19, in generate_version
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, text=True)
+ File "/usr/lib/python3.6/subprocess.py", line 423, in run
+ with Popen(*popenargs, **kwargs) as process:
+TypeError: __init__() got an unexpected keyword argument 'text'
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 52, in <module>
+ generate_version()
+ File "TOPDIR/BUILD/work/raspberrypi4_64-oe-linux/rpi-libcamera-apps/git-r0/git/utils/version.py", line 48, in generate_version
+ print(f'{commit} {datetime.now().strftime("%d-%m-%Y (%H:%M:%S)")}', end="")
+UnboundLocalError: local variable 'commit' referenced before assignment
+Generating version string:
+
+ even when newer python3 is in PATH (either from buildtools or from python3native)
+
+Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
+Upstream-Status: Pending
+---
+ utils/version.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/utils/version.py b/utils/version.py
+index 48d7e05..4a5e35c 100755
+--- a/utils/version.py
++++ b/utils/version.py
+@@ -1,4 +1,4 @@
+-#!/usr/bin/python3
++#!/usr/bin/env python3
+
+ # Copyright (C) 2021, Raspberry Pi (Trading) Limited
+ # Generate version information for rpicam-apps
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch
new file mode 100644
index 0000000..c965b2c
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps/0002-Revert-Support-compressed-pixel-formats-when-saving-.patch
@@ -0,0 +1,271 @@
+From 500f1e9eaeca29b255d0364e1383d70ade1d1177 Mon Sep 17 00:00:00 2001
+From: Martin Jansa <martin.jansa@gmail.com>
+Date: Tue, 30 Jan 2024 12:02:09 +0000
+Subject: [PATCH] Revert "Support compressed pixel formats when saving DNGs"
+
+This reverts commit a85aed7603a0b69a6685d3f81ee860246d5b1621.
+
+This requires rpi specific fork of libcamera to provide e.g.
+formats::RGGB16_PISP_COMP1
+added in:
+https://github.com/raspberrypi/libcamera/commit/fb3cb844f2117f30d3eeece99d6ce4d02624e492
+but not included in libcamera from meta-oe:
+https://git.openembedded.org/meta-openembedded/commit/?id=711c6fbce39df685225bca081c5f42bae2de658b
+
+See https://github.com/raspberrypi/rpicam-apps/issues/627
+
+Upstream-Status: Pending
+---
+ image/dng.cpp | 205 ++++++++------------------------------------------
+ 1 file changed, 33 insertions(+), 172 deletions(-)
+
+diff --git a/image/dng.cpp b/image/dng.cpp
+index 7692f92..fc10439 100644
+--- a/image/dng.cpp
++++ b/image/dng.cpp
+@@ -33,47 +33,40 @@ struct BayerFormat
+ int bits;
+ char const *order;
+ bool packed;
+- bool compressed;
+ };
+
+ static const std::map<PixelFormat, BayerFormat> bayer_formats =
+ {
+- { formats::SRGGB10_CSI2P, { "RGGB-10", 10, TIFF_RGGB, true, false } },
+- { formats::SGRBG10_CSI2P, { "GRBG-10", 10, TIFF_GRBG, true, false } },
+- { formats::SBGGR10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true, false } },
+- { formats::SGBRG10_CSI2P, { "GBRG-10", 10, TIFF_GBRG, true, false } },
+-
+- { formats::SRGGB10, { "RGGB-10", 10, TIFF_RGGB, false, false } },
+- { formats::SGRBG10, { "GRBG-10", 10, TIFF_GRBG, false, false } },
+- { formats::SBGGR10, { "BGGR-10", 10, TIFF_BGGR, false, false } },
+- { formats::SGBRG10, { "GBRG-10", 10, TIFF_GBRG, false, false } },
+-
+- { formats::SRGGB12_CSI2P, { "RGGB-12", 12, TIFF_RGGB, true, false } },
+- { formats::SGRBG12_CSI2P, { "GRBG-12", 12, TIFF_GRBG, true, false } },
+- { formats::SBGGR12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true, false } },
+- { formats::SGBRG12_CSI2P, { "GBRG-12", 12, TIFF_GBRG, true, false } },
+-
+- { formats::SRGGB12, { "RGGB-12", 12, TIFF_RGGB, false, false } },
+- { formats::SGRBG12, { "GRBG-12", 12, TIFF_GRBG, false, false } },
+- { formats::SBGGR12, { "BGGR-12", 12, TIFF_BGGR, false, false } },
+- { formats::SGBRG12, { "GBRG-12", 12, TIFF_GBRG, false, false } },
+-
+- { formats::SRGGB16, { "RGGB-16", 16, TIFF_RGGB, false, false } },
+- { formats::SGRBG16, { "GRBG-16", 16, TIFF_GRBG, false, false } },
+- { formats::SBGGR16, { "BGGR-16", 16, TIFF_BGGR, false, false } },
+- { formats::SGBRG16, { "GBRG-16", 16, TIFF_GBRG, false, false } },
+-
+- { formats::R10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true, false } },
+- { formats::R10, { "BGGR-10", 10, TIFF_BGGR, false, false } },
++ { formats::SRGGB10_CSI2P, { "RGGB-10", 10, TIFF_RGGB, true } },
++ { formats::SGRBG10_CSI2P, { "GRBG-10", 10, TIFF_GRBG, true } },
++ { formats::SBGGR10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true } },
++ { formats::SGBRG10_CSI2P, { "GBRG-10", 10, TIFF_GBRG, true } },
++
++ { formats::SRGGB10, { "RGGB-10", 10, TIFF_RGGB, false } },
++ { formats::SGRBG10, { "GRBG-10", 10, TIFF_GRBG, false } },
++ { formats::SBGGR10, { "BGGR-10", 10, TIFF_BGGR, false } },
++ { formats::SGBRG10, { "GBRG-10", 10, TIFF_GBRG, false } },
++
++ { formats::SRGGB12_CSI2P, { "RGGB-12", 12, TIFF_RGGB, true } },
++ { formats::SGRBG12_CSI2P, { "GRBG-12", 12, TIFF_GRBG, true } },
++ { formats::SBGGR12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true } },
++ { formats::SGBRG12_CSI2P, { "GBRG-12", 12, TIFF_GBRG, true } },
++
++ { formats::SRGGB12, { "RGGB-12", 12, TIFF_RGGB, false } },
++ { formats::SGRBG12, { "GRBG-12", 12, TIFF_GRBG, false } },
++ { formats::SBGGR12, { "BGGR-12", 12, TIFF_BGGR, false } },
++ { formats::SGBRG12, { "GBRG-12", 12, TIFF_GBRG, false } },
++
++ { formats::SRGGB16, { "RGGB-16", 16, TIFF_RGGB, false } },
++ { formats::SGRBG16, { "GRBG-16", 16, TIFF_GRBG, false } },
++ { formats::SBGGR16, { "BGGR-16", 16, TIFF_BGGR, false } },
++ { formats::SGBRG16, { "GBRG-16", 16, TIFF_GBRG, false } },
++
++ { formats::R10_CSI2P, { "BGGR-10", 10, TIFF_BGGR, true } },
++ { formats::R10, { "BGGR-10", 10, TIFF_BGGR, false } },
+ // Currently not in the main libcamera branch
+ //{ formats::R12_CSI2P, { "BGGR-12", 12, TIFF_BGGR, true } },
+- { formats::R12, { "BGGR-12", 12, TIFF_BGGR, false, false } },
+-
+- /* PiSP compressed formats. */
+- { formats::RGGB16_PISP_COMP1, { "RGGB-16-PISP", 16, TIFF_RGGB, false, true } },
+- { formats::GRBG16_PISP_COMP1, { "GRBG-16-PISP", 16, TIFF_GRBG, false, true } },
+- { formats::GBRG16_PISP_COMP1, { "GBRG-16-PISP", 16, TIFF_GBRG, false, true } },
+- { formats::BGGR16_PISP_COMP1, { "BGGR-16-PISP", 16, TIFF_BGGR, false, true } },
++ { formats::R12, { "BGGR-12", 12, TIFF_BGGR, false } },
+ };
+
+ static void unpack_10bit(uint8_t const *src, StreamInfo const &info, uint16_t *dest)
+@@ -124,129 +117,6 @@ static void unpack_16bit(uint8_t const *src, StreamInfo const &info, uint16_t *d
+ }
+ }
+
+-// We always use these compression parameters.
+-#define COMPRESS_OFFSET 2048
+-#define COMPRESS_MODE 1
+-
+-static uint16_t postprocess(uint16_t a)
+-{
+- if (COMPRESS_MODE & 2)
+- {
+- if (COMPRESS_MODE == 3 && a < 0x4000)
+- a = a >> 2;
+- else if (a < 0x1000)
+- a = a >> 4;
+- else if (a < 0x1800)
+- a = (a - 0x800) >> 3;
+- else if (a < 0x3000)
+- a = (a - 0x1000) >> 2;
+- else if (a < 0x6000)
+- a = (a - 0x2000) >> 1;
+- else if (a < 0xC000)
+- a = (a - 0x4000);
+- else
+- a = 2 * (a - 0x8000);
+- }
+-
+- return std::min(0xFFFF, a + COMPRESS_OFFSET);
+-}
+-
+-static uint16_t dequantize(uint16_t q, int qmode)
+-{
+- switch (qmode)
+- {
+- case 0:
+- return (q < 320) ? 16 * q : 32 * (q - 160);
+-
+- case 1:
+- return 64 * q;
+-
+- case 2:
+- return 128 * q;
+-
+- default:
+- return (q < 94) ? 256 * q : std::min(0xFFFF, 512 * (q - 47));
+- }
+-}
+-
+-static void subBlockFunction(uint16_t *d, uint32_t w)
+-{
+- int q[4];
+-
+- int qmode = (w & 3);
+- if (qmode < 3)
+- {
+- int field0 = (w >> 2) & 511;
+- int field1 = (w >> 11) & 127;
+- int field2 = (w >> 18) & 127;
+- int field3 = (w >> 25) & 127;
+- if (qmode == 2 && field0 >= 384)
+- {
+- q[1] = field0;
+- q[2] = field1 + 384;
+- }
+- else
+- {
+- q[1] = (field1 >= 64) ? field0 : field0 + 64 - field1;
+- q[2] = (field1 >= 64) ? field0 + field1 - 64 : field0;
+- }
+- int p1 = std::max(0, q[1] - 64);
+- if (qmode == 2)
+- p1 = std::min(384, p1);
+- int p2 = std::max(0, q[2] - 64);
+- if (qmode == 2)
+- p2 = std::min(384, p2);
+- q[0] = p1 + field2;
+- q[3] = p2 + field3;
+- }
+- else
+- {
+- int pack0 = (w >> 2) & 32767;
+- int pack1 = (w >> 17) & 32767;
+- q[0] = (pack0 & 15) + 16 * ((pack0 >> 8) / 11);
+- q[1] = (pack0 >> 4) % 176;
+- q[2] = (pack1 & 15) + 16 * ((pack1 >> 8) / 11);
+- q[3] = (pack1 >> 4) % 176;
+- }
+-
+- d[0] = dequantize(q[0], qmode);
+- d[2] = dequantize(q[1], qmode);
+- d[4] = dequantize(q[2], qmode);
+- d[6] = dequantize(q[3], qmode);
+-}
+-
+-static void uncompress(uint8_t const *src, StreamInfo const &info, uint16_t *dest)
+-{
+- // In all cases, the *decompressed* image must be a multiple of 8 columns wide.
+- unsigned int buf_stride_pixels = (info.width + 7) & ~7;
+- for (unsigned int y = 0; y < info.height; ++y)
+- {
+- uint16_t *dp = dest + y * buf_stride_pixels;
+- uint8_t const *sp = src + y * info.stride;
+-
+- for (unsigned int x = 0; x < info.width; x+=8)
+- {
+- if (COMPRESS_MODE & 1)
+- {
+- uint32_t w0 = 0, w1 = 0;
+- for (int b = 0; b < 4; ++b)
+- w0 |= (*sp++) << (b * 8);
+- for (int b = 0; b < 4; ++b)
+- w1 |= (*sp++) << (b * 8);
+- subBlockFunction(dp, w0);
+- subBlockFunction(dp + 1, w1);
+- for (int i = 0; i < 8; ++i, ++dp)
+- *dp = postprocess(*dp);
+- }
+- else
+- {
+- for (int i = 0; i < 8; ++i)
+- *dp++ = postprocess((*sp++) << 8);
+- }
+- }
+- }
+-}
+-
+ struct Matrix
+ {
+ Matrix(float m0, float m1, float m2,
+@@ -307,16 +177,8 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+ BayerFormat const &bayer_format = it->second;
+ LOG(1, "Bayer format is " << bayer_format.name);
+
+- // Decompression will require a buffer that's 8 pixels aligned.
+- unsigned int buf_stride_pixels = info.width;
+- unsigned int buf_stride_pixels_padded = (buf_stride_pixels + 7) & ~7;
+- std::vector<uint16_t> buf(buf_stride_pixels_padded * info.height);
+- if (bayer_format.compressed)
+- {
+- uncompress(mem[0].data(), info, &buf[0]);
+- buf_stride_pixels = buf_stride_pixels_padded;
+- }
+- else if (bayer_format.packed)
++ std::vector<uint16_t> buf(info.width * info.height);
++ if (bayer_format.packed)
+ {
+ switch (bayer_format.bits)
+ {
+@@ -444,9 +306,8 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+ {
+ for (unsigned int x = 0; x < (info.width >> 4); x++)
+ {
+- unsigned int off = (y * buf_stride_pixels + x) << 4;
+- uint32_t grey =
+- buf[off] + buf[off + 1] + buf[off + buf_stride_pixels] + buf[off + buf_stride_pixels + 1];
++ unsigned int off = (y * info.width + x) << 4;
++ uint32_t grey = buf[off] + buf[off + 1] + buf[off + info.width] + buf[off + info.width + 1];
+ grey = (grey << 14) >> bayer_format.bits;
+ grey = sqrt((double)grey); // simple "gamma correction"
+ thumb_buf[3 * x] = thumb_buf[3 * x + 1] = thumb_buf[3 * x + 2] = grey;
+@@ -478,7 +339,7 @@ void dng_save(std::vector<libcamera::Span<uint8_t>> const &mem, StreamInfo const
+
+ for (unsigned int y = 0; y < info.height; y++)
+ {
+- if (TIFFWriteScanline(tif, &buf[buf_stride_pixels * y], y, 0) != 1)
++ if (TIFFWriteScanline(tif, &buf[info.width * y], y, 0) != 1)
+ throw std::runtime_error("error writing DNG image data");
+ }
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb
new file mode 100644
index 0000000..dc07145
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera-apps/libcamera-apps_git.bb
@@ -0,0 +1,44 @@
+SUMMARY = "A suite of libcamera-based apps"
+DESCRIPTION = "This is a small suite of libcamera-based apps that aim to \
+copy the functionality of the existing \"raspicam\" apps."
+HOMEPAGE = "https://github.com/raspberrypi/libcamera-apps"
+SECTION = "console/utils"
+
+LICENSE = "BSD-2-Clause"
+LIC_FILES_CHKSUM = "file://license.txt;md5=a0013d1b383d72ba4bdc5b750e7d1d77"
+
+SRC_URI = "\
+ git://github.com/raspberrypi/libcamera-apps.git;protocol=https;branch=main \
+ file://0001-utils-version.py-use-usr-bin-env-in-shebang.patch \
+ file://0002-Revert-Support-compressed-pixel-formats-when-saving-.patch \
+"
+PV = "1.4.2+git${SRCPV}"
+SRCREV = "9ae39f85ae6bee9761c36b9b5b80d675bc1fa369"
+
+S = "${WORKDIR}/git"
+
+DEPENDS = "libcamera libexif jpeg tiff libpng boost"
+
+PACKAGECONFIG ??= "drm"
+PACKAGECONFIG[libav] = "-Denable_libav=true, -Denable_libav=false, libav"
+PACKAGECONFIG[drm] = "-Denable_drm=true, -Denable_drm=false, libdrm"
+PACKAGECONFIG[egl] = "-Denable_egl=true, -Denable_egl=false, virtual/egl"
+PACKAGECONFIG[qt] = "-Denable_qt=true, -Denable_qt=false, qtbase"
+PACKAGECONFIG[opencv] = "-Denable_opencv=true, -Denable_opencv=false, opencv"
+PACKAGECONFIG[tflite] = "-Denable_tflite=true, -Denable_tflite=false, tensorflow-lite"
+
+inherit meson pkgconfig
+
+NEON_FLAGS = ""
+NEON_FLAGS:aarch64 = "-Dneon_flags=arm64"
+NEON_FLAGS:arm:raspberrypi3 = "-Dneon_flags=armv8-neon"
+NEON_FLAGS:arm:raspberrypi4 = "-Dneon_flags=armv8-neon"
+EXTRA_OEMESON += "${NEON_FLAGS}"
+
+# QA Issue: /usr/bin/camera-bug-report contained in package libcamera-apps requires /usr/bin/python3
+do_install:append() {
+ rm -v ${D}/${bindir}/camera-bug-report
+}
+
+# not picked automatically, because it's missing common 'lib' prefix
+FILES:${PN}-dev += "${libdir}/rpicam_app.so"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend
deleted file mode 100644
index 8076ac7..0000000
--- a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera.bbappend
+++ /dev/null
@@ -1,2 +0,0 @@
-PACKAGECONFIG[raspberrypi] = "-Dpipelines=raspberrypi"
-PACKAGECONFIG_append_rpi = " raspberrypi"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend
new file mode 100644
index 0000000..541c49c
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/libcamera/libcamera_%.bbappend
@@ -0,0 +1,2 @@
+PACKAGECONFIG[raspberrypi] = "-Dpipelines=rpi/vc4 -Dipas=rpi/vc4 -Dcpp_args=-Wno-unaligned-access"
+PACKAGECONFIG:append:rpi = " raspberrypi"
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch
new file mode 100644
index 0000000..3be8f1e
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0001-configure-fix-linking-on-RISC-V-ISA.patch
@@ -0,0 +1,25 @@
+From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net>
+Date: Sat, 16 Jun 2018 21:31:45 +0300
+Subject: configure: fix linking on RISC-V ISA
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+---
+ configure.ac | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/configure.ac b/configure.ac
+index 2037a9e..df26367 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -113,6 +113,7 @@ case "${host_os}" in
+ ;;
+ linux*)
+ SYS=linux
++ test "${host_cpu}" = "riscv64" && CFLAGS="${CFLAGS} -pthread"
+ ;;
+ bsdi*)
+ SYS=bsdi
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch
new file mode 100644
index 0000000..61807b3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0002-Revert-configure-Require-libmodplug-0.8.9.patch
@@ -0,0 +1,27 @@
+From: Sebastian Ramacher <sramacher@debian.org>
+Date: Mon, 19 Aug 2019 21:08:26 +0200
+Subject: Revert "configure: Require libmodplug >= 0.8.9"
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+This reverts commit 48f014768dc22ecad23d0e9f53c38805a3aff832.
+---
+ configure.ac | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/configure.ac b/configure.ac
+index df26367..b8580ec 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -2207,7 +2207,7 @@ AC_ARG_ENABLE(mod,
+ [AS_HELP_STRING([--disable-mod],
+ [do not use libmodplug (default auto)])])
+ if test "${enable_mod}" != "no" ; then
+- PKG_CHECK_MODULES(LIBMODPLUG, [libmodplug >= 0.8.9.0], [
++ PKG_CHECK_MODULES(LIBMODPLUG, [libmodplug >= 0.8.4 libmodplug != 0.8.8], [
+ VLC_ADD_PLUGIN([mod])
+ VLC_ADD_CXXFLAGS([mod],[$LIBMODPLUG_CFLAGS])
+ VLC_ADD_CFLAGS([mod],[$LIBMODPLUG_CFLAGS]) #modules/demux/mod.c needs CFLAGS_mod, not CXXFLAGS_mod
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch
new file mode 100644
index 0000000..41f7109
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0003-CVE-2022-41325.patch
@@ -0,0 +1,83 @@
+From 4fcace61801f418786c42487c6b06b693ee87666 Mon Sep 17 00:00:00 2001
+From: Romain Vimont <rom1v@videolabs.io>
+Date: Mon, 19 Sep 2022 17:17:01 +0200
+Subject: [PATCH] vnc: fix possible buffer overflow
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+Thanks to 0xMitsurugi [1] from Synacktiv [2] for the bug report and fix.
+
+[1] https://twitter.com/0xMitsurugi
+[2] https://www.synacktiv.com/
+
+Fixes #27335
+
+(cherry picked from commit 5eb783fd44ed6298db3e38f7765f21c42e4405f9)
+---
+ modules/access/vnc.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+--- a/modules/access/vnc.c
++++ b/modules/access/vnc.c
+@@ -33,6 +33,7 @@
+ #ifdef HAVE_CONFIG_H
+ # include "config.h"
+ #endif
++#include <assert.h>
+
+ #include <vlc_common.h>
+ #include <vlc_plugin.h>
+@@ -115,7 +116,7 @@
+ int i_cancel_state;
+
+ rfbClient* p_client;
+- int i_framebuffersize;
++ size_t i_framebuffersize;
+ block_t *p_block;
+
+ float f_fps;
+@@ -143,11 +144,16 @@
+ p_sys->es = NULL;
+ }
+
+- int i_width = p_client->width;
+- int i_height = p_client->height;
+- int i_depth = p_client->format.bitsPerPixel;
++ assert(!(p_client->width & ~0xffff)); // fits in 16 bits
++ uint16_t i_width = p_client->width;
+
+- switch( i_depth )
++ assert(!(p_client->height & ~0xffff)); // fits in 16 bits
++ uint16_t i_height = p_client->height;
++
++ uint8_t i_bits_per_pixel = p_client->format.bitsPerPixel;
++ assert((i_bits_per_pixel & 0x7) == 0); // multiple of 8
++
++ switch( i_bits_per_pixel )
+ {
+ case 8:
+ i_chroma = VLC_CODEC_RGB8;
+@@ -180,7 +186,10 @@
+ }
+
+ /* Set up framebuffer */
+- p_sys->i_framebuffersize = i_width * i_height * i_depth / 8;
++ if (mul_overflow(i_width, i_height * (i_bits_per_pixel / 8), &p_sys->i_framebuffersize)) {
++ msg_Err(p_demux, "VNC framebuffersize overflow");
++ return FALSE;
++ }
+
+ /* Reuse unsent block */
+ if ( p_sys->p_block )
+@@ -211,7 +220,7 @@
+ fmt.video.i_frame_rate_base = 1000;
+ fmt.video.i_frame_rate = 1000 * p_sys->f_fps;
+
+- fmt.video.i_bits_per_pixel = i_depth;
++ fmt.video.i_bits_per_pixel = i_bits_per_pixel;
+ fmt.video.i_rmask = p_client->format.redMax << p_client->format.redShift;
+ fmt.video.i_gmask = p_client->format.greenMax << p_client->format.greenShift;
+ fmt.video.i_bmask = p_client->format.blueMax << p_client->format.blueShift;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch
new file mode 100644
index 0000000..ab31730
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0004-mmal_20.patch
@@ -0,0 +1,13826 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -3478,6 +3478,9 @@ dnl
+ AC_ARG_ENABLE(mmal,
+ AS_HELP_STRING([--enable-mmal],
+ [Multi-Media Abstraction Layer (MMAL) hardware plugin (default enable)]))
++AC_ARG_ENABLE(mmal_avcodec,
++ AS_HELP_STRING([--enable-mmal-avcodec],
++ [Use MMAL enabled avcodec libs (default disable)]))
+ if test "${enable_mmal}" != "no"; then
+ VLC_SAVE_FLAGS
+ LDFLAGS="${LDFLAGS} -L/opt/vc/lib -lvchostif"
+@@ -3488,7 +3491,7 @@ if test "${enable_mmal}" != "no"; then
+ VLC_ADD_PLUGIN([mmal])
+ VLC_ADD_LDFLAGS([mmal],[ -L/opt/vc/lib ])
+ VLC_ADD_CFLAGS([mmal],[ -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux ])
+- VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif ]) ], [
++ VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif -lvchiq_arm -lvcsm ]) ], [
+ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find bcm library...]) ],
+ [ AC_MSG_WARN([Cannot find bcm library...]) ])
+@@ -3500,6 +3503,7 @@ if test "${enable_mmal}" != "no"; then
+ VLC_RESTORE_FLAGS
+ fi
+ AM_CONDITIONAL([HAVE_MMAL], [test "${have_mmal}" = "yes"])
++AM_CONDITIONAL([HAVE_MMAL_AVCODEC], [test "${enable_mmal_avcodec}" = "yes"])
+
+ dnl
+ dnl evas plugin
+--- a/include/vlc_fourcc.h
++++ b/include/vlc_fourcc.h
+@@ -365,6 +365,11 @@
+
+ /* Broadcom MMAL opaque buffer type */
+ #define VLC_CODEC_MMAL_OPAQUE VLC_FOURCC('M','M','A','L')
++#define VLC_CODEC_MMAL_ZC_SAND8 VLC_FOURCC('Z','S','D','8')
++#define VLC_CODEC_MMAL_ZC_SAND10 VLC_FOURCC('Z','S','D','0')
++#define VLC_CODEC_MMAL_ZC_SAND30 VLC_FOURCC('Z','S','D','3')
++#define VLC_CODEC_MMAL_ZC_I420 VLC_FOURCC('Z','4','2','0')
++#define VLC_CODEC_MMAL_ZC_RGB32 VLC_FOURCC('Z','R','G','B')
+
+ /* DXVA2 opaque video surface for use with D3D9 */
+ #define VLC_CODEC_D3D9_OPAQUE VLC_FOURCC('D','X','A','9') /* 4:2:0 8 bpc */
+--- a/modules/hw/mmal/Makefile.am
++++ b/modules/hw/mmal/Makefile.am
+@@ -1,23 +1,57 @@
+ include $(top_srcdir)/modules/common.am
+ mmaldir = $(pluginsdir)/mmal
+
+-AM_CFLAGS += $(CFLAGS_mmal)
+-AM_LDFLAGS += -rpath '$(mmaldir)' $(LDFLAGS_mmal)
++AM_CFLAGS += -pthread $(CFLAGS_mmal)
++AM_LDFLAGS += -pthread -rpath '$(mmaldir)' $(LDFLAGS_mmal)
+
+-libmmal_vout_plugin_la_SOURCES = vout.c mmal_picture.c mmal_picture.h
++libmmal_vout_plugin_la_SOURCES = vout.c mmal_cma.c mmal_picture.c subpic.c\
++ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
++ mmal_piccpy_neon.S
+ libmmal_vout_plugin_la_CFLAGS = $(AM_CFLAGS)
+-libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm
++libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm -lX11 -lXrandr
+ libmmal_vout_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES = libmmal_vout_plugin.la
+
+-libmmal_codec_plugin_la_SOURCES = codec.c
++libmmal_codec_plugin_la_SOURCES = codec.c mmal_cma.c mmal_picture.c subpic.c\
++ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
++ blend_rgba_neon.S mmal_piccpy_neon.S
+ libmmal_codec_plugin_la_CFLAGS = $(AM_CFLAGS)
+ libmmal_codec_plugin_la_LDFLAGS = $(AM_LDFLAGS)
+ libmmal_codec_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES += libmmal_codec_plugin.la
+
+-libmmal_deinterlace_plugin_la_SOURCES = deinterlace.c mmal_picture.c
++libmmal_deinterlace_plugin_la_SOURCES = deinterlace.c mmal_picture.c mmal_cma.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
+ libmmal_deinterlace_plugin_la_CFLAGS = $(AM_CFLAGS)
+ libmmal_deinterlace_plugin_la_LDFLAGS = $(AM_LDFLAGS)
+ libmmal_deinterlace_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES += libmmal_deinterlace_plugin.la
++
++libmmal_xsplitter_plugin_la_SOURCES = xsplitter.c mmal_picture.c mmal_cma.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_xsplitter_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_xsplitter_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_xsplitter_plugin_la_LIBADD = $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_xsplitter_plugin.la
++
++libmmal_converter_plugin_la_SOURCES = converter_mmal.c mmal_cma.c mmal_picture.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_converter_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_converter_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_converter_plugin_la_LIBADD = $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_converter_plugin.la
++
++if HAVE_MMAL_AVCODEC
++libmmal_avcodec_plugin_la_SOURCES = mmal_avcodec.c mmal_cma.c mmal_picture.c\
++ mmal_cma.h mmal_picture.h transform_ops.h\
++ mmal_piccpy_neon.S
++libmmal_avcodec_plugin_la_CFLAGS = $(AM_CFLAGS)
++libmmal_avcodec_plugin_la_LDFLAGS = $(AM_LDFLAGS)
++libmmal_avcodec_plugin_la_LIBADD = $(AVFORMAT_LIBS) $(AVUTIL_LIBS) $(LIBS_mmal)
++mmal_LTLIBRARIES += libmmal_avcodec_plugin.la
++endif
++
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_rgba_neon.S
+@@ -0,0 +1,197 @@
++ .syntax unified
++ .arm
++// .thumb
++ .text
++ .align 16
++ .arch armv7-a
++ .fpu neon-vfpv4
++
++@ blend_rgbx_rgba_neon
++
++@ Implements /255 as ((x * 257) + 0x8000) >> 16
++@ This generates something in the range [(x+126)/255, (x+127)/255] which is good enough
++
++@ There is an advantage to aligning src and/or dest - dest gives a bit more due to being used twice
++
++
++
++@ [r0] RGBx dest loaded into d20-d23
++@ [r1] RGBA src merge loaded into d16-d19
++@ r2 plane alpha
++@ r3 count (pixels)
++
++.macro blend_main sR, sG, sB, sA, dR, dG, dB, dA
++
++ push { r4, lr }
++
++ vdup.u8 d7, r2
++
++ subs r3, #8
++ vmov.u8 d6, #0xff
++
++ blt 2f
++
++ @ If < 16 bytes to move then don't bother trying to align
++ @ (a) This means the align doesn't need to worry about r3 underflow
++ @ (b) The overhead would be greater than any gain
++ cmp r3, #8
++ mov r4, r3
++ ble 1f
++
++ @ Align r0 (dest) on a 32 byte boundary
++ neg r3, r0
++ ubfx r3, r3, #2, #3
++
++ cmp r3, #0
++ blne 10f
++
++ sub r3, r4, r3
++
++1:
++ vld4.8 {d16, d17, d18, d19}, [r1]
++
++1:
++ vmull.u8 q15, \sA, d7
++
++ vld4.8 {d20, d21, d22, d23}, [r0]
++
++ vsra.u16 q15, q15, #8
++ subs r3, #8
++ vrshrn.u16 d31, q15, #8
++ vsub.u8 d30, d6, d31
++
++ vmull.u8 q12, \sR, d31
++ vmull.u8 q13, \sG, d31
++ vmull.u8 q14, \sB, d31
++ addge r1, #32
++
++ vmlal.u8 q12, \dR, d30
++ vmlal.u8 q13, \dG, d30
++ vmlal.u8 q14, \dB, d30
++ vld4.8 {d16, d17, d18, d19}, [r1]
++
++ vsra.u16 q12, q12, #8 @ * 257/256
++ vsra.u16 q13, q13, #8
++ vsra.u16 q14, q14, #8
++
++ vrshrn.u16 \dR, q12, #8
++ vrshrn.u16 \dG, q13, #8
++ vrshrn.u16 \dB, q14, #8
++ vmov.u8 \dA, #0xff
++
++ vst4.8 {d20, d21, d22, d23}, [r0]!
++ bge 1b
++ add r1, #32
++
++2:
++ cmp r3, #-8
++ blgt 10f
++
++ pop { r4, pc }
++
++
++// Partial version
++// Align @ start & deal with tail
++10:
++ lsls r2, r3, #30 @ b2 -> C, b1 -> N
++ mov r2, r0
++ bcc 1f
++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r1]!
++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r2]!
++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1]!
++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r2]!
++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r1]!
++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r2]!
++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r1]!
++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r2]!
++1:
++ bpl 1f
++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r1]!
++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r2]!
++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r1]!
++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r2]!
++1:
++ tst r3, #1
++ beq 1f
++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r1]!
++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r2]!
++1:
++ @ Set conditions for later
++ lsls r2, r3, #30 @ b2 -> C, b1 -> N
++
++ vmull.u8 q15, \sA, d7
++ vsra.u16 q15, q15, #8
++ vrshrn.u16 d31, q15, #8
++ vsub.u8 d30, d6, d31
++
++ vmull.u8 q12, \sR, d31
++ vmull.u8 q13, \sG, d31
++ vmull.u8 q14, \sB, d31
++
++ vmlal.u8 q12, \dR, d30
++ vmlal.u8 q13, \dG, d30
++ vmlal.u8 q14, \dB, d30
++
++ vsra.u16 q12, q12, #8
++ vsra.u16 q13, q13, #8
++ vsra.u16 q14, q14, #8
++
++ vrshrn.u16 \dR, q12, #8
++ vrshrn.u16 \dG, q13, #8
++ vrshrn.u16 \dB, q14, #8
++ vmov.u8 \dA, #0xff
++
++ bcc 1f
++ vst4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0]!
++ vst4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0]!
++ vst4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0]!
++ vst4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0]!
++1:
++ bpl 1f
++ vst4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0]!
++ vst4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0]!
++1:
++ tst r3, #1
++ bxeq lr
++ vst4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0]!
++
++ bx lr
++
++.endm
++
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: R, G, B, A)
++@ r2 plane alpha
++@ r3 count (pixels)
++
++@ Whilst specified as RGBx+RGBA the only important part is the position of
++@ alpha, the other components are all treated the same
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: R, G, B, A) - same as above
++@ r2 plane alpha
++@ r3 count (pixels)
++ .align 16
++ .global blend_rgbx_rgba_neon
++#ifdef __ELF__
++ .type blend_rgbx_rgba_neon, %function
++#endif
++blend_rgbx_rgba_neon:
++ blend_main d16, d17, d18, d19, d20, d21, d22, d23
++
++
++@ [r0] RGBx dest (Byte order: R, G, B, x)
++@ [r1] RGBA src merge (Byte order: B, G, R, A) - B / R swapped
++@ r2 plane alpha
++@ r3 count (pixels)
++ .align 16
++ .global blend_bgrx_rgba_neon
++#ifdef __ELF__
++ .type blend_bgrx_rgba_neon, %function
++#endif
++blend_bgrx_rgba_neon:
++ blend_main d18, d17, d16, d19, d20, d21, d22, d23
++
++
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_rgba_neon.h
+@@ -0,0 +1,17 @@
++#ifndef HW_MMAL_BLEND_RGBA_NEON_H
++#define HW_MMAL_BLEND_RGBA_NEON_H
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++typedef void blend_neon_fn(void * dest, const void * src, int alpha, unsigned int n);
++extern blend_neon_fn blend_rgbx_rgba_neon;
++extern blend_neon_fn blend_bgrx_rgba_neon;
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/blend_test.c
+@@ -0,0 +1,180 @@
++#include <stdio.h>
++#include <stdint.h>
++#include <memory.h>
++
++#include "blend_rgba_neon.h"
++
++#define RPI_PROFILE 1
++#define RPI_PROC_ALLOC 1
++#include "rpi_prof.h"
++
++static inline unsigned div255(unsigned v)
++{
++ // This models what we do in the asm for / 255
++ // It generates something in the range [(i+126)/255, (i+127)/255] which is good enough
++ return ((v * 257) + 0x8000) >> 16;
++}
++
++static inline unsigned int a_merge(unsigned int dst, unsigned src, unsigned f)
++{
++ return div255((255 - f) * (dst) + src * f);
++}
++
++
++static void merge_line(void * dest, const void * src, int alpha, unsigned int n)
++{
++ unsigned int i;
++ const uint8_t * s_data = src;
++ uint8_t * d_data = dest;
++
++ for (i = 0; i != n; ++i) {
++ const uint32_t s_pel = ((const uint32_t *)s_data)[i];
++ const uint32_t d_pel = ((const uint32_t *)d_data)[i];
++ const unsigned int a = div255(alpha * (s_pel >> 24));
++ ((uint32_t *)d_data)[i] = 0xff000000 |
++ (a_merge((d_pel >> 16) & 0xff, (s_pel >> 16) & 0xff, a) << 16) |
++ (a_merge((d_pel >> 8) & 0xff, (s_pel >> 8) & 0xff, a) << 8 ) |
++ (a_merge((d_pel >> 0) & 0xff, (s_pel >> 0) & 0xff, a) << 0 );
++ }
++}
++
++
++// As merge_line() but src bits 0-7 and 16-23 are swapped relative to dest
++static void merge_line2(void * dest, const void * src, int alpha, unsigned int n)
++{
++ const uint32_t * const s_pels = src;
++ uint32_t * const d_pels = dest;
++ unsigned int k;
++
++ for (k = 0; k < n; ++k) {
++ const uint32_t sp = s_pels[k];
++ const uint32_t dp = d_pels[k];
++ const unsigned int a = div255(alpha * (sp >> 24)); // plane alpha * per-pel alpha (top byte of src)
++ d_pels[k] = 0xff000000 | // top byte of the result is always 0xff
++ (a_merge(dp & 0xff, (sp >> 16) & 0xff, a)) |
++ (a_merge((dp >> 8) & 0xff, (sp >> 8) & 0xff, a) << 8) |
++ (a_merge((dp >> 16) & 0xff, sp & 0xff, a) << 16);
++ }
++}
++
++#define BUF_SIZE 256
++#define BUF_SLACK 16
++#define BUF_ALIGN 64
++#define BUF_ALLOC (BUF_SIZE + 2*BUF_SLACK + BUF_ALIGN)
++
++static void test_line(const uint32_t * const dx, const unsigned int d_off,
++ const uint32_t * const sx, const unsigned int s_off,
++ const unsigned int alpha, const unsigned int len, const int prof_no)
++{
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t d1_buf[BUF_ALLOC];
++ const uint32_t * const s0 = sx + s_off;
++ // d0/d1: working copies rounded up to a BUF_ALIGN byte boundary, then shifted by d_off pels
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ uint32_t * const d1 = (uint32_t *)(((uintptr_t)d1_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ unsigned int i;
++ // Both copies start as the same dx data, BUF_SLACK pels on either side included
++ memcpy(d0, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ memcpy(d1, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ // d0 receives the C reference result (merge_line)
++ merge_line(d0 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ // d1 receives the NEON result; the call sits between the profiling macros
++ PROFILE_START();
++ blend_rgbx_rgba_neon(d1 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ PROFILE_ACC_N(prof_no);
++ // Compare slack pels too, so a write outside the len pels requested is reported as well
++ for (i = 0; i != BUF_SIZE + BUF_SLACK*2; ++i) {
++ if (d0[i] != d1[i]) {
++ printf("%3d: %08x + %08x * %02x: %08x / %08x: len=%d\n", (int)(i - BUF_SLACK), dx[i], s0[i], alpha, d0[i], d1[i], len);
++ }
++ }
++}
++
++static void test_line2(const uint32_t * const dx, const unsigned int d_off,
++ const uint32_t * const sx, const unsigned int s_off,
++ const unsigned int alpha, const unsigned int len, const int prof_no)
++{
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t d1_buf[BUF_ALLOC];
++ const uint32_t * const s0 = sx + s_off;
++ // d0/d1: working copies rounded up to a BUF_ALIGN byte boundary, then shifted by d_off pels
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ uint32_t * const d1 = (uint32_t *)(((uintptr_t)d1_buf + (BUF_ALIGN - 1)) & ~(BUF_ALIGN - 1)) + d_off;
++ unsigned int i;
++ // Both copies start as the same dx data, BUF_SLACK pels on either side included
++ memcpy(d0, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ memcpy(d1, dx, (BUF_SIZE + BUF_SLACK*2)*4);
++ // d0 receives the C reference result (merge_line2)
++ merge_line2(d0 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ // d1 receives the NEON result; the call sits between the profiling macros
++ PROFILE_START();
++ blend_bgrx_rgba_neon(d1 + BUF_SLACK, s0 + BUF_SLACK, alpha, len);
++ PROFILE_ACC_N(prof_no);
++ // Compare slack pels too, so a write outside the len pels requested is reported as well
++ for (i = 0; i != BUF_SIZE + BUF_SLACK*2; ++i) {
++ if (d0[i] != d1[i]) {
++ printf("%3d: %08x + %08x * %02x: %08x / %08x: len=%d\n", (int)(i - BUF_SLACK), dx[i], s0[i], alpha, d0[i], d1[i], len);
++ }
++ }
++}
++
++
++
++int main(int argc, char *argv[])
++{
++ unsigned int i, j;
++ uint32_t d0_buf[BUF_ALLOC];
++ uint32_t s0_buf[BUF_ALLOC];
++ // Views of the two pattern buffers rounded up to a 64-byte boundary
++ uint32_t * const d0 = (uint32_t *)(((uintptr_t)d0_buf + 63) & ~63) + 0;
++ uint32_t * const s0 = (uint32_t *)(((uintptr_t)s0_buf + 63) & ~63) + 0;
++
++ PROFILE_INIT();
++ // Check div255() against its documented range for every input it can be given, 255*255 included
++ for (i = 0; i <= 255*255; ++i) {
++ unsigned int a = div255(i);
++ unsigned int b = (i + 127)/255;
++ unsigned int c = (i + 126)/255;
++ if (a != b && a != c)
++ printf("%d/255: %d != %d/%d\n", i, a, b, c);
++ }
++ // Fill pattern: dest varies in its low byte, src varies in its top (alpha) byte
++ for (i = 0; i != BUF_ALLOC; ++i) {
++ d0_buf[i] = 0xff00 | i;
++ s0_buf[i] = (i << 24) | 0x40ffc0;
++ }
++ // Sweep plane alpha at full length, then sweep length at a fixed plane alpha
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, 0, s0, 0, i, 256, -1);
++ }
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, 0, s0, 0, 128, i, -1);
++ }
++ // All 16 combinations of dest offset (j & 3) and src offset (j >> 2), profiled per combination
++ for (j = 0; j != 16; ++j) {
++ for (i = 0; i != 256; ++i) {
++ test_line(d0, j & 3, s0, j >> 2, i, 256, j);
++ }
++ PROFILE_PRINTF_N(j);
++ PROFILE_CLEAR_N(j);
++ }
++ printf("Done 1\n");
++ // Same three passes again for the B / R swapped variant
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, 0, s0, 0, i, 256, -1);
++ }
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, 0, s0, 0, 128, i, -1);
++ }
++
++ for (j = 0; j != 16; ++j) {
++ for (i = 0; i != 256; ++i) {
++ test_line2(d0, j & 3, s0, j >> 2, i, 256, j);
++ }
++ PROFILE_PRINTF_N(j);
++ }
++ printf("Done 2\n");
++
++ return 0;
++}
++
+--- a/modules/hw/mmal/codec.c
++++ b/modules/hw/mmal/codec.c
+@@ -26,267 +26,443 @@
+ #include "config.h"
+ #endif
+
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
+-#include <vlc_atomic.h>
+ #include <vlc_plugin.h>
+ #include <vlc_codec.h>
++#include <vlc_filter.h>
+ #include <vlc_threads.h>
+
+-#include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+
++#include <interface/vcsm/user-vcsm.h>
++
++#include "mmal_cma.h"
+ #include "mmal_picture.h"
+
++#include "subpic.h"
++#include "blend_rgba_neon.h"
++
++#define TRACE_ALL 0
++
++#define OPT_TO_FROM_ZC 0
++
+ /*
+ * This seems to be a bit high, but reducing it causes instabilities
+ */
+ #define NUM_EXTRA_BUFFERS 5
++//#define NUM_EXTRA_BUFFERS 10
+ #define NUM_DECODER_BUFFER_HEADERS 30
+
+-#define MIN_NUM_BUFFERS_IN_TRANSIT 2
++#define CONVERTER_BUFFERS 4 // Buffers on the output of the converter
++
++#define MMAL_SLICE_HEIGHT 16
++#define MMAL_ALIGN_W 32
++#define MMAL_ALIGN_H 16
+
+ #define MMAL_OPAQUE_NAME "mmal-opaque"
+ #define MMAL_OPAQUE_TEXT N_("Decode frames directly into RPI VideoCore instead of host memory.")
+ #define MMAL_OPAQUE_LONGTEXT N_("Decode frames directly into RPI VideoCore instead of host memory. This option must only be used with the MMAL video output plugin.")
+
+-static int OpenDecoder(decoder_t *dec);
+-static void CloseDecoder(decoder_t *dec);
+-
+-vlc_module_begin()
+- set_shortname(N_("MMAL decoder"))
+- set_description(N_("MMAL-based decoder plugin for Raspberry Pi"))
+- set_capability("video decoder", 90)
+- add_shortcut("mmal_decoder")
+- add_bool(MMAL_OPAQUE_NAME, true, MMAL_OPAQUE_TEXT, MMAL_OPAQUE_LONGTEXT, false)
+- set_callbacks(OpenDecoder, CloseDecoder)
+-vlc_module_end()
++#define MMAL_RESIZE_NAME "mmal-resize"
++#define MMAL_RESIZE_TEXT N_("Use mmal resizer rather than hvs.")
++#define MMAL_RESIZE_LONGTEXT N_("Use mmal resizer rather than isp. This uses less gpu memory than the ISP but is slower.")
++
++#define MMAL_ISP_NAME "mmal-isp"
++#define MMAL_ISP_TEXT N_("Use mmal isp rather than hvs.")
++#define MMAL_ISP_LONGTEXT N_("Use mmal isp rather than hvs. This may be faster but has no blend.")
+
+-struct decoder_sys_t {
+- bool opaque;
++typedef struct decoder_sys_t
++{
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_POOL_T *input_pool;
+ MMAL_PORT_T *output;
+- MMAL_POOL_T *output_pool; /* only used for non-opaque mode */
++ hw_mmal_port_pool_ref_t *ppr;
+ MMAL_ES_FORMAT_T *output_format;
+- vlc_sem_t sem;
+
++ MMAL_STATUS_T err_stream;
+ bool b_top_field_first;
+ bool b_progressive;
+
++ bool b_flushed;
++
++ vcsm_init_type_t vcsm_init_type;
++
++ // Lock to avoid pic update & allocate happening simultaneously
++ // * We should be able to arrange life s.t. this isn't needed
++ // but while we are confused apply belt & braces
++ vlc_mutex_t pic_lock;
++
+ /* statistics */
+- int output_in_transit;
+- int input_in_transit;
+ atomic_bool started;
+-};
++} decoder_sys_t;
+
+-/* Utilities */
+-static int change_output_format(decoder_t *dec);
+-static int send_output_buffer(decoder_t *dec);
+-static void fill_output_port(decoder_t *dec);
+-
+-/* VLC decoder callback */
+-static int decode(decoder_t *dec, block_t *block);
+-static void flush_decoder(decoder_t *dec);
+-
+-/* MMAL callbacks */
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+
+-static int OpenDecoder(decoder_t *dec)
+-{
+- int ret = VLC_SUCCESS;
+- decoder_sys_t *sys;
+- MMAL_PARAMETER_UINT32_T extra_buffers;
+- MMAL_STATUS_T status;
++typedef struct supported_mmal_enc_s {
++ struct { // parameter block passed to mmal_port_parameter_get(); header comes first
++ MMAL_PARAMETER_HEADER_T header;
++ MMAL_FOURCC_T encodings[64];
++ } supported;
++ int n; // number of valid entries in encodings[]; -1 means the port has not been queried yet
++} supported_mmal_enc_t;
++
++#define SUPPORTED_MMAL_ENC_INIT \
++{ \
++ {{MMAL_PARAMETER_SUPPORTED_ENCODINGS, sizeof(((supported_mmal_enc_t *)0)->supported)}, {0}}, \
++ -1 \
++}
+
+- if (dec->fmt_in.i_codec != VLC_CODEC_MPGV &&
+- dec->fmt_in.i_codec != VLC_CODEC_H264)
+- return VLC_EGENERIC;
++static supported_mmal_enc_t supported_decode_in_enc = SUPPORTED_MMAL_ENC_INIT;
+
+- sys = calloc(1, sizeof(decoder_sys_t));
+- if (!sys) {
+- ret = VLC_ENOMEM;
+- goto out;
++static bool is_enc_supported(supported_mmal_enc_t * const support, const MMAL_FOURCC_T fcc)
++{
++ int i;
++
++ if (fcc == 0)
++ return false;
++ if (support->n == -1)
++ return true; // Unknown - say OK
++ for (i = 0; i < support->n; ++i) {
++ if (support->supported.encodings[i] == fcc)
++ return true;
+ }
+- dec->p_sys = sys;
++ return false;
++}
+
+- sys->opaque = var_InheritBool(dec, MMAL_OPAQUE_NAME);
+- bcm_host_init();
++static bool set_and_test_enc_supported(supported_mmal_enc_t * const support, MMAL_PORT_T * port, const MMAL_FOURCC_T fcc)
++{
++ if (support->n >= 0)
++ /* already done */;
++ else if (mmal_port_parameter_get(port, (MMAL_PARAMETER_HEADER_T *)&support->supported) != MMAL_SUCCESS)
++ support->n = 0;
++ else
++ support->n = (support->supported.header.size - sizeof(support->supported.header)) /
++ sizeof(support->supported.encodings[0]);
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++ return is_enc_supported(support, fcc);
++}
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++static MMAL_FOURCC_T vlc_to_mmal_es_fourcc(const unsigned int fcc)
++{
++ switch (fcc){
++ case VLC_CODEC_MJPG:
++ return MMAL_ENCODING_MJPEG;
++ case VLC_CODEC_MP1V:
++ return MMAL_ENCODING_MP1V;
++ case VLC_CODEC_MPGV:
++ case VLC_CODEC_MP2V:
++ return MMAL_ENCODING_MP2V;
++ case VLC_CODEC_H263:
++ return MMAL_ENCODING_H263;
++ case VLC_CODEC_MP4V:
++ return MMAL_ENCODING_MP4V;
++ case VLC_CODEC_H264:
++ return MMAL_ENCODING_H264;
++ case VLC_CODEC_VP6:
++ return MMAL_ENCODING_VP6;
++ case VLC_CODEC_VP8:
++ return MMAL_ENCODING_VP8;
++ case VLC_CODEC_WMV1:
++ return MMAL_ENCODING_WMV1;
++ case VLC_CODEC_WMV2:
++ return MMAL_ENCODING_WMV2;
++ case VLC_CODEC_WMV3:
++ return MMAL_ENCODING_WMV3;
++ case VLC_CODEC_VC1:
++ return MMAL_ENCODING_WVC1;
++ case VLC_CODEC_THEORA:
++ return MMAL_ENCODING_THEORA;
++ default:
++ break;
+ }
++ return 0;
++}
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
+- if (dec->fmt_in.i_codec == VLC_CODEC_MPGV)
+- sys->input->format->encoding = MMAL_ENCODING_MP2V;
+- else
+- sys->input->format->encoding = MMAL_ENCODING_H264;
++static MMAL_FOURCC_T pic_to_slice_mmal_fourcc(const MMAL_FOURCC_T fcc) // map a picture encoding to its *_SLICE counterpart
++{
++ switch (fcc){
++ case MMAL_ENCODING_I420:
++ return MMAL_ENCODING_I420_SLICE;
++ case MMAL_ENCODING_I422:
++ return MMAL_ENCODING_I422_SLICE;
++ case MMAL_ENCODING_ARGB:
++ return MMAL_ENCODING_ARGB_SLICE;
++ case MMAL_ENCODING_RGBA:
++ return MMAL_ENCODING_RGBA_SLICE;
++ case MMAL_ENCODING_ABGR:
++ return MMAL_ENCODING_ABGR_SLICE;
++ case MMAL_ENCODING_BGRA:
++ return MMAL_ENCODING_BGRA_SLICE;
++ case MMAL_ENCODING_RGB16:
++ return MMAL_ENCODING_RGB16_SLICE;
++ case MMAL_ENCODING_RGB24:
++ return MMAL_ENCODING_RGB24_SLICE;
++ case MMAL_ENCODING_RGB32:
++ return MMAL_ENCODING_RGB32_SLICE;
++ case MMAL_ENCODING_BGR16:
++ return MMAL_ENCODING_BGR16_SLICE;
++ case MMAL_ENCODING_BGR24:
++ return MMAL_ENCODING_BGR24_SLICE;
++ case MMAL_ENCODING_BGR32:
++ return MMAL_ENCODING_BGR32_SLICE;
++ default:
++ break;
++ }
++ return 0; // fcc has no entry in the table above
++}
+
+- if (dec->fmt_in.i_codec == VLC_CODEC_H264) {
+- if (dec->fmt_in.i_extra > 0) {
+- status = mmal_format_extradata_alloc(sys->input->format,
+- dec->fmt_in.i_extra);
+- if (status == MMAL_SUCCESS) {
+- memcpy(sys->input->format->extradata, dec->fmt_in.p_extra,
+- dec->fmt_in.i_extra);
+- sys->input->format->extradata_size = dec->fmt_in.i_extra;
+- } else {
+- msg_Err(dec, "Failed to allocate extra format data on input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- }
++#define DEBUG_SQUARES 0
++#if DEBUG_SQUARES
++static void draw_square(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int w, unsigned int h, uint32_t val)
++{
++ uint32_t * p = (uint32_t *)pic_buf + y * pic_stride + x;
++ unsigned int i;
++ for (i = 0; i != h; ++i) {
++ unsigned int j;
++ for (j = 0; j != w; ++j) {
++ p[j] = val;
+ }
++ p += pic_stride;
+ }
++}
++#endif
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if 0
++static inline void draw_line(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int len, int inc)
++{
++ uint32_t * p = (uint32_t *)pic_buf + y * pic_stride + x;
++ while (len-- != 0) {
++ *p = ~0U;
++ p += inc;
+ }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
+- sys->input->buffer_num = sys->input->buffer_num_recommended;
++}
+
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
+
+- sys->output = sys->component->output[0];
+- sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++static void draw_corners(void * pic_buf, size_t pic_stride, unsigned int x, unsigned int y, unsigned int w, unsigned int h)
++{
++ const unsigned int len = 20;
++ draw_line(pic_buf, pic_stride, x, y, len, 1);
++ draw_line(pic_buf, pic_stride, x, y, len, pic_stride);
++ draw_line(pic_buf, pic_stride, x + w - 1, y, len, -1);
++ draw_line(pic_buf, pic_stride, x + w - 1, y, len, pic_stride);
++ draw_line(pic_buf, pic_stride, x + w - 1, y + h - 1, len, -1);
++ draw_line(pic_buf, pic_stride, x + w - 1, y + h - 1, len, -(int)pic_stride);
++ draw_line(pic_buf, pic_stride, x, y + h - 1, len, 1);
++ draw_line(pic_buf, pic_stride, x, y + h - 1, len, -(int)pic_stride);
++}
++#endif
+
+- if (sys->opaque) {
+- extra_buffers.hdr.id = MMAL_PARAMETER_EXTRA_BUFFERS;
+- extra_buffers.hdr.size = sizeof(MMAL_PARAMETER_UINT32_T);
+- extra_buffers.value = NUM_EXTRA_BUFFERS;
+- status = mmal_port_parameter_set(sys->output, &extra_buffers.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++static MMAL_RATIONAL_T
++rationalize_sar(unsigned int num, unsigned int den)
++{
++ static const unsigned int primes[] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 0};
++ const unsigned int * p = primes;
+
+- msg_Dbg(dec, "Activate zero-copy for output port");
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++ // If either num or den is 0 then return a well formed "unknown"
++ if (num == 0 || den == 0) {
++ return (MMAL_RATIONAL_T){.num = 0, .den = 0};
++ }
+
+- status = mmal_port_parameter_set(sys->output, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- goto out;
++ while (*p != 0 && num >= *p && den >= *p) {
++ if (num % *p != 0 || den % *p != 0)
++ ++p;
++ else {
++ num /= *p;
++ den /= *p;
+ }
+ }
++ return (MMAL_RATIONAL_T){.num = num, .den = den};
++}
+
+- status = mmal_port_enable(sys->output, output_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable output port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++// Buffer either attached to pic or released
++static picture_t * alloc_opaque_pic(decoder_t * const dec, MMAL_BUFFER_HEADER_T * const buf)
++{
++ decoder_sys_t *const dec_sys = dec->p_sys;
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ vlc_mutex_lock(&dec_sys->pic_lock);
++ picture_t * const pic = decoder_NewPicture(dec);
++ vlc_mutex_unlock(&dec_sys->pic_lock);
++
++ if (pic == NULL)
++ goto fail1;
++
++ if (buf->length == 0) {
++ msg_Err(dec, "%s: Empty buffer", __func__);
++ goto fail2;
+ }
+
+- sys->input_pool = mmal_pool_create(sys->input->buffer_num, 0);
++ if ((pic->context = hw_mmal_gen_context(buf, dec_sys->ppr)) == NULL)
++ goto fail2;
+
+- if (sys->opaque) {
+- dec->fmt_out.i_codec = VLC_CODEC_MMAL_OPAQUE;
+- dec->fmt_out.video.i_chroma = VLC_CODEC_MMAL_OPAQUE;
+- } else {
+- dec->fmt_out.i_codec = VLC_CODEC_I420;
+- dec->fmt_out.video.i_chroma = VLC_CODEC_I420;
++ buf_to_pic_copy_props(pic, buf);
++
++#if TRACE_ALL
++ msg_Dbg(dec, "pic: prog=%d, tff=%d, date=%lld", pic->b_progressive, pic->b_top_field_first, (long long)pic->date);
++#endif
++
++ return pic;
++
++fail2:
++ picture_Release(pic);
++fail1:
++ // Recycle rather than release to avoid buffer starvation if NewPic fails
++ hw_mmal_port_pool_ref_recycle(dec_sys->ppr, buf);
++ return NULL;
++}
++
++static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ decoder_t *dec = (decoder_t *)port->userdata;
++ MMAL_STATUS_T status;
++
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: cmd=%d, data=%p", __func__, buffer->cmd, buffer->data);
++#endif
++
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data;
++ dec->p_sys->err_stream = status;
++ msg_Err(dec, "MMAL error %"PRIx32" \"%s\"", status,
++ mmal_status_to_string(status));
+ }
+
+- dec->pf_decode = decode;
+- dec->pf_flush = flush_decoder;
++ mmal_buffer_header_release(buffer);
++}
+
+- vlc_sem_init(&sys->sem, 0);
++static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++ block_t * const block = (block_t *)buffer->user_data;
+
+-out:
+- if (ret != VLC_SUCCESS)
+- CloseDecoder(dec);
++ (void)port; // Unused
+
+- return ret;
++#if TRACE_ALL
++ msg_Dbg((decoder_t *)port->userdata, "<<< %s: cmd=%d, data=%p, len=%d/%d, pts=%lld", __func__,
++ buffer->cmd, buffer->data, buffer->length, buffer->alloc_size, (long long)buffer->pts);
++#endif
++
++ mmal_buffer_header_reset(buffer);
++ mmal_buffer_header_release(buffer);
++
++ if (block != NULL)
++ block_Release(block);
+ }
+
+-static void CloseDecoder(decoder_t *dec)
++static void decoder_output_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ decoder_t * const dec = (decoder_t *)port->userdata;
+
+- if (!sys)
++ if (buffer->cmd == 0 && buffer->length != 0)
++ {
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: cmd=%d, data=%p, len=%d/%d, pts=%lld", __func__,
++ buffer->cmd, buffer->data, buffer->length, buffer->alloc_size, (long long)buffer->pts);
++#endif
++
++ picture_t *pic = alloc_opaque_pic(dec, buffer);
++#if TRACE_ALL
++ msg_Dbg(dec, "flags=%#x, video flags=%#x", buffer->flags, buffer->type->video.flags);
++#endif
++ if (pic == NULL)
++ msg_Err(dec, "Failed to allocate new picture");
++ else
++ decoder_QueueVideo(dec, pic);
++ // Buffer released or attached to pic - do not release again
+ return;
++ }
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED)
++ {
++ decoder_sys_t * const sys = dec->p_sys;
++ MMAL_EVENT_FORMAT_CHANGED_T * const fmt = mmal_event_format_changed_get(buffer);
++ MMAL_ES_FORMAT_T * const format = mmal_format_alloc();
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ if (format == NULL)
++ msg_Err(dec, "Failed to allocate new format");
++ else
++ {
++ mmal_format_full_copy(format, fmt->format);
++ format->encoding = MMAL_ENCODING_OPAQUE;
+
+- if (sys->output && sys->output->is_enabled)
+- mmal_port_disable(sys->output);
++ // If no PAR in the stream - see if we've got one from the demux
++ if (format->es->video.par.den <= 0 || format->es->video.par.num <= 0) {
++ unsigned int n = dec->fmt_in.video.i_sar_num;
++ unsigned int d = dec->fmt_in.video.i_sar_den;
++
++ if (n == 0 || d == 0) {
++ // Guesswork required
++ const unsigned int w = format->es->video.width;
++ const unsigned int h = format->es->video.height;
++ if ((w == 704 || w == 720) && (h == 480 || h == 576)) {
++ // Very likely SD 4:3
++ n = w * 3;
++ d = h * 4;
++ }
++ else
++ {
++ // Otherwise guess SAR 1:1
++ n = 1;
++ d = 1;
++ }
++ }
+
+- if (sys->component && sys->component->is_enabled)
+- mmal_component_disable(sys->component);
++ format->es->video.par = rationalize_sar(n, d);
++ }
+
+- if (sys->input_pool)
+- mmal_pool_destroy(sys->input_pool);
++ if (sys->output_format != NULL)
++ mmal_format_free(sys->output_format);
+
+- if (sys->output_format)
+- mmal_format_free(sys->output_format);
++ sys->output_format = format;
++ }
++ }
++ else if (buffer->cmd != 0) {
++ char buf0[5];
++ msg_Warn(dec, "Unexpected output cb event: %s", str_fourcc(buf0, buffer->cmd));
++ }
+
+- if (sys->output_pool)
+- mmal_pool_destroy(sys->output_pool);
++ // If we get here then we were flushing (cmd == 0 && len == 0) or
++ // that was an EVENT - in either case we want to release the buffer
++ // back to its pool rather than recycle it.
++ mmal_buffer_header_reset(buffer);
++ buffer->user_data = NULL;
++ mmal_buffer_header_release(buffer);
++}
+
+- if (sys->component)
+- mmal_component_release(sys->component);
+
+- vlc_sem_destroy(&sys->sem);
+- free(sys);
+
+- bcm_host_deinit();
++static void fill_output_port(decoder_t *dec)
++{
++ decoder_sys_t *sys = dec->p_sys;
++
++ if (decoder_UpdateVideoFormat(dec) != 0)
++ {
++ // If we have a new format don't bother stuffing the buffer
++ // We should get a reset RSN
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: Updated", __func__);
++#endif
++
++ return;
++ }
++
++ hw_mmal_port_pool_ref_fill(sys->ppr);
++ return;
+ }
+
+ static int change_output_format(decoder_t *dec)
+ {
+ MMAL_PARAMETER_VIDEO_INTERLACE_TYPE_T interlace_type;
+- decoder_sys_t *sys = dec->p_sys;
++ decoder_sys_t * const sys = dec->p_sys;
+ MMAL_STATUS_T status;
+- int pool_size;
+ int ret = 0;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
+ if (atomic_load(&sys->started)) {
+ mmal_format_full_copy(sys->output->format, sys->output_format);
+ status = mmal_port_format_commit(sys->output);
+@@ -300,7 +476,9 @@ static int change_output_format(decoder_
+ }
+
+ port_reset:
++#if TRACE_ALL
+ msg_Dbg(dec, "%s: Do full port reset", __func__);
++#endif
+ status = mmal_port_disable(sys->output);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to disable output port (status=%"PRIx32" %s)",
+@@ -310,6 +488,7 @@ port_reset:
+ }
+
+ mmal_format_full_copy(sys->output->format, sys->output_format);
++
+ status = mmal_port_format_commit(sys->output);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to commit output format (status=%"PRIx32" %s)",
+@@ -318,18 +497,10 @@ port_reset:
+ goto out;
+ }
+
+- if (sys->opaque) {
+- sys->output->buffer_num = NUM_DECODER_BUFFER_HEADERS;
+- pool_size = NUM_DECODER_BUFFER_HEADERS;
+- } else {
+- sys->output->buffer_num = __MAX(sys->output->buffer_num_recommended,
+- MIN_NUM_BUFFERS_IN_TRANSIT);
+- pool_size = sys->output->buffer_num;
+- }
+-
++ sys->output->buffer_num = NUM_DECODER_BUFFER_HEADERS;
+ sys->output->buffer_size = sys->output->buffer_size_recommended;
+
+- status = mmal_port_enable(sys->output, output_port_cb);
++ status = mmal_port_enable(sys->output, decoder_output_cb);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to enable output port (status=%"PRIx32" %s)",
+ status, mmal_status_to_string(status));
+@@ -338,25 +509,14 @@ port_reset:
+ }
+
+ if (!atomic_load(&sys->started)) {
+- if (!sys->opaque) {
+- sys->output_pool = mmal_port_pool_create(sys->output, pool_size, 0);
+- msg_Dbg(dec, "Created output pool with %d pictures", sys->output_pool->headers_num);
+- }
+-
+ atomic_store(&sys->started, true);
+
+ /* we need one picture from vout for each buffer header on the output
+ * port */
+- dec->i_extra_picture_buffers = pool_size;
+-
+- /* remove what VLC core reserves as it is part of the pool_size
+- * already */
+- if (dec->fmt_in.i_codec == VLC_CODEC_H264)
+- dec->i_extra_picture_buffers -= 19;
+- else
+- dec->i_extra_picture_buffers -= 3;
+-
++ dec->i_extra_picture_buffers = 10;
++#if TRACE_ALL
+ msg_Dbg(dec, "Request %d extra pictures", dec->i_extra_picture_buffers);
++#endif
+ }
+
+ apply_fmt:
+@@ -366,8 +526,8 @@ apply_fmt:
+ dec->fmt_out.video.i_y_offset = sys->output->format->es->video.crop.y;
+ dec->fmt_out.video.i_visible_width = sys->output->format->es->video.crop.width;
+ dec->fmt_out.video.i_visible_height = sys->output->format->es->video.crop.height;
+- dec->fmt_out.video.i_sar_num = sys->output->format->es->video.par.num;
+- dec->fmt_out.video.i_sar_den = sys->output->format->es->video.par.den;
++ dec->fmt_out.video.i_sar_num = sys->output_format->es->video.par.num; // SAR can be killed by commit
++ dec->fmt_out.video.i_sar_den = sys->output_format->es->video.par.den;
+ dec->fmt_out.video.i_frame_rate = sys->output->format->es->video.frame_rate.num;
+ dec->fmt_out.video.i_frame_rate_base = sys->output->format->es->video.frame_rate.den;
+
+@@ -382,12 +542,19 @@ apply_fmt:
+ sys->b_progressive = (interlace_type.eMode == MMAL_InterlaceProgressive);
+ sys->b_top_field_first = sys->b_progressive ? true :
+ (interlace_type.eMode == MMAL_InterlaceFieldsInterleavedUpperFirst);
++#if TRACE_ALL
+ msg_Dbg(dec, "Detected %s%s video (%d)",
+ sys->b_progressive ? "progressive" : "interlaced",
+ sys->b_progressive ? "" : (sys->b_top_field_first ? " tff" : " bff"),
+ interlace_type.eMode);
++#endif
+ }
+
++ // Tell the rest of the world we have changed format
++ vlc_mutex_lock(&sys->pic_lock);
++ ret = decoder_UpdateVideoFormat(dec);
++ vlc_mutex_unlock(&sys->pic_lock);
++
+ out:
+ mmal_format_free(sys->output_format);
+ sys->output_format = NULL;
+@@ -395,144 +562,85 @@ out:
+ return ret;
+ }
+
+-static int send_output_buffer(decoder_t *dec)
++static MMAL_STATUS_T
++set_extradata_and_commit(decoder_t * const dec, decoder_sys_t * const sys)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- picture_sys_t *p_sys;
+- picture_t *picture = NULL;
+ MMAL_STATUS_T status;
+- unsigned buffer_size = 0;
+- int ret = 0;
+
+- if (!sys->output->is_enabled)
+- return VLC_EGENERIC;
+-
+- /* If local output pool is allocated, use it - this is only the case for
+- * non-opaque modes */
+- if (sys->output_pool) {
+- buffer = mmal_queue_get(sys->output_pool->queue);
+- if (!buffer) {
+- msg_Warn(dec, "Failed to get new buffer");
+- return VLC_EGENERIC;
+- }
+- }
+-
+- if (!decoder_UpdateVideoFormat(dec))
+- picture = decoder_NewPicture(dec);
+- if (!picture) {
+- msg_Warn(dec, "Failed to get new picture");
+- ret = -1;
+- goto err;
+- }
+-
+- p_sys = picture->p_sys;
+- for (int i = 0; i < picture->i_planes; i++)
+- buffer_size += picture->p[i].i_lines * picture->p[i].i_pitch;
+-
+- if (sys->output_pool) {
+- mmal_buffer_header_reset(buffer);
+- buffer->alloc_size = sys->output->buffer_size;
+- if (buffer_size < sys->output->buffer_size) {
+- msg_Err(dec, "Retrieved picture with too small data block (%d < %d)",
+- buffer_size, sys->output->buffer_size);
+- ret = VLC_EGENERIC;
+- goto err;
+- }
+-
+- if (!sys->opaque)
+- buffer->data = picture->p[0].p_pixels;
+- } else {
+- buffer = p_sys->buffer;
+- if (!buffer) {
+- msg_Warn(dec, "Picture has no buffer attached");
+- picture_Release(picture);
+- return VLC_EGENERIC;
+- }
+- buffer->data = p_sys->buffer->data;
+- }
+- buffer->user_data = picture;
+- buffer->cmd = 0;
+-
+- status = mmal_port_send_buffer(sys->output, buffer);
++ status = mmal_port_format_commit(sys->input);
+ if (status != MMAL_SUCCESS) {
+- msg_Err(dec, "Failed to send buffer to output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = -1;
+- goto err;
+- }
+- atomic_fetch_add(&sys->output_in_transit, 1);
+-
+- return ret;
+-
+-err:
+- if (picture)
+- picture_Release(picture);
+- if (sys->output_pool && buffer) {
+- buffer->data = NULL;
+- mmal_buffer_header_release(buffer);
++ msg_Err(dec, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
+ }
+- return ret;
++ return status;
+ }
+
+-static void fill_output_port(decoder_t *dec)
++static MMAL_STATUS_T decoder_send_extradata(decoder_t * const dec, decoder_sys_t *const sys)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+-
+- unsigned max_buffers_in_transit = 0;
+- int buffers_available = 0;
+- int buffers_to_send = 0;
+- int i;
++ if (dec->fmt_in.i_codec == VLC_CODEC_H264 &&
++ dec->fmt_in.i_extra > 0)
++ {
++ MMAL_BUFFER_HEADER_T * const buf = mmal_queue_wait(sys->input_pool->queue);
++ MMAL_STATUS_T status;
++
++ mmal_buffer_header_reset(buf);
++ buf->cmd = 0;
++ buf->user_data = NULL;
++ buf->alloc_size = sys->input->buffer_size;
++ buf->length = dec->fmt_in.i_extra;
++ buf->data = dec->fmt_in.p_extra;
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_CONFIG;
+
+- if (sys->output_pool) {
+- max_buffers_in_transit = __MAX(sys->output_pool->headers_num,
+- MIN_NUM_BUFFERS_IN_TRANSIT);
+- buffers_available = mmal_queue_length(sys->output_pool->queue);
+- } else {
+- max_buffers_in_transit = NUM_DECODER_BUFFER_HEADERS;
+- buffers_available = NUM_DECODER_BUFFER_HEADERS - atomic_load(&sys->output_in_transit);
++ status = mmal_port_send_buffer(sys->input, buf);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to send extradata buffer to input port (status=%"PRIx32" %s)",
++ status, mmal_status_to_string(status));
++ return status;
++ }
+ }
+- buffers_to_send = max_buffers_in_transit - atomic_load(&sys->output_in_transit);
+
+- if (buffers_to_send > buffers_available)
+- buffers_to_send = buffers_available;
+-
+-#ifndef NDEBUG
+- msg_Dbg(dec, "Send %d buffers to output port (available: %d, "
+- "in_transit: %d, buffer_num: %d)",
+- buffers_to_send, buffers_available,
+- atomic_load(&sys->output_in_transit),
+- sys->output->buffer_num);
+-#endif
+- for (i = 0; i < buffers_to_send; ++i)
+- if (send_output_buffer(dec) < 0)
+- break;
++ return MMAL_SUCCESS;
+ }
+
+ static void flush_decoder(decoder_t *dec)
+ {
+- decoder_sys_t *sys = dec->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- MMAL_STATUS_T status;
++ decoder_sys_t *const sys = dec->p_sys;
+
+- msg_Dbg(dec, "Flushing decoder ports...");
+- mmal_port_flush(sys->output);
+- mmal_port_flush(sys->input);
+-
+- while (atomic_load(&sys->output_in_transit) ||
+- atomic_load(&sys->input_in_transit))
+- vlc_sem_wait(&sys->sem);
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
++ if (!sys->b_flushed) { // Nothing to do if no input buffer has been sent since the last flush
++ mmal_port_disable(sys->input); // Return values of the disable/enable calls are not checked here
++ mmal_port_disable(sys->output);
++ // We can leave the input disabled, but we want the output enabled
++ // in order to sink any buffers returning from other modules
++ mmal_port_enable(sys->output, decoder_output_cb);
++ sys->b_flushed = true; // decode() clears this again once it has sent an input buffer
++ }
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: >>>", __func__);
++#endif
+ }
+
+ static int decode(decoder_t *dec, block_t *block)
+ {
+ decoder_sys_t *sys = dec->p_sys;
+ MMAL_BUFFER_HEADER_T *buffer;
+- bool need_flush = false;
+ uint32_t len;
+- uint32_t flags = 0;
++ uint32_t flags = MMAL_BUFFER_HEADER_FLAG_FRAME_START;
+ MMAL_STATUS_T status;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "<<< %s: %lld/%lld", __func__, block == NULL ? -1LL : block->i_dts, block == NULL ? -1LL : block->i_pts);
++#endif
++
++ if (sys->err_stream != MMAL_SUCCESS) {
++ msg_Err(dec, "MMAL error reported by ctrl");
++ flush_decoder(dec);
++ return VLCDEC_ECRITICAL; /// I think they are all fatal
++ }
++
+ /*
+ * Configure output port if necessary
+ */
+@@ -541,18 +649,50 @@ static int decode(decoder_t *dec, block_
+ msg_Err(dec, "Failed to change output port format");
+ }
+
+- if (!block)
+- goto out;
++ if (block == NULL)
++ return VLCDEC_SUCCESS;
+
+ /*
+ * Check whether full flush is required
+ */
+- if (block && block->i_flags & BLOCK_FLAG_DISCONTINUITY) {
++ if (block->i_flags & BLOCK_FLAG_DISCONTINUITY) {
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: >>> Discontinuity", __func__);
++#endif
+ flush_decoder(dec);
++ }
++
++ if (block->i_buffer == 0)
++ {
+ block_Release(block);
+ return VLCDEC_SUCCESS;
+ }
+
++ // Reenable stuff if the last thing we did was flush
++ if (!sys->output->is_enabled &&
++ (status = mmal_port_enable(sys->output, decoder_output_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(dec, "Output port enable failed");
++ goto fail;
++ }
++
++ if (!sys->input->is_enabled)
++ {
++ if ((status = set_extradata_and_commit(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ if ((status = mmal_port_enable(sys->input, input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(dec, "Input port enable failed");
++ goto fail;
++ }
++
++ if ((status = decoder_send_extradata(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++ }
++
++ // *** We cannot get a picture to put the result in 'till we have
++ // reported the size & the output stages have been set up
+ if (atomic_load(&sys->started))
+ fill_output_port(dec);
+
+@@ -563,18 +703,21 @@ static int decode(decoder_t *dec, block_
+ if (block->i_flags & BLOCK_FLAG_CORRUPTED)
+ flags |= MMAL_BUFFER_HEADER_FLAG_CORRUPTED;
+
+- while (block && block->i_buffer > 0) {
+- buffer = mmal_queue_timedwait(sys->input_pool->queue, 100);
++ while (block != NULL)
++ {
++ buffer = mmal_queue_wait(sys->input_pool->queue);
+ if (!buffer) {
+ msg_Err(dec, "Failed to retrieve buffer header for input data");
+- need_flush = true;
+- break;
++ goto fail;
+ }
++
+ mmal_buffer_header_reset(buffer);
+ buffer->cmd = 0;
+- buffer->pts = block->i_pts != 0 ? block->i_pts : block->i_dts;
++ buffer->pts = block->i_pts != VLC_TICK_INVALID ? block->i_pts :
++ block->i_dts != VLC_TICK_INVALID ? block->i_dts : MMAL_TIME_UNKNOWN;
+ buffer->dts = block->i_dts;
+ buffer->alloc_size = sys->input->buffer_size;
++ buffer->user_data = NULL;
+
+ len = block->i_buffer;
+ if (len > buffer->alloc_size)
+@@ -585,94 +728,1808 @@ static int decode(decoder_t *dec, block_
+ block->i_buffer -= len;
+ buffer->length = len;
+ if (block->i_buffer == 0) {
++ flags |= MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++ if (block->i_flags & BLOCK_FLAG_END_OF_SEQUENCE) {
++ msg_Dbg(dec, "EOS sent");
++ flags |= MMAL_BUFFER_HEADER_FLAG_EOS;
++ }
+ buffer->user_data = block;
+ block = NULL;
+ }
+ buffer->flags = flags;
+
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: -- Send buffer: cmd=%d, data=%p, size=%d, len=%d, offset=%d, flags=%#x, pts=%lld, dts=%lld", __func__,\
++ buffer->cmd, buffer->data, buffer->alloc_size, buffer->length, buffer->offset,
++ buffer->flags, (long long)buffer->pts, (long long)buffer->dts);
++#endif
+ status = mmal_port_send_buffer(sys->input, buffer);
+ if (status != MMAL_SUCCESS) {
+ msg_Err(dec, "Failed to send buffer to input port (status=%"PRIx32" %s)",
+ status, mmal_status_to_string(status));
+- break;
++ goto fail;
+ }
+- atomic_fetch_add(&sys->input_in_transit, 1);
++
++ // Reset flushed flag once we have sent a buf
++ sys->b_flushed = false;
++ flags &= ~MMAL_BUFFER_HEADER_FLAG_FRAME_START;
+ }
++ return VLCDEC_SUCCESS;
+
+-out:
+- if (need_flush)
+- flush_decoder(dec);
++fail:
++ flush_decoder(dec);
++ return VLCDEC_ECRITICAL;
+
+- return VLCDEC_SUCCESS;
+ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++static void CloseDecoder(decoder_t *dec) // Tear down what OpenDecoder() set up; also used as OpenDecoder()'s failure path
+ {
+- decoder_t *dec = (decoder_t *)port->userdata;
++ decoder_sys_t *sys = dec->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(dec, "%s: <<<", __func__);
++#endif
++
++ if (!sys) // No private state attached to dec, so nothing to free
++ return;
++
++ if (sys->component != NULL) { // sys->input / sys->output are taken from the component in OpenDecoder()
++ if (sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ if (sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
++
++ if (sys->component->is_enabled)
++ mmal_component_disable(sys->component);
++
++ mmal_component_release(sys->component);
++ }
++
++ if (sys->input_pool != NULL)
++ mmal_pool_destroy(sys->input_pool);
++
++ if (sys->output_format != NULL)
++ mmal_format_free(sys->output_format);
++
++ hw_mmal_port_pool_ref_release(sys->ppr, false); // NOTE(review): reached even when ppr was never set - presumably NULL-safe, confirm
++
++ cma_vcsm_exit(sys->vcsm_init_type); // Counterpart of cma_vcsm_init() in OpenDecoder()
++
++ vlc_mutex_destroy(&sys->pic_lock);
++ free(sys); // NOTE(review): dec->p_sys is not cleared here and still points at the freed block
++}
++
++static int OpenDecoder(decoder_t *dec) // Create and configure the MMAL video decoder; returns 0 on success, else a VLC error code
++{
++ int ret = VLC_EGENERIC; // Default failure code; only the calloc failure below changes it
++ decoder_sys_t *sys;
+ MMAL_STATUS_T status;
++ const MMAL_FOURCC_T in_fcc = vlc_to_mmal_es_fourcc(dec->fmt_in.i_codec);
++
++#if TRACE_ALL || 1
++ {
++ char buf1[5], buf2[5], buf2a[5];
++ char buf3[5], buf4[5];
++ MMAL_RATIONAL_T r = rationalize_sar(dec->fmt_in.video.i_sar_num, dec->fmt_in.video.i_sar_den);
++
++ msg_Dbg(dec, "%s: <<< (%s/%s)[%s] %dx%d %d/%d=%d/%d o:%#x -> (%s/%s) %dx%d %d/%d o:%#x", __func__,
++ str_fourcc(buf1, dec->fmt_in.i_codec),
++ str_fourcc(buf2, dec->fmt_in.video.i_chroma),
++ str_fourcc(buf2a, in_fcc),
++ dec->fmt_in.video.i_width, dec->fmt_in.video.i_height,
++ dec->fmt_in.video.i_sar_num, dec->fmt_in.video.i_sar_den,
++ r.num, r.den,
++ (int)dec->fmt_in.video.orientation,
++ str_fourcc(buf3, dec->fmt_out.i_codec),
++ str_fourcc(buf4, dec->fmt_out.video.i_chroma),
++ dec->fmt_out.video.i_width, dec->fmt_out.video.i_height,
++ dec->fmt_out.video.i_sar_num, dec->fmt_out.video.i_sar_den,
++ (int)dec->fmt_out.video.orientation);
++ }
++#endif
++
++ if (!is_enc_supported(&supported_decode_in_enc, in_fcc)) // Reject the codec before anything is allocated
++ return VLC_EGENERIC;
++
++ sys = calloc(1, sizeof(decoder_sys_t));
++ if (!sys) {
++ ret = VLC_ENOMEM;
++ goto fail;
++ }
++ dec->p_sys = sys; // Attached early so that the fail path (CloseDecoder) can release partial state
++ vlc_mutex_init(&sys->pic_lock);
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(dec, "VCSM init failed");
++ goto fail;
++ }
++ msg_Info(dec, "VCSM init succeeded: %s", cma_vcsm_init_str(sys->vcsm_init_type));
++
++ sys->err_stream = MMAL_SUCCESS;
++
++ status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_VIDEO_DECODER, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ sys->input = sys->component->input[0]; // Only the first input and output port of the component are used
++ sys->output = sys->component->output[0];
++
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++ sys->input->format->encoding = in_fcc;
++
++ if (!set_and_test_enc_supported(&supported_decode_in_enc, sys->input, in_fcc)) {
++#if TRACE_ALL
++ char cbuf[5];
++ msg_Dbg(dec, "Format not supported: %s", str_fourcc(cbuf, in_fcc));
++#endif
++ goto fail;
++ }
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)dec;
++ status = mmal_port_enable(sys->component->control, control_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((status = set_extradata_and_commit(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ sys->input->buffer_size = sys->input->buffer_size_recommended; // Take the port's recommended sizing as-is
++ sys->input->buffer_num = sys->input->buffer_num_recommended;
++
++ status = mmal_port_enable(sys->input, input_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ // Set vanishingly unlikely shape (or at least crop)
++ // to ensure that we get a resolution changed event
++ // Small wxh are rejected (128x128 is rejected) so pick a
++ // plausible size.
++ // Crop doesn't seem to be checked for being constrained by wxh
++ // so we could place it outside the pic to be sure that it is
++ // never matched but stick with something legal in case it is ever
++ // actually checked
++ sys->output->format->es->video.height = 256;
++ sys->output->format->es->video.width = 256;
++ sys->output->format->es->video.crop.height = 4;
++ sys->output->format->es->video.crop.width = 2;
++ sys->output->format->es->video.crop.x = 66;
++ sys->output->format->es->video.crop.y = 88;
++
++ if ((status = hw_mmal_opaque_output(VLC_OBJECT(dec), &sys->ppr,
++ sys->output, NUM_EXTRA_BUFFERS, decoder_output_cb)) != MMAL_SUCCESS)
++ goto fail;
++
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(dec, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((sys->input_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL) // Payload size 0: headers only
++ {
++ msg_Err(dec, "Failed to create input pool");
++ goto fail;
++ }
++
++ sys->b_flushed = true; // Makes flush_decoder() a no-op until decode() has sent an input buffer
++
++ if ((status = decoder_send_extradata(dec, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ // Given no better ideas at this point copy input format to output
++ // This also copies container stuff (such as orientation) that we do not
++ // decode from the ES but may be important to display
++ video_format_Copy(&dec->fmt_out.video, &dec->fmt_in.video);
++ dec->fmt_out.i_codec = VLC_CODEC_MMAL_OPAQUE;
++ dec->fmt_out.video.i_chroma = VLC_CODEC_MMAL_OPAQUE;
++
++
++ dec->pf_decode = decode;
++ dec->pf_flush = flush_decoder;
++
++#if TRACE_ALL
++ msg_Dbg(dec, ">>> %s: ok", __func__);
++#endif
++ return 0;
++
++fail:
++ CloseDecoder(dec); // Releases sys (if allocated) and everything set up before the failure
++#if TRACE_ALL
++msg_Dbg(dec, ">>> %s: FAIL: ret=%d", __func__, ret);
++#endif
++ return ret;
++}
++
++// ----------------------------
++
++#define CONV_MAX_LATENCY 1 // In frames
++
++typedef struct pic_fifo_s { // Singly linked FIFO of pictures chained through picture_t::p_next
++ picture_t * head; // First queued picture; NULL means the fifo is empty
++ picture_t * tail; // Last queued picture; only consulted while head != NULL
++} pic_fifo_t;
++
++static inline picture_t * pic_fifo_get(pic_fifo_t * const pf) // Pop the oldest picture; returns NULL if the fifo is empty
++{
++ picture_t * const pic = pf->head;
++ if (pic != NULL) {
++ pf->head = pic->p_next;
++ pic->p_next = NULL; // Detach the returned picture from the rest of the chain
++ }
++ return pic;
++}
++
++static inline picture_t * pic_fifo_get_all(pic_fifo_t * const pf) // Take the whole chain (still linked via p_next) and leave the fifo empty
++{
++ picture_t * const pic = pf->head;
++ pf->head = NULL; // tail is left stale; it is only read while head != NULL
++ return pic;
++}
++
++static inline void pic_fifo_release_all(pic_fifo_t * const pf) // Drain the fifo, dropping one reference on every queued picture
++{
++ picture_t * pic;
++ while ((pic = pic_fifo_get(pf)) != NULL) { // pic_fifo_get() returns NULL once the fifo is empty
++ picture_Release(pic);
++ }
++}
++
++static inline void pic_fifo_init(pic_fifo_t * const pf) // Put the fifo into the empty state
++{
++ pf->head = NULL; // head == NULL is what marks the fifo as empty
++ pf->tail = NULL; // Not strictly needed
++}
++
++static inline void pic_fifo_put(pic_fifo_t * const pf, picture_t * pic) // Append pic at the tail of the fifo
++{
++ pic->p_next = NULL; // The new picture becomes the end of the chain
++ if (pf->head == NULL)
++ pf->head = pic; // Empty fifo: pic is both head and tail
++ else
++ pf->tail->p_next = pic;
++ pf->tail = pic;
++}
++
++#define SUBS_MAX 3 // Number of subpicture slots handled per frame
++
++typedef enum filter_resizer_e { // Selects which resizer variant the filter is running as
++ FILTER_RESIZER_RESIZER,
++ FILTER_RESIZER_ISP,
++ FILTER_RESIZER_HVS
++} filter_resizer_t;
++
++typedef struct conv_frame_stash_s // Per-frame data parked while the frame is inside the converter
++{
++ mtime_t pts; // Date of the input picture, saved by conv_filter() and restored by conv_stash_fixup()
++ MMAL_BUFFER_HEADER_T * sub_bufs[SUBS_MAX]; // Subpicture buffers acquired from the input picture (non-HVS path)
++} conv_frame_stash_t;
++
++typedef struct filter_sys_t {
++ filter_resizer_t resizer_type;
++ MMAL_COMPONENT_T *component;
++ MMAL_PORT_T *input;
++ MMAL_PORT_T *output;
++ MMAL_POOL_T *out_pool; // Free output buffers
++ MMAL_POOL_T *in_pool; // Input pool to get BH for replication
++
++ cma_buf_pool_t * cma_in_pool;
++ cma_buf_pool_t * cma_out_pool; // Created on demand by conv_enable_out() when is_cma is set
++
++ subpic_reg_stash_t subs[SUBS_MAX];
++
++ pic_fifo_t ret_pics; // Finished pictures queued by the output callbacks; guarded by lock
++
++ unsigned int pic_n;
++ vlc_sem_t sem; // Posted once per picture queued on ret_pics (and on slice errors)
++ vlc_mutex_t lock; // Protects ret_pics and slice.pics
++
++ MMAL_STATUS_T err_stream; // Set by the control/slice callbacks on error; reset by conv_flush()
++
++ bool needs_copy_in;
++ bool is_cma;
++ bool is_sliced;
++ bool out_fmt_set; // Output port format/pool is set up on the first conv_filter() call
++ const char * component_name;
++ MMAL_PORT_BH_CB_T in_port_cb_fn;
++ MMAL_PORT_BH_CB_T out_port_cb_fn;
++
++ uint64_t frame_seq; // Incremented per input picture; low 4 bits index stash[]
++ conv_frame_stash_t stash[16];
++
++ // Slice specific tracking stuff
++ struct {
++ pic_fifo_t pics;
++ unsigned int line; // Lines filled
++ } slice;
++
++ vcsm_init_type_t vcsm_init_type;
++} filter_sys_t;
++
++
++static MMAL_STATUS_T pic_to_format(MMAL_ES_FORMAT_T * const es_fmt, const picture_t * const pic) // Fill es_fmt so that it describes pic's video format and plane-0 layout
++{
++ unsigned int bpp = (pic->format.i_bits_per_pixel + 7) >> 3; // Bytes per pixel, rounded up
++ MMAL_VIDEO_FORMAT_T * const v_fmt = &es_fmt->es->video;
++
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = vlc_to_mmal_video_fourcc(&pic->format);
++ es_fmt->encoding_variant = 0;
++
++ // Fill in crop etc.
++ hw_mmal_vlc_fmt_to_mmal_fmt(es_fmt, &pic->format);
++ // Override width / height with strides if appropriate
++ if (bpp != 0) { // Skipped when i_bits_per_pixel is 0 (avoids dividing by zero)
++ v_fmt->width = pic->p[0].i_pitch / bpp; // Width in pixels derived from the plane-0 pitch
++ v_fmt->height = pic->p[0].i_lines;
++ }
++ return MMAL_SUCCESS; // No failure path at present
++}
++
++
++static MMAL_STATUS_T conv_enable_in(filter_t * const p_filter, filter_sys_t * const sys) // Enable the input port if it is not already enabled
++{
++ MMAL_STATUS_T err = MMAL_SUCCESS; // Stays MMAL_SUCCESS when the port was already enabled
++
++ if (!sys->input->is_enabled &&
++ (err = mmal_port_enable(sys->input, sys->in_port_cb_fn)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, err, mmal_status_to_string(err));
++ }
++ return err;
++}
++
++static MMAL_STATUS_T conv_enable_out(filter_t * const p_filter, filter_sys_t * const sys) // Make the CMA pool match is_cma, then enable the output port if needed
++{
++ MMAL_STATUS_T err = MMAL_SUCCESS; // Stays MMAL_SUCCESS when the port was already enabled
++
++ if (sys->is_cma)
++ {
++ if (sys->cma_out_pool == NULL && // Create the pool only once; later calls reuse it
++ (sys->cma_out_pool = cma_buf_pool_new(CONVERTER_BUFFERS, CONVERTER_BUFFERS, true, "mmal_resizer")) == NULL)
++ {
++ msg_Err(p_filter, "Failed to alloc cma buf pool");
++ return MMAL_ENOMEM;
++ }
++ }
++ else
++ {
++ cma_buf_pool_deletez(&sys->cma_out_pool); // Not CMA: drop any pool left from earlier
++ }
++
++ if (!sys->output->is_enabled &&
++ (err = mmal_port_enable(sys->output, sys->out_port_cb_fn)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
++ sys->output->name, err, mmal_status_to_string(err));
++ }
++ return err;
++}
++
++static void conv_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) // Control port callback: records MMAL error events for the filter
++{
++ filter_t * const p_filter = (filter_t *)port->userdata; // The port's userdata carries the owning filter
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: <<< cmd=%d, data=%p, pic=%p", __func__, buffer->cmd, buffer->data, buffer->user_data);
++#endif
+
+ if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(dec, "MMAL error %"PRIx32" \"%s\"", status,
++ MMAL_STATUS_T status = *(uint32_t *)buffer->data; // The event payload is read as a 32-bit status code
++
++ p_filter->p_sys->err_stream = status; // Checked by conv_filter() before it processes the next picture
++
++ msg_Err(p_filter, "MMAL error %"PRIx32" \"%s\"", status,
+ mmal_status_to_string(status));
+ }
+
+ mmal_buffer_header_release(buffer);
+ }
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++static void conv_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf) // Input port callback: the only non-trace work is releasing buf
+ {
+- block_t *block = (block_t *)buffer->user_data;
+- decoder_t *dec = (decoder_t *)port->userdata;
+- decoder_sys_t *sys = dec->p_sys;
+- buffer->user_data = NULL;
++#if TRACE_ALL
++ picture_context_t * ctx = buf->user_data;
++// filter_sys_t *const sys = ((filter_t *)port->userdata)->p_sys;
++
++ msg_Dbg((filter_t *)port->userdata, "<<< %s cmd=%d, ctx=%p, buf=%p, flags=%#x, len=%d/%d, pts=%lld",
++ __func__, buf->cmd, ctx, buf, buf->flags, buf->length, buf->alloc_size, (long long)buf->pts);
++#else
++ VLC_UNUSED(port); // port is only referenced by the trace output
++#endif
++
++ mmal_buffer_header_release(buf);
++
++#if TRACE_ALL
++ msg_Dbg((filter_t *)port->userdata, ">>> %s", __func__);
++#endif
++}
++
++static void conv_out_q_pic(filter_sys_t * const sys, picture_t * const pic) // Queue a finished picture on ret_pics and wake the waiter
++{
++ pic->p_next = NULL; // pic_fifo_put() also clears p_next
++
++ vlc_mutex_lock(&sys->lock); // ret_pics is also read by conv_get_out_pics() under this lock
++ pic_fifo_put(&sys->ret_pics, pic);
++ vlc_mutex_unlock(&sys->lock);
+
+- mmal_buffer_header_release(buffer);
+- if (block)
+- block_Release(block);
+- atomic_fetch_sub(&sys->input_in_transit, 1);
+ vlc_sem_post(&sys->sem);
+ }
+
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++static void conv_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf) // Output callback (non-sliced): hand the picture attached to buf to the filter
+ {
+- decoder_t *dec = (decoder_t *)port->userdata;
+- decoder_sys_t *sys = dec->p_sys;
+- picture_t *picture;
+- MMAL_EVENT_FORMAT_CHANGED_T *fmt;
+- MMAL_ES_FORMAT_T *format;
+-
+- if (buffer->cmd == 0) {
+- picture = (picture_t *)buffer->user_data;
+- if (buffer->length > 0) {
+- picture->date = buffer->pts;
+- picture->b_progressive = sys->b_progressive;
+- picture->b_top_field_first = sys->b_top_field_first;
+- decoder_QueueVideo(dec, picture);
+- } else {
+- picture_Release(picture);
+- if (sys->output_pool) {
+- buffer->user_data = NULL;
+- buffer->alloc_size = 0;
+- buffer->data = NULL;
+- mmal_buffer_header_release(buffer);
+- }
+- }
+- atomic_fetch_sub(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- } else if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED) {
+- fmt = mmal_event_format_changed_get(buffer);
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
+
+- format = mmal_format_alloc();
+- mmal_format_full_copy(format, fmt->format);
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d, flags=%#x, pic=%p, data=%p, len=%d/%d, pts=%lld/%lld", __func__,
++ buf->cmd, buf->flags, buf->user_data, buf->data, buf->length, buf->alloc_size,
++ (long long)buf->pts, (long long)sys->stash[(unsigned int)(buf->pts & 0xf)].pts);
++#endif
++ if (buf->cmd == 0) { // cmd == 0: a data buffer; anything else is just released below
++ picture_t * const pic = (picture_t *)buf->user_data; // Attached by conv_filter() before the buffer was sent
+
+- if (sys->opaque)
+- format->encoding = MMAL_ENCODING_OPAQUE;
++ if (pic == NULL) {
++ msg_Err(p_filter, "%s: Buffer has no attached picture", __func__);
++ }
++ else if (buf->data == NULL || buf->length == 0) // Empty buffer: pic stays attached and is freed by the pre-release callback
++ {
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Buffer has no data", __func__);
++#endif
++ }
++ else
++ {
++ buf_to_pic_copy_props(pic, buf);
++
++ // Set pic data pointers from buf aux info now it has it
++ if (sys->is_cma) {
++ if (cma_pic_set_data(pic, sys->output->format, buf) != VLC_SUCCESS)
++ msg_Err(p_filter, "Failed to set data"); // Logged only; the picture is still queued below
++ }
++
++// draw_corners(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 0, 0, pic->p[0].i_visible_pitch / 4, pic->p[0].i_visible_lines);
++#if DEBUG_SQUARES
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 0, 0, 32, 32, 0xffff0000);
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 32, 0, 32, 32, 0xff00ff00);
++ draw_square(pic->p[0].p_pixels, pic->p[0].i_pitch / 4, 64, 0, 32, 32, 0xff0000ff);
++#endif
++
++ buf->user_data = NULL; // Responsibility for this pic no longer with buffer
++ conv_out_q_pic(sys, pic);
++ }
++ }
++
++ mmal_buffer_header_release(buf);
++}
++
++
++static void slice_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf) // Output callback (sliced): copy one slice into the picture at the head of slice.pics
++{
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d, flags=%#x, pic=%p, data=%p, len=%d/%d, pts=%lld", __func__,
++ buf->cmd, buf->flags, buf->user_data, buf->data, buf->length, buf->alloc_size, (long long)buf->pts);
++#endif
++
++ if (buf->cmd != 0) // Not a data buffer: nothing to copy
++ {
++ mmal_buffer_header_release(buf);
++ return;
++ }
++
++ if (buf->data == NULL || buf->length == 0) // Empty buffer: falls through to be re-sent to the port
++ {
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Buffer has no data", __func__);
++#endif
++ }
++ else
++ {
++ // Got slice
++ picture_t *pic = sys->slice.pics.head; // Picture being assembled; read here without taking sys->lock
++ const unsigned int scale_lines = sys->output->format->es->video.height; // Expected lines of callback
++
++ if (pic == NULL) {
++ msg_Err(p_filter, "No output picture");
++ goto fail;
++ }
++
++ // Copy lines
++ // * single plane only - fix for I420
++ {
++ const unsigned int scale_n = __MIN(scale_lines - sys->slice.line, MMAL_SLICE_HEIGHT); // Lines accounted to this slice
++ const unsigned int pic_lines = pic->p[0].i_lines;
++ const unsigned int copy_n = sys->slice.line + scale_n <= pic_lines ? scale_n : // Lines actually copied, clamped to the picture
++ sys->slice.line >= pic_lines ? 0 :
++ pic_lines - sys->slice.line;
++
++ const unsigned int src_stride = buf->type->video.pitch[0];
++ const unsigned int dst_stride = pic->p[0].i_pitch;
++ uint8_t *dst = pic->p[0].p_pixels + sys->slice.line * dst_stride;
++ const uint8_t *src = buf->data + buf->type->video.offset[0];
++
++ if (src_stride == dst_stride) { // Same pitch: one contiguous copy
++ if (copy_n != 0)
++ memcpy(dst, src, src_stride * copy_n);
++ }
++ else { // Different pitch: copy row by row, limited to the narrower stride
++ unsigned int i;
++ for (i = 0; i != copy_n; ++i) {
++ memcpy(dst, src, __MIN(dst_stride, src_stride));
++ dst += dst_stride;
++ src += src_stride;
++ }
++ }
++ sys->slice.line += scale_n; // Advances by scale_n even when fewer lines were copied
++ }
++
++ if ((buf->flags & MMAL_BUFFER_HEADER_FLAG_FRAME_END) != 0 || sys->slice.line >= scale_lines) {
++
++ if ((buf->flags & MMAL_BUFFER_HEADER_FLAG_FRAME_END) == 0 || sys->slice.line != scale_lines) {
++ // Stuff doesn't add up...
++ msg_Err(p_filter, "Line count (%d/%d) & EOF disagree (flags=%#x)", sys->slice.line, scale_lines, buf->flags);
++ goto fail;
++ }
++ else {
++ sys->slice.line = 0; // Frame complete: restart line tracking for the next picture
++
++ vlc_mutex_lock(&sys->lock);
++ pic_fifo_get(&sys->slice.pics); // Remove head from Q
++ vlc_mutex_unlock(&sys->lock);
++
++ buf_to_pic_copy_props(pic, buf);
++ conv_out_q_pic(sys, pic);
++ }
++ }
++ }
++
++ // Put back
++ buf->user_data = NULL; // Zap here to make sure we can't reuse later
++ mmal_buffer_header_reset(buf);
++
++ if (mmal_port_send_buffer(sys->output, buf) != MMAL_SUCCESS) { // Re-queue on the port; release the buffer if that fails
++ mmal_buffer_header_release(buf);
++ }
++ return;
++
++fail: // NOTE(review): buf is neither released nor re-sent on this path - confirm that is intended
++ sys->err_stream = MMAL_EIO;
++ vlc_sem_post(&sys->sem); // If we were waiting then break us out - the flush should fix sem values
++}
++
++
++static void conv_flush(filter_t * p_filter) // Disable both ports and discard all stashed, queued and pending state
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ unsigned int i;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (sys->resizer_type == FILTER_RESIZER_HVS) // Subpicture state in sys->subs is only flushed for the HVS variant
++ {
++ for (i = 0; i != SUBS_MAX; ++i) {
++ hw_mmal_subpic_flush(VLC_OBJECT(p_filter), sys->subs + i);
++ }
++ }
++
++ if (sys->input != NULL && sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output != NULL && sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++// cma_buf_pool_deletez(&sys->cma_out_pool);
++
++ // Free up anything we may have already lying around
++ // Don't need lock as the above disables should have prevented anything
++ // happening in the background
++
++ for (i = 0; i != 16; ++i) { // 16 == number of entries in sys->stash[]
++ conv_frame_stash_t *const stash = sys->stash + i;
++ unsigned int sub_no;
++
++ stash->pts = MMAL_TIME_UNKNOWN;
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if (stash->sub_bufs[sub_no] != NULL) {
++ mmal_buffer_header_release(stash->sub_bufs[sub_no]); // Drop the reference taken in conv_filter()
++ stash->sub_bufs[sub_no] = NULL;
++ }
++ }
++ }
++
++ pic_fifo_release_all(&sys->slice.pics); // Pictures still waiting for slices
++ pic_fifo_release_all(&sys->ret_pics); // Finished pictures that were never collected
++
++ // Reset sem values - easiest & most reliable way is to just kill & re-init
++ vlc_sem_destroy(&sys->sem);
++ vlc_sem_init(&sys->sem, 0);
++ sys->pic_n = 0;
++
++ // Reset error status
++ sys->err_stream = MMAL_SUCCESS;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s", __func__);
++#endif
++}
++
++static void conv_stash_fixup(filter_t * const p_filter, filter_sys_t * const sys, picture_t * const p_pic) // Restore the stashed date on p_pic and drop its stashed subpicture buffers
++{
++ conv_frame_stash_t * const stash = sys->stash + (p_pic->date & 0xf); // presumably date still holds the frame_seq used as buffer pts - confirm
++ unsigned int sub_no;
++ VLC_UNUSED(p_filter);
++
++ p_pic->date = stash->pts; // Swap the sequence number back to the date saved in the stash
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if (stash->sub_bufs[sub_no] != NULL) {
++ // **** Do stashed blend
++ // **** Aaargh, bother... need to rescale subs too
++
++ mmal_buffer_header_release(stash->sub_bufs[sub_no]); // No blend is done yet; the buffer is only released
++ stash->sub_bufs[sub_no] = NULL;
++ }
++ }
++}
++
++// Output buffers may contain a pic ref on error or flush
++// Free it
++static MMAL_BOOL_T out_buffer_pre_release_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++ VLC_UNUSED(userdata);
++
++ picture_t * const pic = header->user_data; // Non-NULL only if the output callback did not take the picture
++ header->user_data = NULL; // Clear so the picture cannot be released twice via this header
++
++ if (pic != NULL)
++ picture_Release(pic);
++
++ return MMAL_FALSE; // NOTE(review): presumably tells MMAL to carry on with the normal release - confirm against the MMAL docs
++}
++
++static MMAL_STATUS_T conv_set_output(filter_t * const p_filter, filter_sys_t * const sys, picture_t * const pic) // Configure, commit and enable the output port; pic may be NULL
++{
++ MMAL_STATUS_T status;
++
++ sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter; // The output callbacks recover the filter from here
++ sys->output->format->type = MMAL_ES_TYPE_VIDEO;
++ sys->output->format->encoding = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
++ sys->output->format->encoding_variant = 0;
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->output->format, &p_filter->fmt_out.video);
++
++ if (pic != NULL)
++ {
++ // Override default format width/height if we have a pic we need to match
++ if ((status = pic_to_format(sys->output->format, pic)) != MMAL_SUCCESS)
++ {
++ char cbuf[5];
++ msg_Err(p_filter, "Bad format desc: %s, pic=%p, bits=%d", str_fourcc(cbuf, pic->format.i_chroma), pic, pic->format.i_bits_per_pixel);
++ return status;
++ }
++
++ MMAL_VIDEO_FORMAT_T *fmt = &sys->output->format->es->video;
++ msg_Dbg(p_filter, "%s: %dx%d [(0,0) %dx%d]", __func__, fmt->width, fmt->height, fmt->crop.width, fmt->crop.height);
++ }
++
++ if (sys->is_sliced) {
++ // Override height for slice
++ sys->output->format->es->video.height = MMAL_SLICE_HEIGHT;
++ }
++
++ mmal_log_dump_format(sys->output->format);
++
++ status = mmal_port_format_commit(sys->output);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to commit format for output port %s (status=%"PRIx32" %s)",
++ sys->output->name, status, mmal_status_to_string(status));
++ return status;
++ }
++
++ sys->output->buffer_num = __MAX(sys->is_sliced ? 16 : 2, sys->output->buffer_num_recommended); // At least 16 buffers when sliced, otherwise at least 2
++ sys->output->buffer_size = sys->output->buffer_size_recommended;
++
++ if ((status = conv_enable_out(p_filter, sys)) != MMAL_SUCCESS) // Also creates or drops the CMA output pool as needed
++ return status;
++
++ return MMAL_SUCCESS;
++}
++
++
++static picture_t *conv_get_out_pics(filter_sys_t * const sys) // Wait on sys->sem, then pop one picture from ret_pics
++{
++ picture_t * ret_pics;
++
++ vlc_sem_wait(&sys->sem); // Posted by conv_out_q_pic() and by the slice callback's error path
++
++ // Return a single pending buffer
++ vlc_mutex_lock(&sys->lock);
++ ret_pics = pic_fifo_get(&sys->ret_pics); // NULL if the wake-up came without a queued picture
++ vlc_mutex_unlock(&sys->lock);
++
++ return ret_pics;
++}
++
++static picture_t *conv_filter(filter_t *p_filter, picture_t *p_pic)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ picture_t * ret_pics = NULL;
++ MMAL_STATUS_T err;
++ const uint64_t frame_seq = ++sys->frame_seq;
++ conv_frame_stash_t * const stash = sys->stash + (frame_seq & 0xf);
++ MMAL_BUFFER_HEADER_T * out_buf = NULL;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(p_filter, "<<< %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s,%dx%d [(%d,%d) %dx%d] sar:%d/%d", __func__,
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++ p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den);
++ }
++#endif
++
++ if (sys->err_stream != MMAL_SUCCESS) {
++ goto stream_fail;
++ }
++
++ // Check pic fmt corresponds to what we have set up
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ msg_Dbg(p_filter, "Reset input port format");
++
++ // HVS can take new formats without disable, others need it
++ if (sys->resizer_type != FILTER_RESIZER_HVS) {
++ // Extract any pending pic
++ if (sys->pic_n >= 2) {
++ ret_pics = conv_get_out_pics(sys);
++ // If pic_n == 1 then we return without trying to get stuff
++ sys->pic_n = 1;
++ }
++ if (sys->input->is_enabled) {
++ if ((err = mmal_port_disable(sys->input)) != MMAL_SUCCESS)
++ msg_Warn(p_filter, "Format update disable failed: %s", mmal_status_to_string(err));
++ }
++ }
++
++// mmal_log_dump_port(sys->input);
++ if ((err = mmal_port_format_commit(sys->input)) != MMAL_SUCCESS)
++ msg_Warn(p_filter, "Format update commit failed: %s", mmal_status_to_string(err));
++
++ // (Re)enable if required will be done later
++ }
++
++ if (p_pic->context == NULL) {
++ // Can't have stashed subpics if not one of our pics
++ if (!sys->needs_copy_in)
++ msg_Dbg(p_filter, "%s: No context", __func__);
++ }
++ else if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ unsigned int sub_no = 0;
++
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ int rv;
++ if ((rv = hw_mmal_subpic_update(VLC_OBJECT(p_filter),
++ hw_mmal_pic_sub_buf_get(p_pic, sub_no),
++ sys->subs + sub_no,
++ &p_pic->format,
++ &sys->output->format->es->video.crop,
++ MMAL_DISPLAY_ROT0,
++ frame_seq)) == 0)
++ break;
++ else if (rv < 0)
++ goto fail;
++ }
++ }
++ else
++ {
++ unsigned int sub_no = 0;
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ if ((stash->sub_bufs[sub_no] = hw_mmal_pic_sub_buf_get(p_pic, sub_no)) != NULL) {
++ mmal_buffer_header_acquire(stash->sub_bufs[sub_no]);
++ }
++ }
++ }
++
++ if (!sys->out_fmt_set) {
++ sys->out_fmt_set = true;
++
++ if (sys->is_sliced) {
++ // If zc then we will do stride conversion when we copy to arm side
++ // so no need to worry about actual pic dimensions here
++ if ((err = conv_set_output(p_filter, sys, NULL)) != MMAL_SUCCESS)
++ goto fail;
++
++ sys->out_pool = mmal_port_pool_create(sys->output, sys->output->buffer_num, sys->output->buffer_size);
++ }
++ else {
++ picture_t *pic = filter_NewPicture(p_filter);
++ err = conv_set_output(p_filter, sys, pic);
++ picture_Release(pic);
++ if (err != MMAL_SUCCESS)
++ goto fail;
++
++ sys->out_pool = mmal_pool_create(sys->output->buffer_num, 0);
++ }
++
++ if (sys->out_pool == NULL) {
++ msg_Err(p_filter, "Failed to create output pool");
++ goto fail;
++ }
++ }
++
++ // Reenable stuff if the last thing we did was flush
++ if ((err = conv_enable_out(p_filter, sys)) != MMAL_SUCCESS ||
++ (err = conv_enable_in(p_filter, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ // We attach pic to buf before stuffing the output port
++ // We could attach the pic on output for cma, but it is a lot easier to keep
++ // the code common.
++ {
++ picture_t * const out_pic = filter_NewPicture(p_filter);
++
++ if (out_pic == NULL)
++ {
++ msg_Err(p_filter, "Failed to alloc required filter output pic");
++ goto fail;
++ }
++
++ out_pic->format.i_sar_den = p_filter->fmt_out.video.i_sar_den;
++ out_pic->format.i_sar_num = p_filter->fmt_out.video.i_sar_num;
++
++ if (sys->is_sliced) {
++ vlc_mutex_lock(&sys->lock);
++ pic_fifo_put(&sys->slice.pics, out_pic);
++ vlc_mutex_unlock(&sys->lock);
++
++ // Poke any returned pic buffers into output
++ // In general this should only happen immediately after enable
++ while ((out_buf = mmal_queue_get(sys->out_pool->queue)) != NULL)
++ mmal_port_send_buffer(sys->output, out_buf);
++ }
++ else
++ {
++ // 1 in - 1 out
++ if ((out_buf = mmal_queue_wait(sys->out_pool->queue)) == NULL)
++ {
++ msg_Err(p_filter, "Failed to get output buffer");
++ picture_Release(out_pic);
++ goto fail;
++ }
++ mmal_buffer_header_reset(out_buf);
++
++ // Attach out_pic to the buffer & ensure it is freed when the buffer is released
++ // On a good send callback the pic will be extracted to avoid this
++ out_buf->user_data = out_pic;
++ mmal_buffer_header_pre_release_cb_set(out_buf, out_buffer_pre_release_cb, NULL);
++
++#if 0
++ {
++ char dbuf0[5];
++ msg_Dbg(p_filter, "out_pic %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ out_pic->format.i_width, out_pic->format.i_height,
++ out_pic->format.i_x_offset, out_pic->format.i_y_offset,
++ out_pic->format.i_visible_width, out_pic->format.i_visible_height,
++ out_pic->format.i_sar_num, out_pic->format.i_sar_den);
++ }
++#endif
++
++ if (sys->is_cma) {
++ int rv;
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(sys->cma_out_pool, sys->output->buffer_size);
++ if (cb == NULL) {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to alloc CMA buf: fmt=%s, size=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ sys->output->buffer_size);
++ goto fail;
++ }
++ const unsigned int vc_h = cma_buf_vc_handle(cb); // Cannot coerce without going via variable
++ out_buf->data = (uint8_t *)vc_h;
++ out_buf->alloc_size = sys->output->buffer_size;
++
++ if ((rv = cma_buf_pic_attach(cb, out_pic)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to attach CMA to pic: fmt=%s err=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ rv);
++ cma_buf_unref(cb);
++ goto fail;
++ }
++ }
++ else {
++ out_buf->data = out_pic->p[0].p_pixels;
++ out_buf->alloc_size = out_pic->p[0].i_pitch * out_pic->p[0].i_lines;
++ //**** stride ????
++ }
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Out buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d, pts=%lld",
++ p_pic, out_buf->data, out_buf->user_data, out_buf->flags,
++ out_buf->length, out_buf->alloc_size, (long long)out_buf->pts);
++#endif
++
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ goto fail;
++ }
++ out_buf = NULL;
++ }
++ }
++
++
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ stash->pts = p_pic->date;
++ {
++ MMAL_BUFFER_HEADER_T *const pic_buf = sys->needs_copy_in ?
++ hw_mmal_pic_buf_copied(p_pic, sys->in_pool, sys->input, sys->cma_in_pool) :
++ hw_mmal_pic_buf_replicated(p_pic, sys->in_pool);
++
++ // Whether or not we extracted the pic_buf we are done with the picture
++ picture_Release(p_pic);
++ p_pic = NULL;
++
++ if (pic_buf == NULL) {
++ msg_Err(p_filter, "Pic has no attached buffer");
++ goto fail;
++ }
++
++ pic_buf->pts = frame_seq;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "In buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d/%d, pts=%lld",
++ p_pic, pic_buf->data, pic_buf->user_data, pic_buf->flags,
++ pic_buf->length, pic_buf->alloc_size, sys->input->buffer_size, (long long)pic_buf->pts);
++#endif
++
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to input failed");
++ mmal_buffer_header_release(pic_buf);
++ goto fail;
++ }
++ }
++
++ // We have a 1 pic latency for everything except the 1st pic which we
++ // wait for.
++ // This means we get a single static pic out
++ if (sys->pic_n++ == 1) {
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: Pic1=%p", __func__, ret_pics);
++#endif
++ return ret_pics;
++ }
++
++ ret_pics = conv_get_out_pics(sys);
++
++ if (sys->err_stream != MMAL_SUCCESS)
++ goto stream_fail;
++
++ conv_stash_fixup(p_filter, sys, ret_pics);
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: pic=%p", __func__, ret_pics);
++#endif
++
++ return ret_pics;
++
++stream_fail:
++ msg_Err(p_filter, "MMAL error reported by callback");
++fail:
++#if TRACE_ALL
++ msg_Err(p_filter, ">>> %s: FAIL", __func__);
++#endif
++ if (ret_pics != NULL)
++ picture_Release(ret_pics);
++ if (out_buf != NULL)
++ mmal_buffer_header_release(out_buf);
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ conv_flush(p_filter);
++ return NULL;
++}
++
++// Tear down a converter instance and free its filter_sys_t.
++// Also used as the common error-cleanup path of OpenConverter, so it must
++// cope with a partially initialised sys (and with no sys at all).
++static void CloseConverter(vlc_object_t * obj)
++{
++    filter_t * const p_filter = (filter_t *)obj;
++    filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++    msg_Dbg(obj, "<<< %s", __func__);
++#endif
++
++    // Never opened, or already closed
++    if (sys == NULL)
++        return;
++
++    // Flushing disables the input & output ports
++    conv_flush(p_filter);
++
++    cma_buf_pool_deletez(&sys->cma_in_pool);
++    cma_buf_pool_deletez(&sys->cma_out_pool);
++
++    // Control port first, then the component itself
++    if (sys->component != NULL)
++    {
++        if (sys->component->control->is_enabled)
++            mmal_port_disable(sys->component->control);
++
++        if (sys->component->is_enabled)
++            mmal_component_disable(sys->component);
++    }
++
++    // Only the HVS variant opens subpicture inputs
++    if (sys->resizer_type == FILTER_RESIZER_HVS)
++    {
++        for (unsigned int n = 0; n != SUBS_MAX; ++n)
++            hw_mmal_subpic_close(VLC_OBJECT(p_filter), &sys->subs[n]);
++    }
++
++    // The sliced output pool was created against the port, so it has to be
++    // destroyed through the port as well
++    if (sys->out_pool != NULL)
++    {
++        if (sys->is_sliced)
++            mmal_port_pool_destroy(sys->output, sys->out_pool);
++        else
++            mmal_pool_destroy(sys->out_pool);
++    }
++
++    if (sys->in_pool != NULL)
++        mmal_pool_destroy(sys->in_pool);
++
++    if (sys->component != NULL)
++        mmal_component_release(sys->component);
++
++    cma_vcsm_exit(sys->vcsm_init_type);
++
++    vlc_sem_destroy(&sys->sem);
++    vlc_mutex_destroy(&sys->lock);
++
++    // Clear the back-pointer before freeing so a second Close is a no-op
++    p_filter->p_sys = NULL;
++    free(sys);
++}
++
++
++// Map the filter's input format to the MMAL encoding fed to the input port.
++// Returns 0 if the format cannot be used as converter input.
++static inline MMAL_FOURCC_T filter_enc_in(const video_format_t * const fmt)
++{
++    const vlc_fourcc_t chroma = fmt->i_chroma;
++
++    // MMAL-native chromas translate directly
++    if (hw_mmal_chroma_is_mmal(chroma))
++        return vlc_to_mmal_video_fourcc(fmt);
++
++    // Of the non-MMAL chromas only the two I420 flavours are accepted,
++    // and both are presented to MMAL as I420
++    switch (chroma)
++    {
++        case VLC_CODEC_I420:
++        case VLC_CODEC_I420_10L:
++            return MMAL_ENCODING_I420;
++        default:
++            return 0;
++    }
++}
++
++// Map the filter's output format to the MMAL encoding requested from the
++// output port. Returns 0 if the format cannot be produced.
++static inline MMAL_FOURCC_T filter_enc_out(const video_format_t * const fmt)
++{
++    const MMAL_FOURCC_T mmes = vlc_to_mmal_video_fourcc(fmt);
++
++    // Can only copy out single plane stuff currently - this could be fixed!
++    // So a non-MMAL chroma that maps to (multi-plane) I420 is rejected
++    if (!hw_mmal_chroma_is_mmal(fmt->i_chroma) && mmes == MMAL_ENCODING_I420)
++        return 0;
++
++    return mmes;
++}
++
++
++// Open the MMAL video converter filter.
++//
++// Picks one of three MMAL components (resizer, ISP or HVS) based on the
++// user options and the in/out encodings, creates and configures it and
++// installs conv_filter / conv_flush as the filter callbacks.
++// If the HVS component cannot be created the setup is retried with the ISP;
++// if any MMAL step fails with MMAL_ENOMEM it is retried with the resizer.
++//
++// Returns VLC_SUCCESS, VLC_ENOMEM if the sys struct cannot be allocated, or
++// VLC_EGENERIC for unsupported formats and all other failures.
++static int OpenConverter(vlc_object_t * obj)
++{
++ filter_t * const p_filter = (filter_t *)obj;
++ int ret = VLC_EGENERIC;
++ filter_sys_t *sys;
++ // Initialised so that the ENOMEM test at "fail:" never reads an
++ // indeterminate value when we bail out before the first MMAL call
++ // (calloc or VCSM init failure)
++ MMAL_STATUS_T status = MMAL_SUCCESS;
++ MMAL_FOURCC_T enc_out = filter_enc_out(&p_filter->fmt_out.video);
++ const MMAL_FOURCC_T enc_in = filter_enc_in(&p_filter->fmt_in.video);
++ bool use_resizer;
++ bool use_isp;
++ int gpu_mem;
++
++ // At least in principle we should deal with any mmal format as input
++ if (enc_in == 0 || enc_out == 0)
++ return VLC_EGENERIC;
++
++ // Can't transform
++ if (p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation)
++ return VLC_EGENERIC;
++
++ use_resizer = var_InheritBool(p_filter, MMAL_RESIZE_NAME);
++ use_isp = var_InheritBool(p_filter, MMAL_ISP_NAME);
++
++retry:
++ // ** Make more generic by checking supported encs
++ //
++ // Must use ISP - HVS can't do this, nor can resizer
++ if (enc_in == MMAL_ENCODING_YUVUV64_10) {
++ // If resizer selected then just give up
++ if (use_resizer)
++ return VLC_EGENERIC;
++ // otherwise downgrade HVS to ISP
++ use_isp = true;
++ }
++ // HVS can't do I420
++ if (enc_out == MMAL_ENCODING_I420) {
++ use_isp = true;
++ }
++ // Only HVS can deal with SAND30
++ if (enc_in == MMAL_ENCODING_YUV10_COL) {
++ if (use_isp || use_resizer)
++ return VLC_EGENERIC;
++ }
+
+- sys->output_format = format;
+
+- mmal_buffer_header_release(buffer);
++ if (use_resizer) {
++ // use resizer overrides use_isp
++ use_isp = false;
++ }
++
++ // Check we have a sliced version of the fourcc if we want the resizer
++ if (use_resizer &&
++ (enc_out = pic_to_slice_mmal_fourcc(enc_out)) == 0) {
++ return VLC_EGENERIC;
++ }
++
++ gpu_mem = hw_mmal_get_gpu_mem();
++
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5], dbuf3[5];
++ msg_Dbg(p_filter, "%s: (%s) %s/%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s/%s,%dx%d [(%d,%d) %dx%d] rgb:%#x:%#x:%#x sar:%d/%d (gpu=%d)", __func__,
++ use_resizer ? "resize" : use_isp ? "isp" : "hvs",
++ str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), str_fourcc(dbuf2, enc_in),
++ p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++ p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++ p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++ p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++ str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), str_fourcc(dbuf3, enc_out),
++ p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++ p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++ p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++ p_filter->fmt_out.video.i_rmask, p_filter->fmt_out.video.i_gmask, p_filter->fmt_out.video.i_bmask,
++ p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den,
++ gpu_mem);
++ }
++
++ sys = calloc(1, sizeof(filter_sys_t));
++ if (!sys) {
++ ret = VLC_ENOMEM;
++ goto fail;
++ }
++ p_filter->p_sys = sys;
++
++ // Init the stuff that we destroy unconditionally in Close first
++ vlc_mutex_init(&sys->lock);
++ vlc_sem_init(&sys->sem, 0);
++ sys->err_stream = MMAL_SUCCESS;
++ pic_fifo_init(&sys->ret_pics);
++ pic_fifo_init(&sys->slice.pics);
++
++ sys->needs_copy_in = !hw_mmal_chroma_is_mmal(p_filter->fmt_in.video.i_chroma);
++ sys->in_port_cb_fn = conv_input_port_cb;
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(p_filter, "VCSM init failed");
++ goto fail;
++ }
++
++ if (use_resizer) {
++ sys->resizer_type = FILTER_RESIZER_RESIZER;
++ sys->is_sliced = true;
++ sys->component_name = MMAL_COMPONENT_DEFAULT_RESIZER;
++ sys->out_port_cb_fn = slice_output_port_cb;
++ }
++ else if (use_isp) {
++ sys->resizer_type = FILTER_RESIZER_ISP;
++ sys->is_sliced = false; // Copy directly into filter picture
++ sys->component_name = MMAL_COMPONENT_ISP_RESIZER;
++ sys->out_port_cb_fn = conv_output_port_cb;
+ } else {
+- mmal_buffer_header_release(buffer);
++ sys->resizer_type = FILTER_RESIZER_HVS;
++ sys->is_sliced = false; // Copy directly into filter picture
++ sys->component_name = MMAL_COMPONENT_HVS;
++ sys->out_port_cb_fn = conv_output_port_cb;
++ }
++ sys->is_cma = is_cma_buf_pic_chroma(p_filter->fmt_out.video.i_chroma);
++
++ status = mmal_component_create(sys->component_name, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ if (!use_isp && !use_resizer) {
++ msg_Warn(p_filter, "Failed to create HVS resizer - retrying with ISP");
++ CloseConverter(obj);
++ use_isp = true;
++ goto retry;
++ }
++ // Report the component we actually asked for
++ msg_Err(p_filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ sys->component_name, status, mmal_status_to_string(status));
++ goto fail;
+ }
++ sys->output = sys->component->output[0];
++ sys->input = sys->component->input[0];
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter;
++ status = mmal_port_enable(sys->component->control, conv_control_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ // A CMA pool is only needed when the input pic has to be copied in
++ if (sys->needs_copy_in &&
++ (sys->cma_in_pool = cma_buf_pool_new(2, 2, true, "conv-copy-in")) == NULL)
++ {
++ msg_Err(p_filter, "Failed to allocate input CMA pool");
++ goto fail;
++ }
++
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)p_filter;
++ sys->input->format->type = MMAL_ES_TYPE_VIDEO;
++ sys->input->format->encoding = enc_in;
++ sys->input->format->encoding_variant = MMAL_ENCODING_I420;
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->input->format, &p_filter->fmt_in.video);
++ port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, 1);
++
++ mmal_log_dump_format(sys->input->format);
++
++ status = mmal_port_format_commit(sys->input);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ sys->input->buffer_num = NUM_DECODER_BUFFER_HEADERS;
++
++ if ((status = conv_enable_in(p_filter, sys)) != MMAL_SUCCESS)
++ goto fail;
++
++ port_parameter_set_bool(sys->output, MMAL_PARAMETER_ZERO_COPY, sys->is_sliced || sys->is_cma);
++
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(p_filter, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((sys->in_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(p_filter, "Failed to create input pool");
++ goto fail;
++ }
++
++ // Subpictures go to the extra inputs (1..SUBS_MAX) of the HVS component
++ if (sys->resizer_type == FILTER_RESIZER_HVS)
++ {
++ unsigned int i;
++ for (i = 0; i != SUBS_MAX; ++i) {
++ if (hw_mmal_subpic_open(VLC_OBJECT(p_filter), sys->subs + i, sys->component->input[i + 1], -1, i + 1) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to open subpic %u", i);
++ goto fail;
++ }
++ }
++ }
++
++ p_filter->pf_video_filter = conv_filter;
++ p_filter->pf_flush = conv_flush;
++ // video_drain NIF in filter structure
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: ok", __func__);
++#endif
++
++ return VLC_SUCCESS;
++
++fail:
++ CloseConverter(obj);
++
++ // HVS/ISP ran out of memory: have one more go with the resizer
++ if (!use_resizer && status == MMAL_ENOMEM) {
++ use_resizer = true;
++ msg_Warn(p_filter, "Lack of memory to use HVS/ISP: trying resizer");
++ goto retry;
++ }
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: FAIL: %d", __func__, ret);
++#endif
++ return ret;
++}
++
++#if OPT_TO_FROM_ZC
++//----------------------------------------------------------------------------
++//
++// Simple copy in to ZC
++
++// Private state of the "to ZC" copy-in converter
++typedef struct to_zc_sys_s {
++ vcsm_init_type_t vcsm_init_type; // Result of cma_vcsm_init(), handed back to cma_vcsm_exit() on close
++ cma_buf_pool_t * cma_out_pool; // Pool that the output picture buffers are allocated from
++} to_zc_sys_t;
++
++
++// Number of bytes needed to hold a width x height frame of the given ZC
++// chroma. Returns 0 for any chroma that is not handled here.
++static size_t buf_alloc_size(const vlc_fourcc_t i_chroma, const unsigned int width, const unsigned int height)
++{
++    const unsigned int pels = width * height;
++
++    // 4 bytes per pel
++    if (i_chroma == VLC_CODEC_MMAL_ZC_RGB32)
++        return pels * 4;
++
++    // 1.5 bytes per pel
++    if (i_chroma == VLC_CODEC_MMAL_ZC_I420)
++        return pels * 3 / 2;
++
++    return 0;
++}
++
++
++// Copy in_pic into a freshly allocated CMA-backed output picture.
++// Always consumes (releases) in_pic. Returns the new picture, or NULL if the
++// output picture or its CMA buffer could not be set up.
++static picture_t *
++to_zc_filter(filter_t *p_filter, picture_t *in_pic)
++{
++    to_zc_sys_t * const sys = (to_zc_sys_t *)p_filter->p_sys;
++    picture_t * result = NULL;
++#if TRACE_ALL
++    msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++    assert(p_filter->fmt_out.video.i_chroma == VLC_CODEC_MMAL_ZC_I420);
++
++    picture_t * const out_pic = filter_NewPicture(p_filter);
++    if (out_pic != NULL)
++    {
++        // Describe the output layout in MMAL terms
++        MMAL_ES_SPECIFIC_FORMAT_T mm_vfmt = {.video={0}};
++        MMAL_ES_FORMAT_T mm_esfmt = {
++            .encoding = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video),
++            .es = &mm_vfmt};
++
++        hw_mmal_vlc_fmt_to_mmal_fmt(&mm_esfmt, &p_filter->fmt_out.video);
++
++        const size_t buf_alloc = buf_alloc_size(p_filter->fmt_out.video.i_chroma,
++                                                mm_vfmt.video.width, mm_vfmt.video.height);
++
++        // A zero size means an unsupported chroma - don't even try to alloc
++        cma_buf_t * const cb = (buf_alloc == 0) ? NULL :
++            cma_buf_pool_alloc_buf(sys->cma_out_pool, buf_alloc);
++
++        if (cb != NULL)
++        {
++            if (cma_buf_pic_attach(cb, out_pic) == VLC_SUCCESS)
++            {
++                cma_pic_set_data(out_pic, &mm_esfmt, NULL);
++
++                hw_mmal_copy_pic_to_buf(cma_buf_addr(cb), NULL, &mm_esfmt, in_pic);
++
++                // Copy pic properties
++                out_pic->date = in_pic->date;
++                out_pic->b_force = in_pic->b_force;
++                out_pic->b_progressive = in_pic->b_progressive;
++                out_pic->b_top_field_first = in_pic->b_top_field_first;
++                out_pic->i_nb_fields = in_pic->i_nb_fields;
++
++                result = out_pic;
++            }
++            else
++            {
++                // Attach failed so the buffer is still ours to drop
++                cma_buf_unref(cb);
++            }
++        }
++
++        if (result == NULL)
++            picture_Release(out_pic);
++    }
++
++    // Success or failure we are done with the input
++    picture_Release(in_pic);
++    return result;
++}
++
++// Flush callback for the "to ZC" converter.
++// Does nothing with p_filter; it only marks the parameter as unused.
++static void to_zc_flush(filter_t * p_filter)
++{
++ VLC_UNUSED(p_filter);
+ }
++
++// Release everything owned by a "to ZC" converter.
++// Safe to call when the filter has no sys attached.
++static void CloseConverterToZc(vlc_object_t * obj)
++{
++    filter_t * const p_filter = (filter_t *)obj;
++    to_zc_sys_t * const zc = (to_zc_sys_t *)p_filter->p_sys;
++
++    if (zc != NULL)
++    {
++        // Detach from the filter first, then tear down
++        p_filter->p_sys = NULL;
++
++        cma_buf_pool_deletez(&zc->cma_out_pool);
++        cma_vcsm_exit(zc->vcsm_init_type);
++
++        free(zc);
++    }
++}
++
++// True iff the "to ZC" converter can handle this in/out format pair:
++// I420 (8 or 10 bit) in, ZC I420 out, with identical width and height.
++static bool to_zc_validate_fmt(const video_format_t * const f_in, const video_format_t * const f_out)
++{
++    const bool chroma_ok =
++        (f_in->i_chroma == VLC_CODEC_I420 || f_in->i_chroma == VLC_CODEC_I420_10L) &&
++        f_out->i_chroma == VLC_CODEC_MMAL_ZC_I420;
++
++    // This is a pure copy - no scaling
++    const bool size_ok =
++        f_in->i_height == f_out->i_height &&
++        f_in->i_width == f_out->i_width;
++
++    return chroma_ok && size_ok;
++}
++
++// Open the "to ZC" converter: a plain copy of I420 pictures into CMA-backed
++// zero-copy pictures.
++//
++// Returns VLC_SUCCESS, VLC_ENOMEM if the sys struct cannot be allocated, or
++// VLC_EGENERIC if the format pair is unsupported or VCSM / the CMA pool
++// cannot be set up.
++static int OpenConverterToZc(vlc_object_t * obj)
++{
++    int ret = VLC_EGENERIC;
++    filter_t * const p_filter = (filter_t *)obj;
++
++    // Nothing has been set up yet so there is nothing to clean up
++    if (!to_zc_validate_fmt(&p_filter->fmt_in.video, &p_filter->fmt_out.video))
++        return ret;
++
++    {
++        char dbuf0[5], dbuf1[5];
++        msg_Dbg(p_filter, "%s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d->%s,%dx%d [(%d,%d) %dx%d] rgb:%#x:%#x:%#x sar:%d/%d", __func__,
++                str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma),
++                p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++                p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++                p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++                p_filter->fmt_in.video.i_sar_num, p_filter->fmt_in.video.i_sar_den,
++                str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma),
++                p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++                p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++                p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height,
++                p_filter->fmt_out.video.i_rmask, p_filter->fmt_out.video.i_gmask, p_filter->fmt_out.video.i_bmask,
++                p_filter->fmt_out.video.i_sar_num, p_filter->fmt_out.video.i_sar_den);
++    }
++
++    to_zc_sys_t * const sys = calloc(1, sizeof(*sys));
++    if (!sys) {
++        ret = VLC_ENOMEM;
++        goto fail;
++    }
++    p_filter->p_sys = (filter_sys_t *)sys;
++
++    if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++        msg_Err(p_filter, "VCSM init failed");
++        goto fail;
++    }
++
++    // This pool backs the pictures we hand downstream
++    if ((sys->cma_out_pool = cma_buf_pool_new(5, 5, true, "conv-to-zc")) == NULL)
++    {
++        msg_Err(p_filter, "Failed to allocate output CMA pool");
++        goto fail;
++    }
++
++    p_filter->pf_video_filter = to_zc_filter;
++    p_filter->pf_flush = to_zc_flush;
++    return VLC_SUCCESS;
++
++fail:
++    // Close copes with a missing or partially initialised sys
++    CloseConverterToZc(obj);
++    return ret;
++}
++
++//----------------------------------------------------------------------------
++//
++// Simple "copy" from ZC
++
++// Close callback for the "from ZC" converter.
++// Does nothing: the matching open function below never succeeds.
++static void CloseConverterFromZc(vlc_object_t * obj)
++{
++ VLC_UNUSED(obj);
++}
++
++// Open callback for the "from ZC" converter.
++// Placeholder: always refuses with VLC_EGENERIC so the module is never
++// selected.
++static int OpenConverterFromZc(vlc_object_t * obj)
++{
++    // Same unused-parameter marking as CloseConverterFromZc
++    VLC_UNUSED(obj);
++    return VLC_EGENERIC;
++}
++#endif
++//----------------------------------------------------------------------------
++
++// Private state of the MMAL blend (subpicture attach) filter
++typedef struct blend_sys_s {
++ vzc_pool_ctl_t * vzc; // Pool that subpicture buffers are obtained from
++ const picture_t * last_dst; // Not a ref, just a hint that we have a new pic
++ vcsm_init_type_t vcsm_init_type; // Result of cma_vcsm_init(), handed back to cma_vcsm_exit() on close
++} blend_sys_t;
++
++// "Blend" src onto an MMAL dst picture.
++// No pixels are touched here: the subpicture is wrapped in a buffer and
++// attached to dst via hw_mmal_pic_sub_buf_add.
++static void FilterBlendMmal(filter_t *p_filter,
++                  picture_t *dst, const picture_t * src,
++                  int x_offset, int y_offset, int alpha)
++{
++    blend_sys_t * const sys = (blend_sys_t *)p_filter->p_sys;
++#if TRACE_ALL
++    msg_Dbg(p_filter, "%s (%d,%d:%d) pic=%p, pts=%lld, force=%d", __func__, x_offset, y_offset, alpha, src, src->date, src->b_force);
++#endif
++    // Fully transparent or empty subpic - nothing to attach
++    if (alpha == 0)
++        return;
++    if (src->format.i_visible_height == 0 || src->format.i_visible_width == 0)
++        return;
++
++    if (dst->context == NULL)
++    {
++        msg_Err(p_filter, "MMAL pic missing context");
++        return;
++    }
++
++    // Flag passed to the vzc code: dst differs from the one we saw last
++    // time, or it has no sub bufs attached yet
++    const bool is_first = dst != sys->last_dst || !hw_mmal_pic_has_sub_bufs(dst);
++
++    // cast away src const so we can ref it
++    MMAL_BUFFER_HEADER_T * const sub_buf =
++        hw_mmal_vzc_buf_from_pic(sys->vzc, (picture_t *)src,
++                                 vis_mmal_rect(&dst->format),
++                                 x_offset, y_offset,
++                                 alpha,
++                                 is_first);
++    if (sub_buf == NULL)
++    {
++        msg_Err(p_filter, "Failed to allocate vzc buffer for subpic");
++        return;
++    }
++
++    hw_mmal_pic_sub_buf_add(dst, sub_buf);
++
++    sys->last_dst = dst;
++}
++
++// Flush callback: forget the destination picture hint and flush the
++// subpicture buffer pool.
++static void FlushBlendMmal(filter_t * p_filter)
++{
++    blend_sys_t * const blend = (blend_sys_t *)p_filter->p_sys;
++
++    blend->last_dst = NULL;
++    hw_mmal_vzc_pool_flush(blend->vzc);
++}
++
++// Release everything owned by the MMAL blend filter.
++// Safe to call when the filter has no sys attached.
++static void CloseBlendMmal(vlc_object_t *object)
++{
++    filter_t * const p_filter = (filter_t *)object;
++    blend_sys_t * const blend = (blend_sys_t *)p_filter->p_sys;
++
++    if (blend == NULL)
++        return;
++
++    // Detach from the filter first, then tear down
++    p_filter->p_sys = NULL;
++
++    hw_mmal_vzc_pool_release(blend->vzc);
++    cma_vcsm_exit(blend->vcsm_init_type);
++    free(blend);
++}
++
++// Open the MMAL blend filter.
++// Accepts an MMAL destination chroma together with a subpicture format that
++// hw_mmal_vzc_subpic_fmt_valid approves.
++//
++// Returns VLC_SUCCESS, VLC_ENOMEM if an allocation fails, or VLC_EGENERIC
++// if the formats are unsupported or VCSM cannot be initialised.
++static int OpenBlendMmal(vlc_object_t *object)
++{
++    filter_t * const p_filter = (filter_t *)object;
++    const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
++    int ret = VLC_EGENERIC;
++
++    if (!hw_mmal_chroma_is_mmal(vfcc_dst) ||
++        !hw_mmal_vzc_subpic_fmt_valid(&p_filter->fmt_in.video))
++    {
++        return VLC_EGENERIC;
++    }
++
++    {
++        char dbuf0[5], dbuf1[5];
++        msg_Dbg(p_filter, "%s: (%s) %s,%dx%d [(%d,%d) %dx%d]->%s,%dx%d [(%d,%d) %dx%d]", __func__,
++                "blend",
++                str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma), p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++                p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++                p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++                str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma), p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++                p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++                p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height);
++    }
++
++    {
++        blend_sys_t * const sys = calloc(1, sizeof (*sys));
++        if (sys == NULL)
++            return VLC_ENOMEM;
++
++        p_filter->p_sys = (filter_sys_t *)sys;
++
++        // Not a memory problem, so report it as a generic failure like the
++        // converter open functions do
++        if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++            msg_Err(p_filter, "VCSM init failed");
++            goto fail;
++        }
++
++        if ((sys->vzc = hw_mmal_vzc_pool_new()) == NULL) {
++            ret = VLC_ENOMEM;
++            goto fail;
++        }
++    }
++
++    p_filter->pf_video_blend = FilterBlendMmal;
++    p_filter->pf_flush = FlushBlendMmal;
++
++    return VLC_SUCCESS;
++
++fail:
++    CloseBlendMmal(VLC_OBJECT(p_filter));
++    return ret;
++}
++
++// ---------------------------------------------------------------------------
++
++// Software blend of src_pic onto dst_pic using the blend function that
++// OpenBlendNeon stored in p_sys.
++//
++// x_offset / y_offset give the position of the source's visible area
++// relative to the destination's visible area. The blended rectangle is
++// clipped on all four sides to the destination's visible area; nothing is
++// done if the clipped rectangle is empty or alpha is 0.
++// Only plane 0 of each picture is used.
++static void FilterBlendNeon(filter_t *p_filter,
++                  picture_t *dst_pic, const picture_t * src_pic,
++                  int x_offset, int y_offset, int alpha)
++{
++    const uint8_t * s_data;
++    uint8_t * d_data;
++    int width = src_pic->format.i_visible_width;
++    int height = src_pic->format.i_visible_height;
++    // First source pel to use - moves right/down if we clip on the left/top
++    int src_x = src_pic->format.i_x_offset;
++    int src_y = src_pic->format.i_y_offset;
++    blend_neon_fn *const blend_fn = (blend_neon_fn * )p_filter->p_sys;
++
++#if TRACE_ALL
++    msg_Dbg(p_filter, "%s (%d,%d:%d) pic=%p, pts=%lld, force=%d", __func__, x_offset, y_offset, alpha, src_pic, src_pic->date, src_pic->b_force);
++#endif
++
++    if (alpha == 0 ||
++        src_pic->format.i_visible_height == 0 ||
++        src_pic->format.i_visible_width == 0)
++    {
++        return;
++    }
++
++    // Deal with L/U overrun
++    // A negative offset would otherwise have us writing before the start
++    // of the visible area (and possibly before the start of the buffer).
++    // The size is reduced and tested before the source start is advanced so
++    // that the negation below cannot overflow.
++    if (x_offset < 0) {
++        width += x_offset;
++        if (width <= 0)
++            return;
++        src_x -= x_offset;
++        x_offset = 0;
++    }
++    if (y_offset < 0) {
++        height += y_offset;
++        if (height <= 0)
++            return;
++        src_y -= y_offset;
++        y_offset = 0;
++    }
++
++    x_offset += dst_pic->format.i_x_offset;
++    y_offset += dst_pic->format.i_y_offset;
++
++    // Deal with R/B overrun
++    if (x_offset + width >= (int)(dst_pic->format.i_x_offset + dst_pic->format.i_visible_width))
++        width = dst_pic->format.i_x_offset + dst_pic->format.i_visible_width - x_offset;
++    if (y_offset + height >= (int)(dst_pic->format.i_y_offset + dst_pic->format.i_visible_height))
++        height = dst_pic->format.i_y_offset + dst_pic->format.i_visible_height - y_offset;
++
++    if (width <= 0 || height <= 0) {
++        return;
++    }
++
++    s_data = src_pic->p[0].p_pixels +
++        src_pic->p[0].i_pixel_pitch * src_x +
++        src_pic->p[0].i_pitch * src_y;
++    d_data = dst_pic->p[0].p_pixels +
++        dst_pic->p[0].i_pixel_pitch * x_offset +
++        dst_pic->p[0].i_pitch * y_offset;
++
++    // One call per line; height is known to be >= 1 here
++    do {
++        blend_fn(d_data, s_data, alpha, width);
++        s_data += src_pic->p[0].i_pitch;
++        d_data += dst_pic->p[0].i_pitch;
++    } while (--height > 0);
++}
++
++// Close callback for the neon blend filter.
++// Nothing to release: OpenBlendNeon only stores a function pointer in p_sys.
++static void CloseBlendNeon(vlc_object_t *object)
++{
++ VLC_UNUSED(object);
++}
++
++// Open the neon (software) blend filter.
++// Destination must be VLC RGB32; source and destination must both map to
++// MMAL RGBA or BGRA. The chosen blend routine is stored directly in p_sys.
++// Returns VLC_SUCCESS or VLC_EGENERIC if the format pair is unsupported.
++static int OpenBlendNeon(vlc_object_t *object)
++{
++    filter_t * const p_filter = (filter_t *)object;
++    const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
++    MMAL_FOURCC_T mfcc_src = vlc_to_mmal_video_fourcc(&p_filter->fmt_in.video);
++    MMAL_FOURCC_T mfcc_dst = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
++    blend_neon_fn * blend_fn;
++
++    // Non-alpha RGB only for dest
++    if (vfcc_dst != VLC_CODEC_RGB32)
++        return VLC_EGENERIC;
++
++    // Check we have appropriate blend fn (mmal doesn't have a non-alpha RGB32)
++    // Both ends must be one of the two 32-bit RGB orderings
++    if (mfcc_src != MMAL_ENCODING_RGBA && mfcc_src != MMAL_ENCODING_BGRA)
++        return VLC_EGENERIC;
++    if (mfcc_dst != MMAL_ENCODING_RGBA && mfcc_dst != MMAL_ENCODING_BGRA)
++        return VLC_EGENERIC;
++
++    // Same component order on both sides -> straight blend,
++    // otherwise the variant that swaps R & B
++    if (mfcc_src == mfcc_dst)
++        blend_fn = blend_rgbx_rgba_neon;
++    else
++        blend_fn = blend_bgrx_rgba_neon;
++
++    if (blend_fn == (blend_neon_fn *)0)
++        return VLC_EGENERIC;
++
++    p_filter->p_sys = (void *)blend_fn;
++    p_filter->pf_video_blend = FilterBlendNeon;
++
++    {
++        char dbuf0[5], dbuf1[5];
++        char dbuf0a[5], dbuf1a[5];
++        msg_Dbg(p_filter, "%s: (%s) %s/%s,%dx%d [(%d,%d) %dx%d]->%s/%s,%dx%d [(%d,%d) %dx%d]", __func__,
++                "blend",
++                str_fourcc(dbuf0, p_filter->fmt_in.video.i_chroma),
++                str_fourcc(dbuf0a, mfcc_src),
++                p_filter->fmt_in.video.i_width, p_filter->fmt_in.video.i_height,
++                p_filter->fmt_in.video.i_x_offset, p_filter->fmt_in.video.i_y_offset,
++                p_filter->fmt_in.video.i_visible_width, p_filter->fmt_in.video.i_visible_height,
++                str_fourcc(dbuf1, p_filter->fmt_out.video.i_chroma),
++                str_fourcc(dbuf1a, mfcc_dst),
++                p_filter->fmt_out.video.i_width, p_filter->fmt_out.video.i_height,
++                p_filter->fmt_out.video.i_x_offset, p_filter->fmt_out.video.i_y_offset,
++                p_filter->fmt_out.video.i_visible_width, p_filter->fmt_out.video.i_visible_height);
++    }
++
++    return VLC_SUCCESS;
++}
++
++// Module descriptor. The main module is the MMAL decoder; each
++// add_submodule() below starts a new entry and the set_* / add_* lines that
++// follow it apply to that entry.
++vlc_module_begin()
++ // Main module: video decoder
++ set_category( CAT_INPUT )
++ set_subcategory( SUBCAT_INPUT_VCODEC )
++ set_shortname(N_("MMAL decoder"))
++ set_description(N_("MMAL-based decoder plugin for Raspberry Pi"))
++ set_capability("video decoder", 90)
++ add_shortcut("mmal_decoder")
++ add_bool(MMAL_OPAQUE_NAME, true, MMAL_OPAQUE_TEXT, MMAL_OPAQUE_LONGTEXT, false)
++ set_callbacks(OpenDecoder, CloseDecoder)
++
++ // Resizing converter (resizer / ISP / HVS chosen in OpenConverter)
++ add_submodule()
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL resizer"))
++ set_description(N_("MMAL resizing conversion filter"))
++ add_shortcut("mmal_converter")
++ set_capability( "video converter", 900 )
++ add_bool(MMAL_RESIZE_NAME, /* default */ false, MMAL_RESIZE_TEXT, MMAL_RESIZE_LONGTEXT, /* advanced option */ false)
++ add_bool(MMAL_ISP_NAME, /* default */ false, MMAL_ISP_TEXT, MMAL_ISP_LONGTEXT, /* advanced option */ false)
++ set_callbacks(OpenConverter, CloseConverter)
++
++ // Plain copy converters, only built when OPT_TO_FROM_ZC is set
++#if OPT_TO_FROM_ZC
++ add_submodule()
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL to ZC"))
++ set_description(N_("MMAL conversion to ZC filter"))
++ add_shortcut("mmal_to_zc")
++ set_capability( "video converter", 901 )
++ set_callbacks(OpenConverterToZc, CloseConverterToZc)
++
++ add_submodule()
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_shortname(N_("MMAL from ZC"))
++ set_description(N_("MMAL conversion from ZC filter"))
++ add_shortcut("mmal_from_zc")
++ set_capability( "video converter", 902 )
++ set_callbacks(OpenConverterFromZc, CloseConverterFromZc)
++#endif
++
++ // Subpicture blend onto MMAL pictures (attaches buffers, no pixel work)
++ add_submodule()
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_description(N_("Video pictures blending for MMAL"))
++ add_shortcut("mmal_blend")
++ set_capability("video blending", 120)
++ set_callbacks(OpenBlendMmal, CloseBlendMmal)
++
++ // Software subpicture blend onto RGB32 pictures
++ add_submodule()
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VFILTER )
++ set_description(N_("Video pictures blending for neon"))
++ add_shortcut("neon_blend")
++ set_capability("video blending", 110)
++ set_callbacks(OpenBlendNeon, CloseBlendNeon)
++
++vlc_module_end()
++
++
+--- /dev/null
++++ b/modules/hw/mmal/converter_mmal.c
+@@ -0,0 +1,479 @@
++#ifdef HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++
++#include <libdrm/drm_fourcc.h>
++#include <EGL/egl.h>
++#include <EGL/eglext.h>
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
++
++#include "mmal_cma.h"
++
++#include "../../video_output/opengl/converter.h"
++
++#include "mmal_picture.h"
++
++#include <assert.h>
++
++#define TRACE_ALL 0
++
++// Private state of the MMAL -> OpenGL texture converter (tc->priv)
++typedef struct mmal_gl_converter_s
++{
++ EGLint drm_fourcc; // Passed as EGL_LINUX_DRM_FOURCC_EXT when importing a buffer
++ vcsm_init_type_t vcsm_init_type;
++ cma_buf_t * last_cb;
++
++ // Called to bind an imported EGL image to the external texture
++ PFNGLEGLIMAGETARGETTEXTURE2DOESPROC glEGLImageTargetTexture2DOES;
++} mmal_gl_converter_t;
++
++
++// Translate a VLC video format into the fourcc used for the GL import.
++// Returns 0 if there is no known equivalent.
++static EGLint vlc_to_gl_fourcc(const video_format_t * const fmt)
++{
++    static const struct {
++        MMAL_FOURCC_T mmal;
++        EGLint gl;
++    } fourcc_map[] = {
++        { MMAL_ENCODING_I420,  MMAL_FOURCC('Y','U','1','2') },
++        { MMAL_ENCODING_YV12,  MMAL_FOURCC('Y','V','1','2') },
++        { MMAL_ENCODING_I422,  MMAL_FOURCC('Y','U','1','6') },
++        // MMAL_ENCODING_YUVUV128 deliberately absent: doesn't actually work yet
++        { MMAL_ENCODING_NV12,  MMAL_FOURCC('N','V','1','2') },
++        { MMAL_ENCODING_NV21,  MMAL_FOURCC('N','V','2','1') },
++        { MMAL_ENCODING_RGB16, MMAL_FOURCC('R','G','1','6') },
++        { MMAL_ENCODING_RGB24, MMAL_FOURCC('B','G','2','4') },
++        { MMAL_ENCODING_BGR24, MMAL_FOURCC('R','G','2','4') },
++        { MMAL_ENCODING_BGR32, MMAL_FOURCC('X','R','2','4') },
++        { MMAL_ENCODING_BGRA,  MMAL_FOURCC('X','R','2','4') },
++        { MMAL_ENCODING_RGB32, MMAL_FOURCC('X','B','2','4') },
++        { MMAL_ENCODING_RGBA,  MMAL_FOURCC('X','B','2','4') },
++    };
++
++    // Converting to mmal first selects the right RGB32 variant
++    const MMAL_FOURCC_T enc = vlc_to_mmal_video_fourcc(fmt);
++
++    for (size_t k = 0; k != sizeof(fourcc_map) / sizeof(fourcc_map[0]); ++k)
++    {
++        if (fourcc_map[k].mmal == enc)
++            return fourcc_map[k].gl;
++    }
++    return 0;
++}
++
++// GL texture wrapped as a picture context so it can be attached to a cma_buf
++typedef struct tex_context_s {
++ picture_context_t cmn; // Must stay first: code casts between picture_context_t * and tex_context_t *
++ GLuint texture; // GL texture name; 0 until GenTextures has run
++
++ PFNGLDELETETEXTURESPROC DeleteTextures; // Copy fn pointer so we don't need tc on delete
++} tex_context_t;
++
++// Delete the GL texture held by tex and free the context itself.
++static void tex_context_delete(tex_context_t * const tex)
++{
++    // Use the stored fn pointer - there is no converter to hand at this point
++    PFNGLDELETETEXTURESPROC const delete_fn = tex->DeleteTextures;
++
++    delete_fn(1, &tex->texture);
++    free(tex);
++}
++
++// picture_context_t destroy hook: the context is really a tex_context_t
++// (cmn is its first member) so hand it on to the typed delete.
++static void tex_context_destroy(picture_context_t * pic_ctx)
++{
++    tex_context_t * const tex = (tex_context_t *)pic_ctx;
++
++    tex_context_delete(tex);
++}
++
++// picture_context_t copy hook.
++// Returns the same pointer: no new context is allocated and nothing is
++// reference counted here.
++// NOTE(review): if a caller destroys both the original and the "copy" the
++// texture would be deleted and the context freed twice - confirm this hook
++// is never used that way.
++static picture_context_t * tex_context_copy(picture_context_t * pic_ctx)
++{
++ return pic_ctx;
++}
++
++// Get the GL texture context for a CMA buffer, creating it on first use.
++//
++// If cb already has a context attached it is returned as is. Otherwise the
++// buffer's dma-buf fd is imported as an EGL image described by the plane
++// layout of pic, bound to a new external texture, and the resulting context
++// is attached to cb.
++//
++// Returns NULL on allocation failure, a bad plane count, import failure or
++// failure to attach the context to cb.
++static tex_context_t * get_tex_context(const opengl_tex_converter_t * const tc, picture_t * const pic, cma_buf_t * const cb)
++{
++ mmal_gl_converter_t * const sys = tc->priv;
++ tex_context_t * tex = (tex_context_t *)cma_buf_context2(cb);
++ if (tex != NULL)
++ return tex;
++
++ if ((tex = malloc(sizeof(*tex))) == NULL)
++ return NULL;
++
++ *tex = (tex_context_t){
++ .cmn = {
++ .destroy = tex_context_destroy,
++ .copy = tex_context_copy
++ },
++ .texture = 0,
++ .DeleteTextures = tc->vt->DeleteTextures
++ };
++
++ {
++ // Worst case use: 6 common entries + 2 SAND planes * 10 + terminator
++ // = 27, or 6 + 3 planes * 6 + terminator = 25 for everything else
++ EGLint attribs[30];
++ EGLint * a = attribs;
++ const int fd = cma_buf_fd(cb);
++ uint8_t * base_addr = cma_buf_addr(cb);
++
++ if (pic->i_planes >= 4 || pic->i_planes <= 0)
++ {
++ msg_Err(tc, "%s: Bad planes: %d", __func__, pic->i_planes);
++ goto fail;
++ }
++
++ *a++ = EGL_WIDTH;
++ *a++ = pic->format.i_visible_width;
++ *a++ = EGL_HEIGHT;
++ *a++ = pic->format.i_visible_height;
++ *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++ *a++ = sys->drm_fourcc;
++
++ if (pic->format.i_chroma == VLC_CODEC_MMAL_ZC_SAND8)
++ {
++ // Sand is its own very special bunny :-(
++ static const EGLint attnames[] = {
++ EGL_DMA_BUF_PLANE0_FD_EXT,
++ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE0_PITCH_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++ EGL_DMA_BUF_PLANE1_FD_EXT,
++ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE1_PITCH_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT
++ };
++
++ const EGLint * n = attnames;
++
++ // attnames only covers 2 planes; a 3rd would run off the end of it
++ // and write past the end of attribs
++ if (pic->i_planes > 2)
++ {
++ msg_Err(tc, "%s: Bad planes: %d", __func__, pic->i_planes);
++ goto fail;
++ }
++
++ for (int i = 0; i < pic->i_planes; ++i)
++ {
++ const uint64_t mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(pic->p[i].i_pitch >> 7);
++
++ *a++ = *n++;
++ *a++ = fd;
++ *a++ = *n++;
++ *a++ = pic->p[i].p_pixels - base_addr;
++ *a++ = *n++;
++ *a++ = pic->format.i_width;
++ *a++ = *n++;
++ *a++ = (EGLint)(mod >> 32);
++ *a++ = *n++;
++ *a++ = (EGLint)(mod & 0xffffffff);
++ }
++ }
++ else
++ {
++ static const EGLint attnames[] = {
++ EGL_DMA_BUF_PLANE0_FD_EXT,
++ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE0_PITCH_EXT,
++ EGL_DMA_BUF_PLANE1_FD_EXT,
++ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE1_PITCH_EXT,
++ EGL_DMA_BUF_PLANE2_FD_EXT,
++ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE2_PITCH_EXT,
++ EGL_DMA_BUF_PLANE3_FD_EXT,
++ EGL_DMA_BUF_PLANE3_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE3_PITCH_EXT
++ };
++
++ const EGLint * n = attnames;
++
++ // All planes live in the one buffer so share the fd and differ
++ // only in offset & pitch
++ for (int i = 0; i < pic->i_planes; ++i)
++ {
++ *a++ = *n++;
++ *a++ = fd;
++ *a++ = *n++;
++ *a++ = pic->p[i].p_pixels - base_addr;
++ *a++ = *n++;
++ *a++ = pic->p[i].i_pitch;
++ }
++ }
++
++ *a = EGL_NONE;
++
++ const EGLImage image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
++ if (!image) {
++ msg_Err(tc, "Failed to import fd %d: Err=%#x", fd, tc->vt->GetError());
++ goto fail;
++ }
++
++ // ** ?? tc->tex_target
++ tc->vt->GenTextures(1, &tex->texture);
++ tc->vt->BindTexture(GL_TEXTURE_EXTERNAL_OES, tex->texture);
++ tc->vt->TexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++ tc->vt->TexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++ sys->glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++
++ // The image handle is dropped as soon as it has been bound to the texture
++ tc->gl->egl.destroyImageKHR(tc->gl, image);
++ }
++
++ if (cma_buf_add_context2(cb, &tex->cmn) != VLC_SUCCESS)
++ {
++ msg_Err(tc, "%s: add_context2 failed", __func__);
++ goto fail;
++ }
++ return tex;
++
++fail:
++ // tex->texture is still 0 if we never got as far as GenTextures
++ tex_context_delete(tex);
++ return NULL;
++}
++
++
++static int
++tc_mmal_update(const opengl_tex_converter_t *tc, GLuint *textures,
++               const GLsizei *tex_width, const GLsizei *tex_height,
++               picture_t *pic, const size_t *plane_offset)
++{
++    // Per-picture update hook: stores the GL texture associated with the
++    // picture's CMA buffer in textures[0]. The size / offset arguments are
++    // only used for tracing.
++    mmal_gl_converter_t * const conv = tc->priv;
++#if TRACE_ALL
++    {
++        char fcc[5];
++        msg_Dbg(tc, "%s: %s %d*%dx%d : %d*%dx%d", __func__,
++                str_fourcc(fcc, pic->format.i_chroma),
++                tc->tex_count, tex_width[0], tex_height[0], pic->i_planes, pic->p[0].i_pitch, pic->p[0].i_lines);
++    }
++#endif
++    VLC_UNUSED(tex_width);
++    VLC_UNUSED(tex_height);
++    VLC_UNUSED(plane_offset);
++
++    // Only pictures backed by a CMA buffer can be handled
++    if (!is_cma_buf_pic_chroma(pic->format.i_chroma))
++    {
++        char fcc[5];
++        msg_Err(tc, "Pic with unexpected chroma: %s", str_fourcc(fcc, pic->format.i_chroma));
++        return VLC_EGENERIC;
++    }
++
++    cma_buf_t * const cma = cma_buf_pic_get(pic);
++    if (cma == NULL)
++    {
++        msg_Err(tc, "Pic missing cma buf");
++        return VLC_EGENERIC;
++    }
++
++    // Texture context is looked up (or created) per CMA buffer
++    tex_context_t * const ctx = get_tex_context(tc, pic, cma);
++    if (ctx == NULL)
++        return VLC_EGENERIC;
++
++    // Swap the held reference: drop the previous buffer, then hold this one
++    cma_buf_unref(conv->last_cb);
++    conv->last_cb = cma_buf_ref(cma);
++
++    textures[0] = ctx->texture;
++    return VLC_SUCCESS;
++}
++
++static int
++tc_mmal_fetch_locations(opengl_tex_converter_t *tc, GLuint program)
++{
++    // The shader has a single sampler uniform; fail if the program lacks it
++    tc->uloc.Texture[0] = tc->vt->GetUniformLocation(program, "Texture0");
++    if (tc->uloc.Texture[0] == -1)
++        return VLC_EGENERIC;
++    return VLC_SUCCESS;
++}
++
++static void
++tc_mmal_prepare_shader(const opengl_tex_converter_t *tc,
++                       const GLsizei *tex_width, const GLsizei *tex_height,
++                       float alpha)
++{
++    // Deliberately a no-op: nothing is uploaded per draw.
++    // (Disabled in the original: tc->vt->Uniform1i(tc->uloc.Texture[0], 0);)
++    VLC_UNUSED(tc);
++    VLC_UNUSED(tex_width);
++    VLC_UNUSED(tex_height);
++    VLC_UNUSED(alpha);
++}
++
++static GLuint
++tc_fragment_shader_init(opengl_tex_converter_t * const tc, const GLenum tex_target,
++                        const vlc_fourcc_t chroma, const video_color_space_t yuv_space)
++{
++    // Fragment program: a straight sample of the external-OES texture
++    static const char fs_src[] =
++        "#extension GL_OES_EGL_image_external : enable\n"
++        "precision mediump float;\n"
++        "uniform samplerExternalOES Texture0;\n"
++        "varying vec2 TexCoord0;\n"
++        "void main() {\n"
++        " gl_FragColor = texture2D(Texture0, TexCoord0);\n"
++        "}\n";
++    const char *src = fs_src;
++
++    VLC_UNUSED(yuv_space);
++
++    // Single texture; the internal format / type values were flagged as
++    // uncertain ("??") by the original author
++    tc->tex_count = 1;
++    tc->tex_target = tex_target;
++    tc->texs[0] = (struct opengl_tex_cfg) {
++        { 1, 1 }, { 1, 1 }, GL_RGB, chroma, GL_UNSIGNED_SHORT
++    };
++
++    tc->pf_fetch_locations = tc_mmal_fetch_locations;
++    tc->pf_prepare_shader = tc_mmal_prepare_shader;
++
++    // Create & compile; the compile status is not checked here
++    const GLuint shader = tc->vt->CreateShader(GL_FRAGMENT_SHADER);
++    tc->vt->ShaderSource(shader, 1, &src, NULL);
++    tc->vt->CompileShader(shader);
++    return shader;
++}
++
++
++// Close callback for the converter; also used by OpenGLConverter to unwind a
++// failed open. Does nothing if tc->priv was never allocated.
++static void
++CloseGLConverter(vlc_object_t *obj)
++{
++    opengl_tex_converter_t * const tc = (opengl_tex_converter_t *)obj;
++    mmal_gl_converter_t * const sys = tc->priv;
++
++    if (sys == NULL)
++        return;
++
++    // Drop the reference held on the last displayed buffer (if any)
++    cma_buf_unref(sys->last_cb);
++    cma_vcsm_exit(sys->vcsm_init_type);
++    free(sys);
++
++    // Don't leave a dangling pointer behind: the failed-open path returns to
++    // the caller with tc still alive, and a second close must be harmless
++    tc->priv = NULL;
++}
++
++
++// Map an input chroma onto the chroma we can convert it to.
++// I420 is preferred as the smallest; 0 means "not handled".
++static vlc_fourcc_t chroma_in_out(const vlc_fourcc_t chroma_in)
++{
++    // YUV-capable inputs (SAND10 via the ISP only)
++    if (chroma_in == VLC_CODEC_MMAL_OPAQUE ||
++        chroma_in == VLC_CODEC_MMAL_ZC_I420 ||
++        chroma_in == VLC_CODEC_MMAL_ZC_SAND8 ||
++        chroma_in == VLC_CODEC_MMAL_ZC_SAND10)
++    {
++        return VLC_CODEC_MMAL_ZC_I420;
++    }
++
++    // SAND30 is HVS only and the HVS can't generate YUV of any sort
++    if (chroma_in == VLC_CODEC_MMAL_ZC_SAND30 ||
++        chroma_in == VLC_CODEC_MMAL_ZC_RGB32)
++    {
++        return VLC_CODEC_MMAL_ZC_RGB32;
++    }
++
++    return 0;
++}
++
++
++static int // Open callback (registered via set_callbacks): checks the chroma, allocates tc->priv, builds the shader
++OpenGLConverter(vlc_object_t *obj)
++{
++ opengl_tex_converter_t * const tc = (opengl_tex_converter_t *)obj;
++ int rv = VLC_EGENERIC; // result for every failure path unless overwritten (VLC_ENOMEM on alloc failure)
++ const EGLint eglfmt = vlc_to_gl_fourcc(&tc->fmt);
++ const vlc_fourcc_t chroma_out = chroma_in_out(tc->fmt.i_chroma); // 0 => chroma not handled by this converter
++
++ // Do we know what to do with this?
++ if (chroma_out == 0)
++ return rv;
++
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(tc, "<<< %s: V:%s/E:%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %s", __func__,
++ str_fourcc(dbuf0, tc->fmt.i_chroma),
++ str_fourcc(dbuf1, eglfmt),
++ tc->fmt.i_width, tc->fmt.i_height,
++ tc->fmt.i_x_offset, tc->fmt.i_y_offset,
++ tc->fmt.i_visible_width, tc->fmt.i_visible_height,
++ tc->fmt.i_sar_num, tc->fmt.i_sar_den,
++ str_fourcc(dbuf2, chroma_out));
++ }
++
++ if (tc->gl->ext != VLC_GL_EXT_EGL ||
++ !tc->gl->egl.createImageKHR || !tc->gl->egl.destroyImageKHR)
++ {
++ // Missing an important callback
++ msg_Dbg(tc, "Missing EGL xxxImageKHR calls");
++ return rv;
++ }
++
++ if ((tc->priv = calloc(1, sizeof(mmal_gl_converter_t))) == NULL)
++ {
++ msg_Err(tc, "priv alloc failure");
++ rv = VLC_ENOMEM;
++ goto fail; // tc->priv is NULL, so CloseGLConverter returns at once
++ }
++ mmal_gl_converter_t * const sys = tc->priv;
++
++ sys->drm_fourcc = eglfmt;
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) != VCSM_INIT_CMA) { // only a VCSM_INIT_CMA result is accepted
++ msg_Dbg(tc, "VCSM init failed");
++ goto fail;
++ }
++
++ if ((sys->glEGLImageTargetTexture2DOES = vlc_gl_GetProcAddress(tc->gl, "glEGLImageTargetTexture2DOES")) == NULL)
++ {
++ msg_Err(tc, "Failed to bind GL fns");
++ goto fail;
++ }
++
++ if ((tc->fshader = tc_fragment_shader_init(tc, GL_TEXTURE_EXTERNAL_OES,
++ eglfmt == 0 ? VLC_CODEC_RGB32 : tc->fmt.i_chroma,
++ eglfmt == 0 ? COLOR_SPACE_SRGB : tc->fmt.space)) == 0)
++ {
++ msg_Err(tc, "Failed to make shader");
++ goto fail;
++ }
++
++ if (eglfmt == 0) // vlc_to_gl_fourcc gave 0 for the input format: switch tc->fmt to chroma_out
++ {
++ tc->fmt.i_chroma = chroma_out;
++ tc->fmt.i_bits_per_pixel = 8;
++ if (tc->fmt.i_chroma == VLC_CODEC_MMAL_ZC_RGB32)
++ {
++ tc->fmt.i_rmask = 0xff0000;
++ tc->fmt.i_gmask = 0xff00;
++ tc->fmt.i_bmask = 0xff;
++ tc->fmt.space = COLOR_SPACE_SRGB;
++ }
++ else
++ {
++ tc->fmt.i_rmask = 0;
++ tc->fmt.i_gmask = 0;
++ tc->fmt.i_bmask = 0;
++ tc->fmt.space = COLOR_SPACE_UNDEF;
++ }
++ sys->drm_fourcc = vlc_to_gl_fourcc(&tc->fmt); // redo the lookup for the substituted format
++ }
++
++ tc->handle_texs_gen = true; // We manage the texs
++ tc->pf_update = tc_mmal_update;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(tc, ">>> %s: V:%s/E:%s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %s", __func__,
++ str_fourcc(dbuf0, tc->fmt.i_chroma),
++ str_fourcc(dbuf1, sys->drm_fourcc),
++ tc->fmt.i_width, tc->fmt.i_height,
++ tc->fmt.i_x_offset, tc->fmt.i_y_offset,
++ tc->fmt.i_visible_width, tc->fmt.i_visible_height,
++ tc->fmt.i_sar_num, tc->fmt.i_sar_den,
++ str_fourcc(dbuf2, chroma_out));
++ }
++#endif
++
++ return VLC_SUCCESS;
++
++fail:
++ CloseGLConverter(obj); // unwinds last_cb / vcsm / tc->priv; a no-op if priv was never allocated
++ return rv;
++}
++
++vlc_module_begin () // Module descriptor for the MMAL GL converter
++ set_description("MMAL OpenGL surface converter")
++ set_shortname (N_("MMALGLConverter"))
++ set_capability("glconv", 900) // capability "glconv", score 900
++ set_callbacks(OpenGLConverter, CloseGLConverter) // open / close entry points defined above
++ set_category(CAT_VIDEO)
++ set_subcategory(SUBCAT_VIDEO_VOUT)
++ add_shortcut("mmal_gl_converter") // presumably the name used to request this module explicitly - TODO confirm
++vlc_module_end ()
++
+--- a/modules/hw/mmal/deinterlace.c
++++ b/modules/hw/mmal/deinterlace.c
+@@ -26,11 +26,12 @@
+ #include "config.h"
+ #endif
+
+-#include <vlc_picture_pool.h>
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
++#include <vlc_picture_pool.h>
+ #include <vlc_plugin.h>
+ #include <vlc_filter.h>
+-#include <vlc_atomic.h>
+
+ #include "mmal_picture.h"
+
+@@ -39,468 +40,814 @@
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+
+-#define MIN_NUM_BUFFERS_IN_TRANSIT 2
++#define MMAL_DEINTERLACE_NO_QPU "mmal-deinterlace-no-qpu"
++#define MMAL_DEINTERLACE_NO_QPU_TEXT N_("Do not use QPUs for advanced HD deinterlacing.")
++#define MMAL_DEINTERLACE_NO_QPU_LONGTEXT N_("Do not make use of the QPUs to allow higher quality deinterlacing of HD content.")
+
+-#define MMAL_DEINTERLACE_QPU "mmal-deinterlace-adv-qpu"
+-#define MMAL_DEINTERLACE_QPU_TEXT N_("Use QPUs for advanced HD deinterlacing.")
+-#define MMAL_DEINTERLACE_QPU_LONGTEXT N_("Make use of the QPUs to allow higher quality deinterlacing of HD content.")
++#define MMAL_DEINTERLACE_ADV "mmal-deinterlace-adv"
++#define MMAL_DEINTERLACE_ADV_TEXT N_("Force advanced deinterlace")
++#define MMAL_DEINTERLACE_ADV_LONGTEXT N_("Force advanced deinterlace")
+
+-static int Open(filter_t *filter);
+-static void Close(filter_t *filter);
++#define MMAL_DEINTERLACE_FAST "mmal-deinterlace-fast"
++#define MMAL_DEINTERLACE_FAST_TEXT N_("Force fast deinterlace")
++#define MMAL_DEINTERLACE_FAST_LONGTEXT N_("Force fast deinterlace")
+
+-vlc_module_begin()
+- set_shortname(N_("MMAL deinterlace"))
+- set_description(N_("MMAL-based deinterlace filter plugin"))
+- set_capability("video filter", 0)
+- set_category(CAT_VIDEO)
+- set_subcategory(SUBCAT_VIDEO_VFILTER)
+- set_callbacks(Open, Close)
+- add_shortcut("deinterlace")
+- add_bool(MMAL_DEINTERLACE_QPU, false, MMAL_DEINTERLACE_QPU_TEXT,
+- MMAL_DEINTERLACE_QPU_LONGTEXT, true);
+-vlc_module_end()
++#define MMAL_DEINTERLACE_NONE "mmal-deinterlace-none"
++#define MMAL_DEINTERLACE_NONE_TEXT N_("Force no deinterlace")
++#define MMAL_DEINTERLACE_NONE_LONGTEXT N_("Force no interlace. Simply strips off the interlace markers and passes the frame straight through. "\
++ "This is the default for > SD if < 96M gpu-mem")
++
++#define MMAL_DEINTERLACE_HALF_RATE "mmal-deinterlace-half-rate"
++#define MMAL_DEINTERLACE_HALF_RATE_TEXT N_("Halve output framerate")
++#define MMAL_DEINTERLACE_HALF_RATE_LONGTEXT N_("Halve output framerate. 1 output frame for each pair of interlaced fields input")
++
++#define MMAL_DEINTERLACE_FULL_RATE "mmal-deinterlace-full-rate"
++#define MMAL_DEINTERLACE_FULL_RATE_TEXT N_("Full output framerate")
++#define MMAL_DEINTERLACE_FULL_RATE_LONGTEXT N_("Full output framerate. 1 output frame for each interlaced field input")
+
+-struct filter_sys_t {
++
++typedef struct filter_sys_t // Per-filter state for the MMAL deinterlacer
++{
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_PORT_T *output;
++ MMAL_POOL_T *in_pool; // headers used by deinterlace() to replicate the input pic's buffer
++
++ MMAL_QUEUE_T * out_q; // filled by di_output_port_cb, drained by deinterlace() / di_flush()
++
++ // Bind this lot somehow into ppr????
++ bool is_cma; // true when the output chroma is a CMA-buffer chroma (set at open)
++ cma_buf_pool_t * cma_out_pool; // CMA path: source of output buffers
++ MMAL_POOL_T * out_pool; // CMA path: headers that carry the CMA buffers to the output port
++
++ hw_mmal_port_pool_ref_t *out_ppr; // non-CMA path: output port/pool ref used to refill the port
++
++ bool half_rate;
++ bool use_qpu;
++ bool use_fast;
++ bool use_passthrough; // when set, close just frees this struct
++ unsigned int seq_in; // Seq of next frame to submit (1-15) [Init=1]
++ unsigned int seq_out; // Seq of last frame received (1-15) [Init=15]
+
+- MMAL_QUEUE_T *filtered_pictures;
+- vlc_sem_t sem;
++ vcsm_init_type_t vcsm_init_type; // result of cma_vcsm_init(), handed back to cma_vcsm_exit() at close
+
+- atomic_bool started;
++} filter_sys_t;
+
+- /* statistics */
+- int output_in_transit;
+- int input_in_transit;
+-};
+-
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static picture_t *deinterlace(filter_t *filter, picture_t *picture);
+-static void flush(filter_t *filter);
+
+ #define MMAL_COMPONENT_DEFAULT_DEINTERLACE "vc.ril.image_fx"
+
+-static int Open(filter_t *filter)
++#define TRACE_ALL 0
++
++
++
++// Allocate an output pic for buf. Buffer attached to pic on success, is still valid (not released) on failure; returns NULL on failure
++static picture_t * di_alloc_opaque(filter_t * const p_filter, MMAL_BUFFER_HEADER_T * const buf)
+ {
+- int32_t frame_duration = filter->fmt_in.video.i_frame_rate != 0 ?
+- (int64_t)1000000 * filter->fmt_in.video.i_frame_rate_base /
+- filter->fmt_in.video.i_frame_rate : 0;
+- bool use_qpu = var_InheritBool(filter, MMAL_DEINTERLACE_QPU);
++ filter_sys_t *const filter_sys = p_filter->p_sys;
++ picture_t * const pic = filter_NewPicture(p_filter);
+
+- MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {
+- { MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param) },
+- MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV,
+- 4,
+- { 3, frame_duration, 0, use_qpu }
+- };
++ if (pic == NULL)
++ goto fail1;
+
+- int ret = VLC_SUCCESS;
+- MMAL_STATUS_T status;
+- filter_sys_t *sys;
++ if (buf->length == 0) {
++ msg_Err(p_filter, "%s: Empty buffer", __func__);
++ goto fail2;
++ }
+
+- msg_Dbg(filter, "Try to open mmal_deinterlace filter. frame_duration: %d, QPU %s!",
+- frame_duration, use_qpu ? "used" : "unused");
++ if ((pic->context = hw_mmal_gen_context(buf, filter_sys->out_ppr)) == NULL) // context built from buf + the output port/pool ref
++ goto fail2;
+
+- if (filter->fmt_in.video.i_chroma != VLC_CODEC_MMAL_OPAQUE)
+- return VLC_EGENERIC;
++ buf_to_pic_copy_props(pic, buf);
+
+- if (filter->fmt_out.video.i_chroma != VLC_CODEC_MMAL_OPAQUE)
+- return VLC_EGENERIC;
++#if TRACE_ALL
++ msg_Dbg(p_filter, "pic: prog=%d, tff=%d, date=%lld", pic->b_progressive, pic->b_top_field_first, (long long)pic->date);
++#endif
+
+- sys = calloc(1, sizeof(filter_sys_t));
+- if (!sys)
+- return VLC_ENOMEM;
+- filter->p_sys = sys;
++ return pic;
+
+- bcm_host_init();
++fail2:
++ picture_Release(pic); // only the new pic is dropped; buf is left for the caller
++fail1:
++// mmal_buffer_header_release(buf);
++ return NULL;
++}
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_DEINTERLACE, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++static void di_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) // Input port callback: just releases the returned buffer header
++{
++#if TRACE_ALL
++ pic_ctx_mmal_t * ctx = buffer->user_data;
++// filter_sys_t *const sys = ((filter_t *)port->userdata)->p_sys;
++
++ msg_Dbg((filter_t *)port->userdata, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buffer->cmd, ctx, buffer,
++ buffer->flags, (long long)buffer->pts);
++#else
++ VLC_UNUSED(port); // port is only needed for tracing
++#endif
+
+- status = mmal_port_parameter_set(sys->component->output[0], &imfx_param.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to configure MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++ mmal_buffer_header_release(buffer);
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg((filter_t *)port->userdata, ">>> %s", __func__);
++#endif
++}
++
++static void di_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf) // Output port callback: queue buffers that carry data, recycle the rest
++{
++ if (buf->cmd == 0 && buf->length != 0) // a non-event buffer with payload
++ {
++ // The filter structure etc. should always exist if we have contents
++ // but might not on later flushes as we shut down
++ filter_t * const p_filter = (filter_t *)port->userdata;
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s: cmd=%d; flags=%#x, pts=%lld", __func__, buf->cmd, buf->flags, (long long) buf->pts);
++#endif
++ mmal_queue_put(sys->out_q, buf); // picked up later by deinterlace()
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: out Q len=%d", __func__, mmal_queue_length(sys->out_q));
++#endif
++ return;
+ }
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE)
+- sys->input->format->encoding = MMAL_ENCODING_OPAQUE;
+- sys->input->format->es->video.width = filter->fmt_in.video.i_width;
+- sys->input->format->es->video.height = filter->fmt_in.video.i_height;
+- sys->input->format->es->video.crop.x = 0;
+- sys->input->format->es->video.crop.y = 0;
+- sys->input->format->es->video.crop.width = filter->fmt_in.video.i_width;
+- sys->input->format->es->video.crop.height = filter->fmt_in.video.i_height;
+- sys->input->format->es->video.par.num = filter->fmt_in.video.i_sar_num;
+- sys->input->format->es->video.par.den = filter->fmt_in.video.i_sar_den;
++ mmal_buffer_header_reset(buf); // User data stays intact so release will kill pic
++ mmal_buffer_header_release(buf);
++}
+
+- es_format_Copy(&filter->fmt_out, &filter->fmt_in);
+- filter->fmt_out.video.i_frame_rate *= 2;
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
+- sys->input->buffer_num = sys->input->buffer_num_recommended;
+
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE) {
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++static MMAL_STATUS_T fill_output_from_q(filter_t * const p_filter, filter_sys_t * const sys, MMAL_QUEUE_T * const q) // Send every buffer in q to the output port; stops at the first send error
++{
++ MMAL_BUFFER_HEADER_T * out_buf;
+
+- status = mmal_port_parameter_set(sys->input, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
++ while ((out_buf = mmal_queue_get(q)) != NULL)
++ {
++ MMAL_STATUS_T err;
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ mmal_queue_put_back(q, out_buf); // keep the unsent buffer queued
++ return err;
+ }
+ }
++ return MMAL_SUCCESS;
++}
+
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
+- }
++// Pre-release hook for output buffer headers: they may still carry a cma_buf ref in user_data on error or flush
++// Detach it and drop that ref
++static MMAL_BOOL_T out_buffer_pre_release_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++ VLC_UNUSED(userdata);
+
+- sys->output = sys->component->output[0];
+- sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
+- mmal_format_full_copy(sys->output->format, sys->input->format);
++ cma_buf_t * const cb = header->user_data;
++ header->user_data = NULL;
++ cma_buf_unref(cb); // Copes fine with NULL
+
+- status = mmal_port_format_commit(sys->output);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to commit format for output port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ return MMAL_FALSE; // always MMAL_FALSE
++}
++
++// Advance a frame sequence number; values cycle 1..15 (0 is never produced
++// from a value in that range)
++static inline unsigned int seq_inc(unsigned int x)
++{
++    const unsigned int next = x + 1;
++
++    if (next >= 16)
++        return 1;
++    return next;
++}
++
++// Distance from fseq up to sseq in the cyclic 1..15 sequence space.
++// fseq == 0 (no sequence) always yields 0.
++static inline unsigned int seq_delta(unsigned int sseq, unsigned int fseq)
++{
++    if (fseq == 0)
++        return 0;
++
++    if (fseq <= sseq)
++        return sseq - fseq;
++
++    // fseq is "ahead" numerically, so sseq has wrapped round
++    return 15 - (fseq - sseq);
++}
++
++// Filter callback for the MMAL deinterlace path.
++// Consumes p_pic: its reference is released exactly once on every path.
++// Queues one output buffer and the picture's (replicated) buffer on the
++// component, then returns the pictures waiting in out_q linked through
++// p_next. Returns NULL if nothing is ready yet or on failure.
++static picture_t *deinterlace(filter_t * p_filter, picture_t * p_pic)
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++ picture_t *ret_pics = NULL;
++ MMAL_STATUS_T err;
++ MMAL_BUFFER_HEADER_T * out_buf = NULL;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ // ****** Breaks on opaque (at least)
++
++ if (sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++#if 0
++ if (sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ mmal_format_full_copy(sys->output->format, sys->input->format);
++ mmal_port_format_commit(sys->output);
++ sys->output->buffer_num = 30;
++ sys->output->buffer_size = sys->input->buffer_size_recommended;
++ mmal_port_enable(sys->output, di_output_port_cb);
++#endif
++ if (mmal_port_format_commit(sys->input) != MMAL_SUCCESS)
++ msg_Err(p_filter, "Failed to update pic format");
++ sys->input->buffer_num = 30;
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ mmal_log_dump_format(sys->input->format);
++ }
++
++ // Reenable stuff if the last thing we did was flush
++ // Output should always be enabled
++ if (!sys->input->is_enabled &&
++ (err = mmal_port_enable(sys->input, di_input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Input port reenable failed");
++ goto fail;
++ }
++
++ if (!sys->is_cma)
++ {
++ // Fill output from anything that has turned up in pool Q
++ if (hw_mmal_port_pool_ref_fill(sys->out_ppr) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Out port fill fail");
++ goto fail;
++ }
+ }
++ else
++ {
++ // We are expecting one in - one out so simply wedge a new buffer
++ // into the output port. Flow control will happen on cma alloc.
++
++ if ((out_buf = mmal_queue_get(sys->out_pool->queue)) == NULL)
++ {
++ // Should never happen
++ msg_Err(p_filter, "Failed to get output buffer");
++ goto fail;
++ }
++ mmal_buffer_header_reset(out_buf);
+
+- sys->output->buffer_num = 3;
++ // Attach cma_buf to the buffer & ensure it is freed when the buffer is released
++ // On a good send callback the pic will be extracted to avoid this
++ mmal_buffer_header_pre_release_cb_set(out_buf, out_buffer_pre_release_cb, p_filter);
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(sys->cma_out_pool, sys->output->buffer_size);
++ if ((out_buf->user_data = cb) == NULL) // Check & attach cb to buf
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to alloc CMA buf: fmt=%s, size=%d",
++ str_fourcc(dbuf0, p_pic->format.i_chroma),
++ sys->output->buffer_size);
++ goto fail;
++ }
++ const unsigned int vc_h = cma_buf_vc_handle(cb); // Cannot coerce without going via variable
++ out_buf->data = (uint8_t *)vc_h;
++ out_buf->alloc_size = sys->output->buffer_size;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Out buf send: pic=%p, data=%p, user=%p, flags=%#x, len=%d/%d, pts=%lld",
++ p_pic, out_buf->data, out_buf->user_data, out_buf->flags,
++ out_buf->length, out_buf->alloc_size, (long long)out_buf->pts);
++#endif
+
+- if (filter->fmt_in.i_codec == VLC_CODEC_MMAL_OPAQUE) {
+- MMAL_PARAMETER_UINT32_T extra_buffers = {
+- { MMAL_PARAMETER_EXTRA_BUFFERS, sizeof(MMAL_PARAMETER_UINT32_T) },
+- 5
+- };
+- status = mmal_port_parameter_set(sys->output, &extra_buffers.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- goto out;
++ if ((err = mmal_port_send_buffer(sys->output, out_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to output failed");
++ goto fail;
+ }
++ out_buf = NULL;
++ }
+
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ {
++ MMAL_BUFFER_HEADER_T * const pic_buf = hw_mmal_pic_buf_replicated(p_pic, sys->in_pool);
++
++ if (pic_buf == NULL)
++ {
++ msg_Err(p_filter, "Pic has not attached buffer");
++ goto fail;
++ }
+
+- status = mmal_port_parameter_set(sys->output, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- goto out;
++ picture_Release(p_pic);
++ // Our reference is gone: make sure the fail path cannot release it again
++ p_pic = NULL;
++
++ // Add a sequence to the flags so we can track what we have actually
++ // deinterlaced
++ pic_buf->flags = (pic_buf->flags & ~(0xfU * MMAL_BUFFER_HEADER_FLAG_USER0)) | (sys->seq_in * (MMAL_BUFFER_HEADER_FLAG_USER0));
++ sys->seq_in = seq_inc(sys->seq_in);
++
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to input failed");
++ mmal_buffer_header_release(pic_buf);
++ goto fail;
+ }
+ }
+
+- status = mmal_port_enable(sys->output, output_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
+- sys->output->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ // Return anything that is in the out Q
++ {
++ picture_t ** pp_pic = &ret_pics;
++
++ // Advanced di has a 3 frame latency, so if the seq delta is greater
++ // than that then we are expecting at least two frames of output. Wait
++ // for one of those.
++ // seq_in is seq of the next frame we are going to submit (1-15, no 0)
++ // seq_out is last frame we removed from Q
++ // So after 4 frames sent (1st time we want to wait), 0 rx seq_in=5, seq_out=15, delta=5
++
++ while ((out_buf = (seq_delta(sys->seq_in, sys->seq_out) >= 5 ? mmal_queue_timedwait(sys->out_q, 1000) : mmal_queue_get(sys->out_q))) != NULL)
++ {
++ const unsigned int seq_out = (out_buf->flags / MMAL_BUFFER_HEADER_FLAG_USER0) & 0xf;
++ int rv;
++
++ picture_t * out_pic;
++
++ if (sys->is_cma)
++ {
++ // Alloc pic
++ if ((out_pic = filter_NewPicture(p_filter)) == NULL)
++ {
++ // Can't alloc pic - just stop extraction
++ mmal_queue_put_back(sys->out_q, out_buf);
++ out_buf = NULL;
++ msg_Warn(p_filter, "Failed to alloc new filter output pic");
++ break;
++ }
++
++ // Extract cma_buf from buf & attach to pic
++ cma_buf_t * const cb = (cma_buf_t *)out_buf->user_data;
++ if ((rv = cma_buf_pic_attach(cb, out_pic)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to attach CMA to pic: fmt=%s err=%d",
++ str_fourcc(dbuf0, out_pic->format.i_chroma),
++ rv);
++ // cb still attached to buffer and will be freed with it
++ // The pic we just allocated is not on the return chain yet
++ picture_Release(out_pic);
++ goto fail;
++ }
++ out_buf->user_data = NULL;
++
++ buf_to_pic_copy_props(out_pic, out_buf);
++
++ // Set pic data pointers from buf aux info now it has it
++ if ((rv = cma_pic_set_data(out_pic, sys->output->format, out_buf)) != VLC_SUCCESS)
++ {
++ char dbuf0[5];
++ msg_Err(p_filter, "Failed to set data: fmt=%s, rv=%d",
++ str_fourcc(dbuf0, sys->output->format->encoding),
++ rv);
++ }
++
++ out_buf->user_data = NULL; // Responsibility for this pic no longer with buffer
++ mmal_buffer_header_release(out_buf);
++ }
++ else
++ {
++ out_pic = di_alloc_opaque(p_filter, out_buf);
++
++ if (out_pic == NULL) {
++ msg_Warn(p_filter, "Failed to alloc new filter output pic");
++ mmal_queue_put_back(sys->out_q, out_buf); // Wedge buf back into Q in the hope we can alloc a pic later
++ out_buf = NULL;
++ break;
++ }
++ }
++ out_buf = NULL; // Now attached to pic or recycled
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "-- %s: Q pic=%p: seq_in=%d, seq_out=%d, delta=%d", __func__, out_pic, sys->seq_in, seq_out, seq_delta(sys->seq_in, seq_out));
++#endif
++
++ *pp_pic = out_pic;
++ pp_pic = &out_pic->p_next;
++
++ // Ignore 0 seqs
++ // Don't think these should actually happen
++ if (seq_out != 0)
++ sys->seq_out = seq_out;
++ }
++
++ // Crash on lockup
++ assert(ret_pics != NULL || seq_delta(sys->seq_in, sys->seq_out) < 5);
+ }
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s: pic=%p", __func__, ret_pics);
++#endif
++
++ return ret_pics;
++
++fail:
++ if (out_buf != NULL)
++ mmal_buffer_header_release(out_buf);
++ // p_pic is NULL once its reference has been handed back above
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ // The caller only sees NULL, so drop any output pics already chained
++ while (ret_pics != NULL)
++ {
++ picture_t * const next_pic = ret_pics->p_next;
++ ret_pics->p_next = NULL;
++ picture_Release(ret_pics);
++ ret_pics = next_pic;
++ }
++ return NULL;
++}
++
++static void di_flush(filter_t *p_filter) // Flush: stop the input, recycle queued output buffers, reset the seq counters
++{
++ filter_sys_t * const sys = p_filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "<<< %s", __func__);
++#endif
++
++ if (sys->input != NULL && sys->input->is_enabled)
++ mmal_port_disable(sys->input);
++
++ if (sys->output != NULL && sys->output->is_enabled)
++ {
++ if (sys->is_cma)
++ {
++ MMAL_BUFFER_HEADER_T * buf;
++ mmal_port_disable(sys->output);
++ while ((buf = mmal_queue_get(sys->out_q)) != NULL) // drain whatever was queued
++ mmal_buffer_header_release(buf);
++ }
++ else
++ {
++ // Wedge anything we've got into the output port as that will free the underlying buffers
++ fill_output_from_q(p_filter, sys, sys->out_q);
++
++ mmal_port_disable(sys->output);
++
++ // If that dumped anything real into the out_q then have another go
++ if (mmal_queue_length(sys->out_q) != 0)
++ {
++ mmal_port_enable(sys->output, di_output_port_cb);
++ fill_output_from_q(p_filter, sys, sys->out_q);
++ mmal_port_disable(sys->output);
++ // Out q should now be empty & should remain so until the input is reenabled
++ }
++ }
++ mmal_port_enable(sys->output, di_output_port_cb); // return status is not checked
++
++ // Leaving the input disabled is fine - but we want to leave the output enabled
++ // so we can retrieve buffers that are still bound to pictures
+ }
+
+- sys->filtered_pictures = mmal_queue_create();
++ sys->seq_in = 1; // same initial values as set at open
++ sys->seq_out = 15;
+
+- filter->pf_video_filter = deinterlace;
+- filter->pf_flush = flush;
++#if TRACE_ALL
++ msg_Dbg(p_filter, ">>> %s", __func__);
++#endif
++}
+
+- vlc_sem_init(&sys->sem, 0);
+
+-out:
+- if (ret != VLC_SUCCESS)
+- Close(filter);
++static void pass_flush(filter_t *p_filter)
++{
++    // Passthrough variant of flush: there is no state to reset
++    (void) p_filter;
++}
+
+- return ret;
++static picture_t * pass_deinterlace(filter_t * p_filter, picture_t * p_pic)
++{
++    // Passthrough variant of the filter: no processing, the same picture
++    // is handed back with its progressive flag set
++    (void) p_filter;
++    p_pic->b_progressive = true;
++    return p_pic;
+ }
+
+-static void Close(filter_t *filter)
++
++static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) // Control port callback: logs MMAL error events, then recycles the buffer
+ {
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ filter_t *filter = (filter_t *)port->userdata;
++ MMAL_STATUS_T status;
+
+- if (!sys)
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data; // the event payload is read as the status code
++ msg_Err(filter, "MMAL error %"PRIx32" \"%s\"", status,
++ mmal_status_to_string(status));
++ }
++
++ mmal_buffer_header_reset(buffer); // every buffer, error event or not, is reset and released
++ mmal_buffer_header_release(buffer);
++}
++
++static void CloseMmalDeinterlace(filter_t *filter) // Close: flush, disable ports/component, free pools, queue, component and sys
++{
++ filter_sys_t * const sys = filter->p_sys;
++
++#if TRACE_ALL
++ msg_Dbg(filter, "<<< %s", __func__);
++#endif
++
++ if (sys == NULL)
+ return;
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ if (sys->use_passthrough) // passthrough: only sys itself is freed here
++ {
++ free(sys);
++ return;
++ }
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ di_flush(filter); // disables the input; leaves the output enabled if it was enabled
+
+- if (sys->output && sys->output->is_enabled)
+- mmal_port_disable(sys->output);
++ if (sys->component && sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
+
+ if (sys->component && sys->component->is_enabled)
+ mmal_component_disable(sys->component);
+
+- while ((buffer = mmal_queue_get(sys->filtered_pictures))) {
+- picture_t *pic = (picture_t *)buffer->user_data;
+- picture_Release(pic);
++ if (sys->in_pool != NULL)
++ mmal_pool_destroy(sys->in_pool);
++
++ hw_mmal_port_pool_ref_release(sys->out_ppr, false); // NOTE(review): called unconditionally - confirm the helper accepts a NULL out_ppr
++ // Once we exit filter & sys are invalid so mark as such
++ if (sys->output != NULL)
++ sys->output->userdata = NULL;
++
++ if (sys->is_cma)
++ {
++ if (sys->output && sys->output->is_enabled)
++ mmal_port_disable(sys->output);
++
++ cma_buf_pool_deletez(&sys->cma_out_pool);
++
++ if (sys->out_pool != NULL)
++ mmal_pool_destroy(sys->out_pool);
+ }
+
+- if (sys->filtered_pictures)
+- mmal_queue_destroy(sys->filtered_pictures);
++ if (sys->out_q != NULL)
++ mmal_queue_destroy(sys->out_q);
+
+ if (sys->component)
+ mmal_component_release(sys->component);
+
+- vlc_sem_destroy(&sys->sem);
++ cma_vcsm_exit(sys->vcsm_init_type); // pairs with the cma_vcsm_init() done at open
++
+ free(sys);
++}
++
+
+- bcm_host_deinit();
++static bool is_fmt_valid_in(const vlc_fourcc_t fmt)
++{
++ return fmt == VLC_CODEC_MMAL_OPAQUE || // the only input chromas OpenMmalDeinterlace accepts
++ fmt == VLC_CODEC_MMAL_ZC_I420 ||
++ fmt == VLC_CODEC_MMAL_ZC_SAND8;
+ }
+
+-static int send_output_buffer(filter_t *filter)
++static int OpenMmalDeinterlace(filter_t *filter)
+ {
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ int32_t frame_duration = filter->fmt_in.video.i_frame_rate != 0 ?
++ CLOCK_FREQ * filter->fmt_in.video.i_frame_rate_base /
++ filter->fmt_in.video.i_frame_rate : 0; // 0 when i_frame_rate is 0
++ // Module open callback: set up the MMAL deinterlace component, or fall back to passthrough
++ int ret = VLC_EGENERIC;
+ MMAL_STATUS_T status;
+- picture_t *picture;
+- int ret = 0;
++ filter_sys_t *sys;
++
++ msg_Dbg(filter, "<<< %s", __func__);
++ // Only chromas accepted by is_fmt_valid_in are handled, and output chroma must equal input
++ if (!is_fmt_valid_in(filter->fmt_in.video.i_chroma) ||
++ filter->fmt_out.video.i_chroma != filter->fmt_in.video.i_chroma)
++ return VLC_EGENERIC;
+
+- if (!sys->output->is_enabled) {
+- ret = VLC_EGENERIC;
+- goto out;
++ sys = calloc(1, sizeof(filter_sys_t));
++ if (!sys)
++ return VLC_ENOMEM;
++ filter->p_sys = sys;
++
++ sys->seq_in = 1;
++ sys->seq_out = 15;
++ sys->is_cma = is_cma_buf_pic_chroma(filter->fmt_out.video.i_chroma);
++
++ if ((sys->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE) {
++ msg_Err(filter, "VCSM init failed");
++ goto fail;
++ }
++ // Model-dependent defaults; the module options read further down can override each of them
++ if (rpi_is_model_pi4())
++ {
++ sys->half_rate = true;
++ sys->use_qpu = false;
++ sys->use_fast = true;
++ }
++ else
++ {
++ sys->half_rate = false;
++ sys->use_qpu = true;
++ sys->use_fast = false;
++ }
++ sys->use_passthrough = false;
++
++ if (filter->fmt_in.video.i_width * filter->fmt_in.video.i_height > 768 * 576)
++ {
++ // We get stressed if we have to try too hard - so make life easier
++ sys->half_rate = true;
++ // Also check we actually have enough memory to do this
++ // Memory always comes from GPU if Opaque
++ // Assume we have plenty of memory if it comes from CMA
++ if ((!sys->is_cma || sys->vcsm_init_type == VCSM_INIT_LEGACY) &&
++ hw_mmal_get_gpu_mem() < (96 << 20))
++ {
++ sys->use_passthrough = true;
++ msg_Warn(filter, "Deinterlace bypassed due to lack of GPU memory");
++ }
+ }
+
+- picture = filter_NewPicture(filter);
+- if (!picture) {
+- msg_Warn(filter, "Failed to get new picture");
+- ret = -1;
+- goto out;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_NO_QPU))
++ sys->use_qpu = false;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_ADV))
++ {
++ sys->use_fast = false;
++ sys->use_passthrough = false;
++ }
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_FAST))
++ {
++ sys->use_fast = true;
++ sys->use_passthrough = false;
++ }
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_NONE))
++ sys->use_passthrough = true;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_FULL_RATE))
++ sys->half_rate = false;
++ if (var_InheritBool(filter, MMAL_DEINTERLACE_HALF_RATE))
++ sys->half_rate = true;
++ // Passthrough installs the pass_* callbacks and returns before any MMAL component is created
++ if (sys->use_passthrough)
++ {
++ filter->pf_video_filter = pass_deinterlace;
++ filter->pf_flush = pass_flush;
++ // Don't need VCSM - get rid of it now
++ cma_vcsm_exit(sys->vcsm_init_type);
++ sys->vcsm_init_type = VCSM_INIT_NONE;
++ return 0;
++ }
++ // Log the input/output formats and the selected processing mode
++ {
++ char dbuf0[5], dbuf1[5];
++ msg_Dbg(filter, "%s: %s,%dx%d [(%d,%d) %d/%d] -> %s,%dx%d [(%d,%d) %dx%d]: %s %s %s", __func__,
++ str_fourcc(dbuf0, filter->fmt_in.video.i_chroma),
++ filter->fmt_in.video.i_width, filter->fmt_in.video.i_height,
++ filter->fmt_in.video.i_x_offset, filter->fmt_in.video.i_y_offset,
++ filter->fmt_in.video.i_visible_width, filter->fmt_in.video.i_visible_height,
++ str_fourcc(dbuf1, filter->fmt_out.video.i_chroma),
++ filter->fmt_out.video.i_width, filter->fmt_out.video.i_height,
++ filter->fmt_out.video.i_x_offset, filter->fmt_out.video.i_y_offset,
++ filter->fmt_out.video.i_visible_width, filter->fmt_out.video.i_visible_height,
++ sys->use_qpu ? "QPU" : "VPU",
++ sys->use_fast ? "FAST" : "ADV",
++ sys->use_passthrough ? "PASS" : sys->half_rate ? "HALF" : "FULL");
+ }
+- picture->format.i_frame_rate = filter->fmt_out.video.i_frame_rate;
+- picture->format.i_frame_rate_base = filter->fmt_out.video.i_frame_rate_base;
+
+- buffer = picture->p_sys->buffer;
+- buffer->user_data = picture;
+- buffer->cmd = 0;
++ status = mmal_component_create(MMAL_COMPONENT_DEFAULT_DEINTERLACE, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- mmal_picture_lock(picture);
++ {
++ const MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {
++ { MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param) },
++ sys->use_fast ?
++ MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST :
++ MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV,
++ 4,
++ { 5 /* Frame type: mixed */, frame_duration, sys->half_rate, sys->use_qpu }
++ };
+
+- status = mmal_port_send_buffer(sys->output, buffer);
++ status = mmal_port_parameter_set(sys->component->output[0], &imfx_param.hdr);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to configure MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_DEINTERLACE, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ }
++ // Control port events arrive in control_port_cb, which reads the filter back from userdata
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++ status = mmal_port_enable(sys->component->control, control_port_cb);
+ if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to send buffer to output port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- mmal_buffer_header_release(buffer);
+- picture_Release(picture);
+- ret = -1;
+- } else {
+- atomic_fetch_add(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
++ msg_Err(filter, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
+ }
+
+-out:
+- return ret;
+-}
++ sys->input = sys->component->input[0];
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++ sys->input->format->encoding = vlc_to_mmal_video_fourcc(&filter->fmt_in.video);
++ hw_mmal_vlc_fmt_to_mmal_fmt(sys->input->format, &filter->fmt_in.video);
+
+-static void fill_output_port(filter_t *filter)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- /* allow at least 2 buffers in transit */
+- unsigned max_buffers_in_transit = __MAX(2, MIN_NUM_BUFFERS_IN_TRANSIT);
+- int buffers_available = sys->output->buffer_num -
+- atomic_load(&sys->output_in_transit) -
+- mmal_queue_length(sys->filtered_pictures);
+- int buffers_to_send = max_buffers_in_transit - sys->output_in_transit;
+- int i;
++ es_format_Copy(&filter->fmt_out, &filter->fmt_in);
++ if (!sys->half_rate) // full rate emits twice as many frames as it receives
++ filter->fmt_out.video.i_frame_rate *= 2;
+
+- if (buffers_to_send > buffers_available)
+- buffers_to_send = buffers_available;
++ status = mmal_port_format_commit(sys->input);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++ sys->input->buffer_num = 30;
++// sys->input->buffer_num = sys->input->buffer_num_recommended;
+
+-#ifndef NDEBUG
+- msg_Dbg(filter, "Send %d buffers to output port (available: %d, in_transit: %d, buffer_num: %d)",
+- buffers_to_send, buffers_available, sys->output_in_transit,
+- sys->output->buffer_num);
+-#endif
+- for (i = 0; i < buffers_to_send; ++i) {
+- if (send_output_buffer(filter) < 0)
+- break;
++ if ((sys->in_pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(filter, "Failed to create input pool");
++ goto fail;
+ }
+-}
+
+-static picture_t *deinterlace(filter_t *filter, picture_t *picture)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
+- picture_t *out_picture = NULL;
+- picture_t *ret = NULL;
+- MMAL_STATUS_T status;
+- unsigned i = 0;
++ status = port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, true);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- fill_output_port(filter);
++ status = mmal_port_enable(sys->input, di_input_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
+
+- buffer = picture->p_sys->buffer;
+- buffer->user_data = picture;
+- buffer->pts = picture->date;
+- buffer->cmd = 0;
+
+- if (!picture->p_sys->displayed) {
+- status = mmal_port_send_buffer(sys->input, buffer);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(filter, "Failed to send buffer to input port (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- picture_Release(picture);
+- } else {
+- picture->p_sys->displayed = true;
+- atomic_fetch_add(&sys->input_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- }
+- } else {
+- picture_Release(picture);
+- }
+-
+- /*
+- * Send output buffers
+- */
+- while(atomic_load(&sys->started) && i < 2) {
+- if (buffer = mmal_queue_timedwait(sys->filtered_pictures, 2000)) {
+- i++;
+- if (!out_picture) {
+- out_picture = (picture_t *)buffer->user_data;
+- ret = out_picture;
+- } else {
+- out_picture->p_next = (picture_t *)buffer->user_data;
+- out_picture = out_picture->p_next;
+- }
+- out_picture->date = buffer->pts;
+- } else {
+- msg_Dbg(filter, "Failed waiting for filtered picture");
+- break;
+- }
++ if ((sys->out_q = mmal_queue_create()) == NULL)
++ {
++ msg_Err(filter, "Failed to create out Q");
++ goto fail;
+ }
+- if (out_picture)
+- out_picture->p_next = NULL;
+
+- return ret;
+-}
+-
+-static void flush(filter_t *filter)
+-{
+- filter_sys_t *sys = filter->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer;
++ sys->output = sys->component->output[0];
++ mmal_format_full_copy(sys->output->format, sys->input->format);
+
+- msg_Dbg(filter, "flush deinterlace filter");
++ if (!sys->is_cma)
++ {
++ if ((status = hw_mmal_opaque_output(VLC_OBJECT(filter), &sys->out_ppr, sys->output, 5, di_output_port_cb)) != MMAL_SUCCESS)
++ goto fail;
++ }
++ else
++ {
++ // CMA stuff
++ sys->output->userdata = (struct MMAL_PORT_USERDATA_T *)filter;
++
++ if ((sys->cma_out_pool = cma_buf_pool_new(8, 8, true, "deinterlace")) == NULL)
++ {
++ msg_Err(filter, "Failed to alloc cma buf pool");
++ goto fail;
++ }
+
+- msg_Dbg(filter, "flush: flush ports (input: %d, output: %d in transit)",
+- sys->input_in_transit, sys->output_in_transit);
+- mmal_port_flush(sys->output);
+- mmal_port_flush(sys->input);
+-
+- msg_Dbg(filter, "flush: wait for all buffers to be returned");
+- while (atomic_load(&sys->input_in_transit) ||
+- atomic_load(&sys->output_in_transit))
+- vlc_sem_wait(&sys->sem);
+-
+- while ((buffer = mmal_queue_get(sys->filtered_pictures))) {
+- picture_t *pic = (picture_t *)buffer->user_data;
+- msg_Dbg(filter, "flush: release already filtered pic %p",
+- (void *)pic);
+- picture_Release(pic);
+- }
+- atomic_store(&sys->started, false);
+- msg_Dbg(filter, "flush: done");
+-}
++ // Rate control done by CMA in flight logic, so have "inexhaustible" pool here
++ if ((sys->out_pool = mmal_pool_create(30, 0)) == NULL)
++ {
++ msg_Err(filter, "Failed to alloc out pool");
++ goto fail;
++ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- filter_t *filter = (filter_t *)port->userdata;
+- MMAL_STATUS_T status;
++ port_parameter_set_bool(sys->output, MMAL_PARAMETER_ZERO_COPY, true);
+
+- if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(filter, "MMAL error %"PRIx32" \"%s\"", status,
+- mmal_status_to_string(status));
+- }
++ if ((status = mmal_port_format_commit(sys->output)) != MMAL_SUCCESS)
++ {
++ msg_Err(filter, "Output port format commit failed");
++ goto fail;
++ }
+
+- mmal_buffer_header_release(buffer);
+-}
++ sys->output->buffer_num = 30;
++ sys->output->buffer_size = sys->output->buffer_size_recommended;
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- picture_t *picture = (picture_t *)buffer->user_data;
+- filter_t *filter = (filter_t *)port->userdata;
+- filter_sys_t *sys = filter->p_sys;
++ // CB just drops all bufs into out_q
++ if ((status = mmal_port_enable(sys->output, di_output_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(filter, "Failed to enable output port %s (status=%"PRIx32" %s)",
++ sys->output->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ }
+
+- if (picture) {
+- picture_Release(picture);
+- } else {
+- msg_Warn(filter, "Got buffer without picture on input port - OOOPS");
+- mmal_buffer_header_release(buffer);
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(filter, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
+ }
+
+- atomic_fetch_sub(&sys->input_in_transit, 1);
+- vlc_sem_post(&sys->sem);
++ filter->pf_video_filter = deinterlace;
++ filter->pf_flush = di_flush;
++ return 0;
++
++fail:
++ CloseMmalDeinterlace(filter);
++ return ret;
+ }
+
+-static void output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
+-{
+- filter_t *filter = (filter_t *)port->userdata;
+- filter_sys_t *sys = filter->p_sys;
+- picture_t *picture;
++vlc_module_begin()
++ set_shortname(N_("MMAL deinterlace"))
++ set_description(N_("MMAL-based deinterlace filter plugin"))
++ set_capability("video filter", 900)
++ set_category(CAT_VIDEO)
++ set_subcategory(SUBCAT_VIDEO_VFILTER)
++ set_callbacks(OpenMmalDeinterlace, CloseMmalDeinterlace)
++ add_shortcut("deinterlace")
++ add_bool(MMAL_DEINTERLACE_NO_QPU, false, MMAL_DEINTERLACE_NO_QPU_TEXT,
++ MMAL_DEINTERLACE_NO_QPU_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_ADV, false, MMAL_DEINTERLACE_ADV_TEXT,
++ MMAL_DEINTERLACE_ADV_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_FAST, false, MMAL_DEINTERLACE_FAST_TEXT,
++ MMAL_DEINTERLACE_FAST_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_NONE, false, MMAL_DEINTERLACE_NONE_TEXT,
++ MMAL_DEINTERLACE_NONE_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_HALF_RATE, false, MMAL_DEINTERLACE_HALF_RATE_TEXT,
++ MMAL_DEINTERLACE_HALF_RATE_LONGTEXT, true);
++ add_bool(MMAL_DEINTERLACE_FULL_RATE, false, MMAL_DEINTERLACE_FULL_RATE_TEXT,
++ MMAL_DEINTERLACE_FULL_RATE_LONGTEXT, true);
++ // All six booleans default to false; OpenMmalDeinterlace reads them with var_InheritBool
++vlc_module_end()
++
+
+- if (buffer->cmd == 0) {
+- if (buffer->length > 0) {
+- atomic_store(&sys->started, true);
+- mmal_queue_put(sys->filtered_pictures, buffer);
+- picture = (picture_t *)buffer->user_data;
+- } else {
+- picture = (picture_t *)buffer->user_data;
+- picture_Release(picture);
+- }
+-
+- atomic_fetch_sub(&sys->output_in_transit, 1);
+- vlc_sem_post(&sys->sem);
+- } else if (buffer->cmd == MMAL_EVENT_FORMAT_CHANGED) {
+- msg_Warn(filter, "MMAL_EVENT_FORMAT_CHANGED seen but not handled");
+- mmal_buffer_header_release(buffer);
+- } else {
+- mmal_buffer_header_release(buffer);
+- }
+-}
+--- /dev/null
++++ b/modules/hw/mmal/mmal_avcodec.c
+@@ -0,0 +1,2175 @@
++/*****************************************************************************
++ * mmal_avcodec.c: video decoder using the libavcodec library
++ *****************************************************************************
++ * Copyright (C) 1999-2001 VLC authors and VideoLAN
++ * $Id$
++ *
++ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
++ * Gildas Bazin <gbazin@videolan.org>
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU Lesser General Public License as published by
++ * the Free Software Foundation; either version 2.1 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public License
++ * along with this program; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
++ *****************************************************************************/
++
++/*****************************************************************************
++ * Preamble
++ *****************************************************************************/
++#include "config.h"
++
++#include <vlc_common.h>
++#include <vlc_codec.h>
++#include <vlc_avcodec.h>
++#include <vlc_cpu.h>
++#include <vlc_atomic.h>
++#include <assert.h>
++
++#include <libavcodec/avcodec.h>
++#include <libavutil/mem.h>
++#include <libavutil/pixdesc.h>
++#if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 16, 101 ) )
++#include <libavutil/mastering_display_metadata.h>
++#endif
++
++//#include "avcodec.h"
++//#include "va.h"
++
++#include <vlc_plugin.h>
++#include <libavutil/rpi_sand_fns.h>
++#include <libavcodec/rpi_zc.h>
++#include "../../codec/cc.h"
++#include "../../codec/avcodec/avcommon.h" // ??? Beware over inclusion
++#include "mmal_cma.h"
++#include "mmal_picture.h"
++
++#define TRACE_ALL 0
++
++#define BUFFERS_IN_FLIGHT 5 // Default max value for in flight buffers
++#define BUFFERS_IN_FLIGHT_UHD 3 // Fewer if very big
++
++#define MMAL_AVCODEC_BUFFERS "mmal-avcodec-buffers"
++#define MMAL_AVCODEC_BUFFERS_TEXT N_("In flight buffer count before blocking.")
++#define MMAL_AVCODEC_BUFFERS_LONGTEXT N_("In flight buffer count before blocking. " \
++"Beware that incautious changing of this can lead to lockup. " \
++"Zero will disable the module.")
++
++
++// Fwd declarations required due to wanting to avoid reworking the original
++// code too much
++static void MmalAvcodecCloseDecoder( vlc_object_t *obj );
++
++
++/*****************************************************************************
++ * decoder_sys_t : decoder descriptor
++ *****************************************************************************/
++struct decoder_sys_t
++{
++ AVCodecContext *p_context; // its ->opaque is passed to av_rpi_zc_get_decoder_pool_size in zc_alloc_buf
++ const AVCodec *p_codec;
++
++ /* Video decoder specific part */
++ date_t pts;
++
++ /* Closed captions for decoders */
++ cc_data_t cc;
++
++ /* for frame skipping algo */
++ bool b_hurry_up;
++ bool b_show_corrupted;
++ bool b_from_preroll;
++ enum AVDiscard i_skip_frame;
++
++ /* how many decoded frames are late */
++ int i_late_frames;
++ mtime_t i_late_frames_start;
++ mtime_t i_last_late_delay;
++
++ /* for direct rendering */
++ bool b_direct_rendering;
++ atomic_bool b_dr_failure;
++
++ /* Hack to force display of still pictures */
++ bool b_first_frame;
++
++
++ /* NOTE(review): presumably set once a palette has been passed on - confirm */
++ bool palette_sent;
++
++ /* VA API */
++// vlc_va_t *p_va;
++ enum PixelFormat pix_fmt;
++ int profile;
++ int level;
++
++ vlc_sem_t sem_mt;
++
++ // Rpi vars
++ cma_buf_pool_t * cma_pool; // pool zc_alloc_buf allocates from; deleted by zc_free_pool
++ bool pool_alloc_1; // set by zc_alloc_buf once it has attempted the initial pool fill
++ vcsm_init_type_t vcsm_init_type;
++ int cma_in_flight_max; // in-flight allowance added to the decoder's pool request in zc_alloc_buf
++ // Debug
++ decoder_t * p_dec;
++};
++
++
++static vlc_fourcc_t
++ZcFindVlcChroma(const int i_ffmpeg_chroma)
++{
++    // Map an ffmpeg pixel format onto the matching MMAL zero-copy chroma.
++    // Only the formats tested below are claimed.
++    // In theory RGB should be doable within our current framework
++    vlc_fourcc_t chroma = 0;
++
++    if (i_ffmpeg_chroma == AV_PIX_FMT_YUV420P)
++        chroma = VLC_CODEC_MMAL_ZC_I420;
++    else if (i_ffmpeg_chroma == AV_PIX_FMT_SAND128 ||
++             i_ffmpeg_chroma == AV_PIX_FMT_RPI4_8)
++        chroma = VLC_CODEC_MMAL_ZC_SAND8;
++    else if (i_ffmpeg_chroma == AV_PIX_FMT_SAND64_10)
++        chroma = VLC_CODEC_MMAL_ZC_SAND10;
++    else if (i_ffmpeg_chroma == AV_PIX_FMT_RPI4_10)
++        chroma = VLC_CODEC_MMAL_ZC_SAND30;
++
++    // Still 0 here means the format is not one we handle
++    return chroma;
++}
++
++// Pixel format conversion for MMAL: fill a video_format_t from an ffmpeg pix_fmt.
++// Returns 0 on success, -1 when the pixel format has no MMAL zero-copy chroma.
++static int
++ZcGetVlcChroma( video_format_t *fmt, int i_ffmpeg_chroma )
++{
++    const vlc_fourcc_t chroma = ZcFindVlcChroma(i_ffmpeg_chroma);
++    fmt->i_chroma = chroma;
++    fmt->i_rmask = fmt->i_gmask = fmt->i_bmask = 0;
++    if (chroma == 0)
++        return -1;
++    return 0;
++}
++
++
++// Format chooser is way simpler than vlc
++static enum PixelFormat
++ZcGetFormat(AVCodecContext *p_context, const enum PixelFormat *pi_fmt)
++{
++    // Ask ffmpeg for its default choice up front; it is returned if nothing below matches
++    const enum PixelFormat fallback = avcodec_default_get_format(p_context, pi_fmt);
++    // The first offered format that maps to an MMAL zero-copy chroma wins
++    for (const enum PixelFormat *p = pi_fmt; *p != AV_PIX_FMT_NONE; ++p)
++        if (ZcFindVlcChroma(*p) != 0)
++            return *p;
++    return fallback;
++}
++
++// Thin adapters: each forwards the opaque buffer pointer v to the matching cma_buf_* call
++static void cma_avbuf_pool_free(void * v)
++{
++ cma_buf_unref(v);
++}
++
++static unsigned int zc_buf_vcsm_handle(void * v)
++{
++ return cma_buf_vcsm_handle(v);
++}
++
++static unsigned int zc_buf_vc_handle(void * v)
++{
++ return cma_buf_vc_handle(v);
++}
++
++static void * zc_buf_map_arm(void * v)
++{
++ return cma_buf_addr(v);
++}
++
++static unsigned int zc_buf_map_vc(void * v)
++{
++ return cma_buf_vc_addr(v);
++}
++
++
++// Callback table passed to av_rpi_zc_buf() by zc_alloc_buf, alongside the CMA buffer pointer
++static const av_rpi_zc_buf_fn_tab_t zc_buf_fn_tab = {
++ .free = cma_avbuf_pool_free,
++
++ .vcsm_handle = zc_buf_vcsm_handle,
++ .vc_handle = zc_buf_vc_handle,
++ .map_arm = zc_buf_map_arm,
++ .map_vc = zc_buf_map_vc
++};
++
++// Allocate a CMA-backed AVBufferRef for a request of `size` bytes; v is the decoder_t. NULL on failure.
++static AVBufferRef *
++zc_alloc_buf(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++ decoder_t * const dec = v;
++ decoder_sys_t * const sys = dec->p_sys;
++
++ VLC_UNUSED(geo);
++
++ assert(sys != NULL);
++ // A non-zero decoder pool request resizes the pool to decoder frames plus our in-flight allowance
++ const unsigned int dec_pool_req = av_rpi_zc_get_decoder_pool_size(sys->p_context->opaque);
++ if (dec_pool_req != 0)
++ {
++ cma_buf_pool_resize(sys->cma_pool, dec_pool_req + sys->cma_in_flight_max, sys->cma_in_flight_max);
++
++ if (!sys->pool_alloc_1) // prefill is attempted once only; failure is just a warning
++ {
++ sys->pool_alloc_1 = true;
++ msg_Dbg(dec, "Pool size: (%u+%d) * %zu", dec_pool_req, sys->cma_in_flight_max, size);
++ if (cma_buf_pool_fill(sys->cma_pool, size) != 0)
++ msg_Warn(dec, "Failed to preallocate decoder pool (%u+%d) * %zu", dec_pool_req, sys->cma_in_flight_max, size);
++ }
++ }
++
++ void * const cmabuf = cma_buf_pool_alloc_buf(sys->cma_pool, size);
++
++ if (cmabuf == NULL)
++ {
++ msg_Err(dec, "CMA buf pool alloc buf failed");
++ return NULL;
++ }
++ // Wrap the CMA buffer; zc_buf_fn_tab supplies the free/handle/map callbacks for it
++ AVBufferRef *const avbuf = av_rpi_zc_buf(cma_buf_size(cmabuf), 0, cmabuf, &zc_buf_fn_tab);
++
++ if (avbuf == NULL)
++ {
++ msg_Err(dec, "av_rpi_zc_buf failed");
++ cma_buf_unref(cmabuf); // drop the reference we hold since no AVBufferRef owns it
++ return NULL;
++ }
++
++ return avbuf;
++}
++// Delete the decoder's CMA buffer pool; v is the decoder_t
++static void
++zc_free_pool(void * v)
++{
++ decoder_t * const dec = v;
++ cma_buf_pool_delete(dec->p_sys->cma_pool);
++}
++
++// Per-plane tables for set_pic_from_frame: shift_01 = right shift of visible size, pb_* = pixel pitch
++static const uint8_t shift_01[] = {0,1,1,1};
++static const uint8_t pb_1[] = {1,1,1,1};
++static const uint8_t pb_12[] = {1,2,2,2};
++static const uint8_t pb_24[] = {2,4,4,4};
++static const uint8_t pb_4[] = {4,4,4,4};
++// Fill pic->p[] from frame's plane layout, based at pic's CMA buffer; returns 0 or a VLC_E* code
++static int set_pic_from_frame(picture_t * const pic, const AVFrame * const frame)
++{
++ const uint8_t * hs = shift_01;
++ const uint8_t * ws = shift_01;
++ const uint8_t * pb = pb_1;
++ // Plane count and pixel pitch depend on the chroma; the shifts stay at the shift_01 default
++ switch (pic->format.i_chroma)
++ {
++ case VLC_CODEC_MMAL_ZC_RGB32:
++ pic->i_planes = 1;
++ pb = pb_4;
++ break;
++ case VLC_CODEC_MMAL_ZC_I420:
++ pic->i_planes = 3;
++ break;
++ case VLC_CODEC_MMAL_ZC_SAND8:
++ pic->i_planes = 2;
++ pb = pb_12;
++ break;
++ case VLC_CODEC_MMAL_ZC_SAND10:
++ case VLC_CODEC_MMAL_ZC_SAND30: // Lies: SAND30 is "special"
++ pic->i_planes = 2;
++ pb = pb_24;
++ break;
++ default:
++ return VLC_EGENERIC;
++ }
++
++ const cma_buf_t * const cb = cma_buf_pic_get(pic);
++ uint8_t * const data = cma_buf_addr(cb);
++ if (data == NULL) {
++ return VLC_ENOMEM;
++ }
++ // frame_end stands in for the "next plane" pointer when sizing the last plane
++ uint8_t * frame_end = frame->data[0] + cma_buf_size(cb);
++ for (int i = 0; i != pic->i_planes; ++i) {
++ // Calculate lines from gap between planes
++ // This will give us an accurate "height" for later use by MMAL
++ const int lines = ((i + 1 == pic->i_planes ? frame_end : frame->data[i + 1]) -
++ frame->data[i]) / frame->linesize[i];
++ pic->p[i] = (plane_t){
++ .p_pixels = data + (frame->data[i] - frame->data[0]), // same offset from data as the plane has from frame->data[0]
++ .i_lines = lines,
++ .i_pitch = frame->linesize[i],
++ .i_pixel_pitch = pb[i],
++ .i_visible_lines = av_frame_cropped_height(frame) >> hs[i],
++ .i_visible_pitch = av_frame_cropped_width(frame) >> ws[i]
++ };
++ }
++ return 0;
++}
++
++
++//============================================================================
++//
++// Nicked from avcodec/fourcc.c
++//
++// * Really we should probably use that directly
++
++/*
++ * Video Codecs
++ */
++
++struct vlc_avcodec_fourcc
++{
++ vlc_fourcc_t i_fourcc; // VLC codec identifier (VLC_CODEC_* in the tables using this struct)
++ unsigned i_codec; // libavcodec identifier (AV_CODEC_ID_* in the tables using this struct)
++};
++
++
++static const struct vlc_avcodec_fourcc video_codecs[] =
++{
++ { VLC_CODEC_MP1V, AV_CODEC_ID_MPEG1VIDEO },
++ { VLC_CODEC_MP2V, AV_CODEC_ID_MPEG2VIDEO }, /* prefer MPEG2 over MPEG1 */
++ { VLC_CODEC_MPGV, AV_CODEC_ID_MPEG2VIDEO }, /* prefer MPEG2 over MPEG1 */
++ /* AV_CODEC_ID_MPEG2VIDEO_XVMC */
++ { VLC_CODEC_H261, AV_CODEC_ID_H261 },
++ { VLC_CODEC_H263, AV_CODEC_ID_H263 },
++ { VLC_CODEC_RV10, AV_CODEC_ID_RV10 },
++ { VLC_CODEC_RV13, AV_CODEC_ID_RV10 },
++ { VLC_CODEC_RV20, AV_CODEC_ID_RV20 },
++ { VLC_CODEC_MJPG, AV_CODEC_ID_MJPEG },
++ { VLC_CODEC_MJPGB, AV_CODEC_ID_MJPEGB },
++ { VLC_CODEC_LJPG, AV_CODEC_ID_LJPEG },
++ { VLC_CODEC_SP5X, AV_CODEC_ID_SP5X },
++ { VLC_CODEC_JPEGLS, AV_CODEC_ID_JPEGLS },
++ { VLC_CODEC_MP4V, AV_CODEC_ID_MPEG4 },
++ /* AV_CODEC_ID_RAWVIDEO */
++ { VLC_CODEC_DIV1, AV_CODEC_ID_MSMPEG4V1 },
++ { VLC_CODEC_DIV2, AV_CODEC_ID_MSMPEG4V2 },
++ { VLC_CODEC_DIV3, AV_CODEC_ID_MSMPEG4V3 },
++ { VLC_CODEC_WMV1, AV_CODEC_ID_WMV1 },
++ { VLC_CODEC_WMV2, AV_CODEC_ID_WMV2 },
++ { VLC_CODEC_H263P, AV_CODEC_ID_H263P },
++ { VLC_CODEC_H263I, AV_CODEC_ID_H263I },
++ { VLC_CODEC_FLV1, AV_CODEC_ID_FLV1 },
++ { VLC_CODEC_SVQ1, AV_CODEC_ID_SVQ1 },
++ { VLC_CODEC_SVQ3, AV_CODEC_ID_SVQ3 },
++ { VLC_CODEC_DV, AV_CODEC_ID_DVVIDEO },
++ { VLC_CODEC_HUFFYUV, AV_CODEC_ID_HUFFYUV },
++ { VLC_CODEC_CYUV, AV_CODEC_ID_CYUV },
++ { VLC_CODEC_H264, AV_CODEC_ID_H264 },
++ { VLC_CODEC_INDEO3, AV_CODEC_ID_INDEO3 },
++ { VLC_CODEC_VP3, AV_CODEC_ID_VP3 },
++ { VLC_CODEC_THEORA, AV_CODEC_ID_THEORA },
++#if ( !defined( WORDS_BIGENDIAN ) )
++ /* Asus Video (Another thing that doesn't work on PPC) */
++ { VLC_CODEC_ASV1, AV_CODEC_ID_ASV1 },
++ { VLC_CODEC_ASV2, AV_CODEC_ID_ASV2 },
++#endif
++ { VLC_CODEC_FFV1, AV_CODEC_ID_FFV1 },
++ { VLC_CODEC_4XM, AV_CODEC_ID_4XM },
++ { VLC_CODEC_VCR1, AV_CODEC_ID_VCR1 },
++ { VLC_CODEC_CLJR, AV_CODEC_ID_CLJR },
++ { VLC_CODEC_MDEC, AV_CODEC_ID_MDEC },
++ { VLC_CODEC_ROQ, AV_CODEC_ID_ROQ },
++ { VLC_CODEC_INTERPLAY, AV_CODEC_ID_INTERPLAY_VIDEO },
++ { VLC_CODEC_XAN_WC3, AV_CODEC_ID_XAN_WC3 },
++ { VLC_CODEC_XAN_WC4, AV_CODEC_ID_XAN_WC4 },
++ { VLC_CODEC_RPZA, AV_CODEC_ID_RPZA },
++ { VLC_CODEC_CINEPAK, AV_CODEC_ID_CINEPAK },
++ { VLC_CODEC_WS_VQA, AV_CODEC_ID_WS_VQA },
++ { VLC_CODEC_MSRLE, AV_CODEC_ID_MSRLE },
++ { VLC_CODEC_MSVIDEO1, AV_CODEC_ID_MSVIDEO1 },
++ { VLC_CODEC_IDCIN, AV_CODEC_ID_IDCIN },
++ { VLC_CODEC_8BPS, AV_CODEC_ID_8BPS },
++ { VLC_CODEC_SMC, AV_CODEC_ID_SMC },
++ { VLC_CODEC_FLIC, AV_CODEC_ID_FLIC },
++ { VLC_CODEC_TRUEMOTION1, AV_CODEC_ID_TRUEMOTION1 },
++ { VLC_CODEC_VMDVIDEO, AV_CODEC_ID_VMDVIDEO },
++ { VLC_CODEC_LCL_MSZH, AV_CODEC_ID_MSZH },
++ { VLC_CODEC_LCL_ZLIB, AV_CODEC_ID_ZLIB },
++ { VLC_CODEC_QTRLE, AV_CODEC_ID_QTRLE },
++ { VLC_CODEC_TSCC, AV_CODEC_ID_TSCC },
++ { VLC_CODEC_ULTI, AV_CODEC_ID_ULTI },
++ { VLC_CODEC_QDRAW, AV_CODEC_ID_QDRAW },
++ { VLC_CODEC_VIXL, AV_CODEC_ID_VIXL },
++ { VLC_CODEC_QPEG, AV_CODEC_ID_QPEG },
++ { VLC_CODEC_PNG, AV_CODEC_ID_PNG },
++ { VLC_CODEC_PPM, AV_CODEC_ID_PPM },
++ /* AV_CODEC_ID_PBM */
++ { VLC_CODEC_PGM, AV_CODEC_ID_PGM },
++ { VLC_CODEC_PGMYUV, AV_CODEC_ID_PGMYUV },
++ { VLC_CODEC_PAM, AV_CODEC_ID_PAM },
++ { VLC_CODEC_FFVHUFF, AV_CODEC_ID_FFVHUFF },
++ { VLC_CODEC_RV30, AV_CODEC_ID_RV30 },
++ { VLC_CODEC_RV40, AV_CODEC_ID_RV40 },
++ { VLC_CODEC_VC1, AV_CODEC_ID_VC1 },
++ { VLC_CODEC_WMVA, AV_CODEC_ID_VC1 },
++ { VLC_CODEC_WMV3, AV_CODEC_ID_WMV3 },
++ { VLC_CODEC_WMVP, AV_CODEC_ID_WMV3 },
++ { VLC_CODEC_LOCO, AV_CODEC_ID_LOCO },
++ { VLC_CODEC_WNV1, AV_CODEC_ID_WNV1 },
++ { VLC_CODEC_AASC, AV_CODEC_ID_AASC },
++ { VLC_CODEC_INDEO2, AV_CODEC_ID_INDEO2 },
++ { VLC_CODEC_FRAPS, AV_CODEC_ID_FRAPS },
++ { VLC_CODEC_TRUEMOTION2, AV_CODEC_ID_TRUEMOTION2 },
++ { VLC_CODEC_BMP, AV_CODEC_ID_BMP },
++ { VLC_CODEC_CSCD, AV_CODEC_ID_CSCD },
++ { VLC_CODEC_MMVIDEO, AV_CODEC_ID_MMVIDEO },
++ { VLC_CODEC_ZMBV, AV_CODEC_ID_ZMBV },
++ { VLC_CODEC_AVS, AV_CODEC_ID_AVS },
++ { VLC_CODEC_SMACKVIDEO, AV_CODEC_ID_SMACKVIDEO },
++ { VLC_CODEC_NUV, AV_CODEC_ID_NUV },
++ { VLC_CODEC_KMVC, AV_CODEC_ID_KMVC },
++ { VLC_CODEC_FLASHSV, AV_CODEC_ID_FLASHSV },
++ { VLC_CODEC_CAVS, AV_CODEC_ID_CAVS },
++ { VLC_CODEC_JPEG2000, AV_CODEC_ID_JPEG2000 },
++ { VLC_CODEC_VMNC, AV_CODEC_ID_VMNC },
++ { VLC_CODEC_VP5, AV_CODEC_ID_VP5 },
++ { VLC_CODEC_VP6, AV_CODEC_ID_VP6 },
++ { VLC_CODEC_VP6F, AV_CODEC_ID_VP6F },
++ { VLC_CODEC_TARGA, AV_CODEC_ID_TARGA },
++ { VLC_CODEC_DSICINVIDEO, AV_CODEC_ID_DSICINVIDEO },
++ { VLC_CODEC_TIERTEXSEQVIDEO, AV_CODEC_ID_TIERTEXSEQVIDEO },
++ { VLC_CODEC_TIFF, AV_CODEC_ID_TIFF },
++ { VLC_CODEC_GIF, AV_CODEC_ID_GIF },
++ { VLC_CODEC_DXA, AV_CODEC_ID_DXA },
++ { VLC_CODEC_DNXHD, AV_CODEC_ID_DNXHD },
++ { VLC_CODEC_THP, AV_CODEC_ID_THP },
++ { VLC_CODEC_SGI, AV_CODEC_ID_SGI },
++ { VLC_CODEC_C93, AV_CODEC_ID_C93 },
++ { VLC_CODEC_BETHSOFTVID, AV_CODEC_ID_BETHSOFTVID },
++ /* AV_CODEC_ID_PTX */
++ { VLC_CODEC_TXD, AV_CODEC_ID_TXD },
++ { VLC_CODEC_VP6A, AV_CODEC_ID_VP6A },
++ { VLC_CODEC_AMV, AV_CODEC_ID_AMV },
++ { VLC_CODEC_VB, AV_CODEC_ID_VB },
++ { VLC_CODEC_PCX, AV_CODEC_ID_PCX },
++ /* AV_CODEC_ID_SUNRAST */
++ { VLC_CODEC_INDEO4, AV_CODEC_ID_INDEO4 },
++ { VLC_CODEC_INDEO5, AV_CODEC_ID_INDEO5 },
++ { VLC_CODEC_MIMIC, AV_CODEC_ID_MIMIC },
++ { VLC_CODEC_RL2, AV_CODEC_ID_RL2 },
++ { VLC_CODEC_ESCAPE124, AV_CODEC_ID_ESCAPE124 },
++ { VLC_CODEC_DIRAC, AV_CODEC_ID_DIRAC },
++ { VLC_CODEC_BFI, AV_CODEC_ID_BFI },
++ { VLC_CODEC_CMV, AV_CODEC_ID_CMV },
++ { VLC_CODEC_MOTIONPIXELS, AV_CODEC_ID_MOTIONPIXELS },
++ { VLC_CODEC_TGV, AV_CODEC_ID_TGV },
++ { VLC_CODEC_TGQ, AV_CODEC_ID_TGQ },
++ { VLC_CODEC_TQI, AV_CODEC_ID_TQI },
++ { VLC_CODEC_AURA, AV_CODEC_ID_AURA },
++ /* AV_CODEC_ID_AURA2 */
++ /* AV_CODEC_ID_V210X */
++ { VLC_CODEC_TMV, AV_CODEC_ID_TMV },
++ { VLC_CODEC_V210, AV_CODEC_ID_V210 },
++#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT( 54, 50, 100 ) && LIBAVCODEC_VERSION_MICRO >= 100
++ { VLC_CODEC_VUYA, AV_CODEC_ID_AYUV },
++#endif
++ /* AV_CODEC_ID_DPX */
++ { VLC_CODEC_MAD, AV_CODEC_ID_MAD },
++ { VLC_CODEC_FRWU, AV_CODEC_ID_FRWU },
++ { VLC_CODEC_FLASHSV2, AV_CODEC_ID_FLASHSV2 },
++ /* AV_CODEC_ID_CDGRAPHICS */
++ /* AV_CODEC_ID_R210 */
++ { VLC_CODEC_ANM, AV_CODEC_ID_ANM },
++ { VLC_CODEC_BINKVIDEO, AV_CODEC_ID_BINKVIDEO },
++ /* AV_CODEC_ID_IFF_ILBM */
++ /* AV_CODEC_ID_IFF_BYTERUN1 */
++ { VLC_CODEC_KGV1, AV_CODEC_ID_KGV1 },
++ { VLC_CODEC_YOP, AV_CODEC_ID_YOP },
++ { VLC_CODEC_VP8, AV_CODEC_ID_VP8 },
++ /* AV_CODEC_ID_PICTOR */
++ /* AV_CODEC_ID_ANSI */
++ /* AV_CODEC_ID_A64_MULTI */
++ /* AV_CODEC_ID_A64_MULTI5 */
++ /* AV_CODEC_ID_R10K */
++ { VLC_CODEC_MXPEG, AV_CODEC_ID_MXPEG },
++ { VLC_CODEC_LAGARITH, AV_CODEC_ID_LAGARITH },
++ { VLC_CODEC_PRORES, AV_CODEC_ID_PRORES },
++ { VLC_CODEC_JV, AV_CODEC_ID_JV },
++ { VLC_CODEC_DFA, AV_CODEC_ID_DFA },
++ { VLC_CODEC_WMVP, AV_CODEC_ID_WMV3IMAGE },
++ { VLC_CODEC_WMVP2, AV_CODEC_ID_VC1IMAGE },
++ { VLC_CODEC_UTVIDEO, AV_CODEC_ID_UTVIDEO },
++ { VLC_CODEC_BMVVIDEO, AV_CODEC_ID_BMV_VIDEO },
++ { VLC_CODEC_VBLE, AV_CODEC_ID_VBLE },
++ { VLC_CODEC_DXTORY, AV_CODEC_ID_DXTORY },
++ /* AV_CODEC_ID_V410 */
++ /* AV_CODEC_ID_XWD */
++ { VLC_CODEC_CDXL, AV_CODEC_ID_CDXL },
++ /* AV_CODEC_ID_XBM */
++ /* AV_CODEC_ID_ZEROCODEC */
++ { VLC_CODEC_MSS1, AV_CODEC_ID_MSS1 },
++ { VLC_CODEC_MSA1, AV_CODEC_ID_MSA1 },
++ { VLC_CODEC_TSC2, AV_CODEC_ID_TSCC2 },
++ { VLC_CODEC_MTS2, AV_CODEC_ID_MTS2 },
++ { VLC_CODEC_CLLC, AV_CODEC_ID_CLLC },
++ { VLC_CODEC_MSS2, AV_CODEC_ID_MSS2 },
++ { VLC_CODEC_VP9, AV_CODEC_ID_VP9 },
++#if LIBAVCODEC_VERSION_CHECK( 57, 26, 0, 83, 101 )
++ { VLC_CODEC_AV1, AV_CODEC_ID_AV1 },
++#endif
++ { VLC_CODEC_ICOD, AV_CODEC_ID_AIC },
++ /* AV_CODEC_ID_ESCAPE130 */
++ { VLC_CODEC_G2M4, AV_CODEC_ID_G2M },
++ { VLC_CODEC_G2M2, AV_CODEC_ID_G2M },
++ { VLC_CODEC_G2M3, AV_CODEC_ID_G2M },
++ /* AV_CODEC_ID_WEBP */
++ { VLC_CODEC_HNM4_VIDEO, AV_CODEC_ID_HNM4_VIDEO },
++ { VLC_CODEC_HEVC, AV_CODEC_ID_HEVC },
++
++ { VLC_CODEC_FIC , AV_CODEC_ID_FIC },
++ /* AV_CODEC_ID_ALIAS_PIX */
++ /* AV_CODEC_ID_BRENDER_PIX */
++ /* AV_CODEC_ID_PAF_VIDEO */
++ /* AV_CODEC_ID_EXR */
++
++ { VLC_CODEC_VP7 , AV_CODEC_ID_VP7 },
++ /* AV_CODEC_ID_SANM */
++ /* AV_CODEC_ID_SGIRLE */
++ /* AV_CODEC_ID_MVC1 */
++ /* AV_CODEC_ID_MVC2 */
++ { VLC_CODEC_HQX, AV_CODEC_ID_HQX },
++
++ { VLC_CODEC_TDSC, AV_CODEC_ID_TDSC },
++
++ { VLC_CODEC_HQ_HQA, AV_CODEC_ID_HQ_HQA },
++
++ { VLC_CODEC_HAP, AV_CODEC_ID_HAP },
++ /* AV_CODEC_ID_DDS */
++
++ { VLC_CODEC_DXV, AV_CODEC_ID_DXV },
++
++ /* ffmpeg only: AV_CODEC_ID_BRENDER_PIX */
++ /* ffmpeg only: AV_CODEC_ID_Y41P */
++ /* ffmpeg only: AV_CODEC_ID_EXR */
++ /* ffmpeg only: AV_CODEC_ID_AVRP */
++ /* ffmpeg only: AV_CODEC_ID_012V */
++ /* ffmpeg only: AV_CODEC_ID_AVUI */
++ /* ffmpeg only: AV_CODEC_ID_TARGA_Y216 */
++ /* ffmpeg only: AV_CODEC_ID_V308 */
++ /* ffmpeg only: AV_CODEC_ID_V408 */
++ /* ffmpeg only: AV_CODEC_ID_YUV4 */
++ /* ffmpeg only: AV_CODEC_ID_SANM */
++ /* ffmpeg only: AV_CODEC_ID_PAF_VIDEO */
++ /* ffmpeg only: AV_CODEC_ID_AVRN */
++ /* ffmpeg only: AV_CODEC_ID_CPIA */
++ /* ffmpeg only: AV_CODEC_ID_XFACE */
++ /* ffmpeg only: AV_CODEC_ID_SGIRLE */
++ /* ffmpeg only: AV_CODEC_ID_MVC1 */
++ /* ffmpeg only: AV_CODEC_ID_MVC2 */
++ /* ffmpeg only: AV_CODEC_ID_SNOW */
++ /* ffmpeg only: AV_CODEC_ID_SMVJPEG */
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 24, 102 )
++ { VLC_CODEC_CINEFORM, AV_CODEC_ID_CFHD },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 70, 100 )
++ { VLC_CODEC_PIXLET, AV_CODEC_ID_PIXLET },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 71, 101 )
++ { VLC_CODEC_SPEEDHQ, AV_CODEC_ID_SPEEDHQ },
++#endif
++
++#if LIBAVCODEC_VERSION_CHECK( 57, 999, 999, 79, 100 )
++ { VLC_CODEC_FMVC, AV_CODEC_ID_FMVC },
++#endif
++};
++
++// Look up the libavcodec codec id matching a VLC fourcc in video_codecs[].
++// The fourcc is first passed through vlc_fourcc_GetCodec(); on a match the
++// codec id and the fourcc description are stored through whichever of the
++// two out-pointers is non-NULL and true is returned, otherwise false.
++// *** Really we should probably use GetFfmpegCodec with a pre-kludge for the bits we care about
++static bool
++ZcGetFfmpegCodec( enum es_format_category_e cat, vlc_fourcc_t i_fourcc,
++                  unsigned *pi_ffmpeg_codec, const char **ppsz_name )
++{
++    const vlc_fourcc_t wanted = vlc_fourcc_GetCodec( cat, i_fourcc );
++    const struct vlc_avcodec_fourcc *entry = video_codecs;
++    const struct vlc_avcodec_fourcc *const end =
++        video_codecs + ARRAY_SIZE(video_codecs);
++
++    for( ; entry != end; entry++ )
++    {
++        if( entry->i_fourcc != wanted )
++            continue;
++
++        /* Both outputs are optional */
++        if( pi_ffmpeg_codec != NULL )
++            *pi_ffmpeg_codec = entry->i_codec;
++        if( ppsz_name != NULL )
++            *ppsz_name = vlc_fourcc_GetDescription( cat, wanted );
++        return true;
++    }
++    return false;
++}
++
++
++
++//============================================================================
++// Derived from codec/avcodec/avcodec.c
++
++/* Select a libavcodec decoder for the decoder's input format and allocate a
++ * codec context for it.
++ *
++ * Selection order:
++ *  1. the decoder named by the "avcodec-codec" variable, if it exists and its
++ *     id matches the codec id found for the input fourcc;
++ *  2. for VLC_CODEC_HEVC, a decoder looked up by name ("hevc" when
++ *     rpi_is_model_pi4() is true, "hevc_rpi" otherwise);
++ *  3. for everything else, avcodec_find_decoder() on the codec id.
++ *
++ * On success the chosen codec is stored in *codecp and the new context is
++ * returned with its debug level and opaque pointer (p_dec) set. NULL is
++ * returned if the fourcc is unknown, no decoder is found or the allocation
++ * fails. */
++static AVCodecContext *
++ZcFfmpeg_AllocContext( decoder_t *p_dec,
++                       const AVCodec **restrict codecp )
++{
++    unsigned i_codec_id;
++    const char *psz_namecodec;
++    const AVCodec *p_codec = NULL;
++
++    /* Which libavcodec id does the input fourcc correspond to? */
++    if( !ZcGetFfmpegCodec( p_dec->fmt_in.i_cat, p_dec->fmt_in.i_codec,
++                           &i_codec_id, &psz_namecodec ) )
++        return NULL;
++
++    msg_Dbg( p_dec, "using %s %s", AVPROVIDER(LIBAVCODEC), LIBAVCODEC_IDENT );
++
++    /* libavcodec has to be initialised before any decoder lookup */
++    vlc_init_avcodec(VLC_OBJECT(p_dec));
++
++    /* 1. Decoder explicitly requested by name */
++    char *psz_forced = var_InheritString( p_dec, "avcodec-codec" );
++    if( psz_forced != NULL )
++    {
++        const AVCodec *p_forced = avcodec_find_decoder_by_name( psz_forced );
++
++        if( p_forced == NULL )
++            msg_Err( p_dec, "Decoder `%s' not found", psz_forced );
++        else if( p_forced->id == i_codec_id )
++            p_codec = p_forced;
++        else
++            msg_Err( p_dec, "Decoder `%s' can't handle %4.4s",
++                     psz_forced, (char*)&p_dec->fmt_in.i_codec );
++
++        free( psz_forced );
++    }
++
++    /* 2./3. Default choice */
++    if( p_codec == NULL )
++    {
++        if( p_dec->fmt_in.i_codec == VLC_CODEC_HEVC )
++        {
++            /* HEVC is looked up by name so the decoder variant can be chosen */
++            psz_namecodec = rpi_is_model_pi4() ? "hevc" : "hevc_rpi";
++            msg_Info(p_dec, "Looking for HEVC decoder '%s'", psz_namecodec);
++            p_codec = avcodec_find_decoder_by_name(psz_namecodec);
++        }
++        else
++        {
++            p_codec = avcodec_find_decoder(i_codec_id);
++        }
++    }
++
++    if( p_codec == NULL )
++    {
++        msg_Dbg( p_dec, "codec not found (%s)", psz_namecodec );
++        return NULL;
++    }
++
++    *codecp = p_codec;
++
++    /* Allocate the context for the selected decoder */
++    AVCodecContext *avctx = avcodec_alloc_context3(p_codec);
++    if( unlikely(avctx == NULL) )
++        return NULL;
++
++    avctx->debug = var_InheritInteger( p_dec, "avcodec-debug" );
++    avctx->opaque = p_dec;
++    return avctx;
++}
++
++/*****************************************************************************
++ * ZcFfmpeg_OpenCodec: open the libavcodec decoder for zero-copy output
++ *****************************************************************************
++ * Parses the "avcodec-options" string into an AVDictionary, initialises the
++ * RPi zero-copy layer on the context (av_rpi_zc_init2() with the
++ * zc_alloc_buf / zc_free_pool callbacks) and opens the codec while holding
++ * the avcodec lock. Any option still present in the dictionary after
++ * avcodec_open2() is reported as unknown.
++ *
++ * Returns VLC_SUCCESS, or VLC_EGENERIC if the zero-copy init or the codec
++ * open fails. The options dictionary is released on every path.
++ *****************************************************************************/
++
++static int
++ZcFfmpeg_OpenCodec( decoder_t *p_dec, AVCodecContext *ctx,
++                    const AVCodec *codec )
++{
++    char *psz_opts = var_InheritString( p_dec, "avcodec-options" );
++    AVDictionary *options = NULL;
++    int ret;
++
++    if (psz_opts) {
++        vlc_av_get_options(psz_opts, &options);
++        free(psz_opts);
++    }
++
++    if (av_rpi_zc_init2(ctx, p_dec, zc_alloc_buf, zc_free_pool) != 0)
++    {
++        msg_Err(p_dec, "Failed to init AV ZC");
++        /* The dictionary was already built above: free it here, otherwise
++         * this early return leaks it (av_dict_free() accepts an empty dict) */
++        av_dict_free(&options);
++        return VLC_EGENERIC;
++    }
++
++    vlc_avcodec_lock();
++    ret = avcodec_open2( ctx, codec, options ? &options : NULL );
++    vlc_avcodec_unlock();
++
++    /* Whatever is left in the dictionary was not recognised by the codec */
++    AVDictionaryEntry *t = NULL;
++    while ((t = av_dict_get(options, "", t, AV_DICT_IGNORE_SUFFIX))) {
++        msg_Err( p_dec, "Unknown option \"%s\"", t->key );
++    }
++    av_dict_free(&options);
++
++    if( ret < 0 )
++    {
++        msg_Err( p_dec, "cannot start codec (%s)", codec->name );
++        return VLC_EGENERIC;
++    }
++
++    msg_Dbg( p_dec, "codec (%s) started", codec->name );
++    return VLC_SUCCESS;
++}
++
++//============================================================================
++// Derived from 3.0.7.1 codec/avcodec/video.c
++
++/* Counterpart of post_mt(): would wait on sys->sem_mt.
++ * The semaphore path is compiled out because the output is only ever updated
++ * from our main thread, which makes the lock redundant; the argument is
++ * therefore just marked as unused. */
++static inline void wait_mt(decoder_sys_t *sys)
++{
++#if 0
++    vlc_sem_wait(&sys->sem_mt);
++#else
++    VLC_UNUSED(sys);
++#endif
++}
++
++/* Counterpart of wait_mt(): would post sys->sem_mt.
++ * The semaphore path is compiled out because the output is only ever updated
++ * from our main thread, which makes the lock redundant; the argument is
++ * therefore just marked as unused. */
++static inline void post_mt(decoder_sys_t *sys)
++{
++#if 0
++    vlc_sem_post(&sys->sem_mt);
++#else
++    VLC_UNUSED(sys);
++#endif
++}
++
++/*****************************************************************************
++ * Local prototypes
++ *****************************************************************************/
++static void ffmpeg_InitCodec ( decoder_t * );
++static int DecodeVideo( decoder_t *, block_t * );
++static void Flush( decoder_t * );
++
++/* Build the libavcodec codec_tag from a VLC fourcc.
++ * The four bytes of the fourcc are read in memory order and packed with the
++ * first byte in the least significant position, independently of the host
++ * byte order. */
++static uint32_t ffmpeg_CodecTag( vlc_fourcc_t fcc )
++{
++    const uint8_t *p = (const uint8_t *)&fcc;
++
++    /* Widen each byte to uint32_t before shifting: a plain uint8_t is promoted
++     * to (signed) int, and shifting a byte >= 0x80 left by 24 would then shift
++     * into the sign bit, which is undefined behaviour. */
++    return  (uint32_t)p[0]
++         | ((uint32_t)p[1] << 8)
++         | ((uint32_t)p[2] << 16)
++         | ((uint32_t)p[3] << 24);
++}
++
++/*****************************************************************************
++ * Local Functions
++ *****************************************************************************/
++
++/**
++ * Builds the decoder output video format from the codec context.
++ *
++ * The chroma is obtained from ZcFindVlcChroma(pix_fmt); sw_pix_fmt is not
++ * used in this build. The buffer size is the coded size and the visible size
++ * is ctx->width x ctx->height. Aspect ratio and frame rate given by the input
++ * format take precedence over what libavcodec reports. Colour range, matrix,
++ * transfer function, primaries and chroma siting are translated from the
++ * libavcodec enums; values without a mapping leave the field as initialised
++ * by video_format_Init().
++ *
++ * \return 0 on success, -1 if the pixel format has no chroma mapping or the
++ *         frame size is rejected.
++ */
++static int lavc_GetVideoFormat(decoder_t *dec, video_format_t *restrict fmt,
++                               AVCodecContext *ctx, enum AVPixelFormat pix_fmt,
++                               enum AVPixelFormat sw_pix_fmt)
++{
++    int coded_w = ctx->coded_width;
++    int coded_h = ctx->coded_height;
++
++    video_format_Init(fmt, 0);
++
++#if 1
++    VLC_UNUSED(sw_pix_fmt);
++    fmt->i_chroma = ZcFindVlcChroma(pix_fmt);
++    if (fmt->i_chroma == 0)
++        return -1;
++#else
++    if (pix_fmt == sw_pix_fmt)
++    {   /* software decoding */
++        int aligns[AV_NUM_DATA_POINTERS];
++
++        if (GetVlcChroma(fmt, pix_fmt))
++            return -1;
++
++        /* The libavcodec palette can only be fetched when the first output
++         * frame is decoded. Assume that the current chroma is RGB32 while we
++         * are waiting for a valid palette. Indeed, fmt_out.video.p_palette
++         * doesn't trigger a new vout request, but a new chroma yes. */
++        if (pix_fmt == AV_PIX_FMT_PAL8 && !dec->fmt_out.video.p_palette)
++            fmt->i_chroma = VLC_CODEC_RGB32;
++
++        avcodec_align_dimensions2(ctx, &coded_w, &coded_h, aligns);
++    }
++    else /* hardware decoding */
++        fmt->i_chroma = vlc_va_GetChroma(pix_fmt, sw_pix_fmt);
++#endif
++
++    /* Reject empty or oversized frames, and coded sizes that are smaller
++     * than the visible size */
++    const bool b_empty     = (coded_w == 0 || coded_h == 0);
++    const bool b_too_big   = (coded_w > 8192 || coded_h > 8192);
++    const bool b_too_small = (coded_w < ctx->width || coded_h < ctx->height);
++    if (b_empty || b_too_big || b_too_small)
++    {
++        msg_Err(dec, "Invalid frame size %dx%d vsz %dx%d",
++                coded_w, coded_h, ctx->width, ctx->height );
++        return -1; /* invalid display size */
++    }
++
++    fmt->i_width          = coded_w;
++    fmt->i_height         = coded_h;
++    fmt->i_visible_width  = ctx->width;
++    fmt->i_visible_height = ctx->height;
++
++    /* Sample aspect ratio: a value given by the input format is forced,
++     * otherwise take the codec's and fall back to 1:1 if it is unset */
++    const bool b_sar_forced = dec->fmt_in.video.i_sar_num > 0
++                           && dec->fmt_in.video.i_sar_den > 0;
++    if (!b_sar_forced)
++    {
++        fmt->i_sar_num = ctx->sample_aspect_ratio.num;
++        fmt->i_sar_den = ctx->sample_aspect_ratio.den;
++
++        if (fmt->i_sar_num == 0 || fmt->i_sar_den == 0)
++            fmt->i_sar_num = fmt->i_sar_den = 1;
++    }
++    else
++    {
++        fmt->i_sar_num = dec->fmt_in.video.i_sar_num;
++        fmt->i_sar_den = dec->fmt_in.video.i_sar_den;
++    }
++
++    /* Frame rate: input format first, then the codec frame rate, then the
++     * codec time base */
++    if (dec->fmt_in.video.i_frame_rate > 0
++     && dec->fmt_in.video.i_frame_rate_base > 0)
++    {
++        fmt->i_frame_rate      = dec->fmt_in.video.i_frame_rate;
++        fmt->i_frame_rate_base = dec->fmt_in.video.i_frame_rate_base;
++    }
++    else if (ctx->framerate.num > 0 && ctx->framerate.den > 0)
++    {
++        fmt->i_frame_rate      = ctx->framerate.num;
++        fmt->i_frame_rate_base = ctx->framerate.den;
++# if LIBAVCODEC_VERSION_MICRO < 100
++        // for some reason libav doesn't think framerate represents the same thing as it does in ffmpeg
++        fmt->i_frame_rate_base *= __MAX(ctx->ticks_per_frame, 1);
++# endif
++    }
++    else if (ctx->time_base.num > 0 && ctx->time_base.den > 0)
++    {
++        fmt->i_frame_rate      = ctx->time_base.den;
++        fmt->i_frame_rate_base = ctx->time_base.num
++                               * __MAX(ctx->ticks_per_frame, 1);
++    }
++
++    /* FIXME we should only set the known values and let the core decide
++     * later of fallbacks, but we can't do that with a boolean */
++    if (ctx->color_range == AVCOL_RANGE_JPEG)
++        fmt->b_color_range_full = true;
++    else if (ctx->color_range == AVCOL_RANGE_UNSPECIFIED)
++        fmt->b_color_range_full = !vlc_fourcc_IsYUV( fmt->i_chroma );
++    else /* AVCOL_RANGE_MPEG and anything else */
++        fmt->b_color_range_full = false;
++
++    /* Colour matrix */
++    switch( ctx->colorspace )
++    {
++        case AVCOL_SPC_BT709:
++            fmt->space = COLOR_SPACE_BT709;
++            break;
++        case AVCOL_SPC_SMPTE170M:
++        case AVCOL_SPC_BT470BG:
++            fmt->space = COLOR_SPACE_BT601;
++            break;
++        case AVCOL_SPC_BT2020_NCL:
++        case AVCOL_SPC_BT2020_CL:
++            fmt->space = COLOR_SPACE_BT2020;
++            break;
++        default:
++            break;
++    }
++
++    /* Transfer function */
++    switch( ctx->color_trc )
++    {
++        case AVCOL_TRC_LINEAR:
++            fmt->transfer = TRANSFER_FUNC_LINEAR;
++            break;
++        case AVCOL_TRC_GAMMA22:
++            fmt->transfer = TRANSFER_FUNC_SRGB;
++            break;
++        case AVCOL_TRC_BT709:
++            fmt->transfer = TRANSFER_FUNC_BT709;
++            break;
++        case AVCOL_TRC_SMPTE170M:
++        case AVCOL_TRC_BT2020_10:
++        case AVCOL_TRC_BT2020_12:
++            fmt->transfer = TRANSFER_FUNC_BT2020;
++            break;
++#if LIBAVUTIL_VERSION_CHECK( 55, 14, 0, 31, 100)
++        case AVCOL_TRC_ARIB_STD_B67:
++            fmt->transfer = TRANSFER_FUNC_ARIB_B67;
++            break;
++#endif
++#if LIBAVUTIL_VERSION_CHECK( 55, 17, 0, 37, 100)
++        case AVCOL_TRC_SMPTE2084:
++            fmt->transfer = TRANSFER_FUNC_SMPTE_ST2084;
++            break;
++        case AVCOL_TRC_SMPTE240M:
++            fmt->transfer = TRANSFER_FUNC_SMPTE_240;
++            break;
++        case AVCOL_TRC_GAMMA28:
++            fmt->transfer = TRANSFER_FUNC_BT470_BG;
++            break;
++#endif
++        default:
++            break;
++    }
++
++    /* Colour primaries */
++    switch( ctx->color_primaries )
++    {
++        case AVCOL_PRI_BT709:
++            fmt->primaries = COLOR_PRIMARIES_BT709;
++            break;
++        case AVCOL_PRI_BT470BG:
++            fmt->primaries = COLOR_PRIMARIES_BT601_625;
++            break;
++        case AVCOL_PRI_SMPTE170M:
++        case AVCOL_PRI_SMPTE240M:
++            fmt->primaries = COLOR_PRIMARIES_BT601_525;
++            break;
++        case AVCOL_PRI_BT2020:
++            fmt->primaries = COLOR_PRIMARIES_BT2020;
++            break;
++        default:
++            break;
++    }
++
++    /* Chroma sample siting */
++    switch( ctx->chroma_sample_location )
++    {
++        case AVCHROMA_LOC_LEFT:
++            fmt->chroma_location = CHROMA_LOCATION_LEFT;
++            break;
++        case AVCHROMA_LOC_CENTER:
++            fmt->chroma_location = CHROMA_LOCATION_CENTER;
++            break;
++        case AVCHROMA_LOC_TOPLEFT:
++            fmt->chroma_location = CHROMA_LOCATION_TOP_LEFT;
++            break;
++        default:
++            break;
++    }
++
++    return 0;
++}
++
++/* Recompute the output video format from the codec context and hand it to
++ * the core with decoder_UpdateVideoFormat().
++ *
++ * The PTS date generator is (re)configured from the new frame rate, the
++ * palette pointer currently attached to the output format is carried over
++ * into the new format, and orientation, projection, multiview, pose,
++ * mastering (only if the input has a max luminance) and lighting data are
++ * copied from the input format.
++ *
++ * Returns the non-zero result of lavc_GetVideoFormat() on failure, otherwise
++ * the result of decoder_UpdateVideoFormat(). */
++static int lavc_UpdateVideoFormat(decoder_t *dec, AVCodecContext *ctx,
++                                  enum AVPixelFormat fmt,
++                                  enum AVPixelFormat swfmt)
++{
++    video_format_t new_fmt;
++    int rv;
++#if TRACE_ALL
++    msg_Dbg(dec, "<<< %s", __func__);
++#endif
++    rv = lavc_GetVideoFormat(dec, &new_fmt, ctx, fmt, swfmt);
++    if (rv != 0)
++    {
++        msg_Dbg(dec, "Failed to get format");
++        return rv;
++    }
++
++    /* always have date in fields/ticks units: initialise the date generator
++     * the first time round, just change its rate afterwards */
++    if (dec->p_sys->pts.i_divider_num == 0)
++        date_Init(&dec->p_sys->pts,
++                  new_fmt.i_frame_rate * __MAX(ctx->ticks_per_frame, 1),
++                  new_fmt.i_frame_rate_base);
++    else
++        date_Change(&dec->p_sys->pts,
++                    new_fmt.i_frame_rate * __MAX(ctx->ticks_per_frame, 1),
++                    new_fmt.i_frame_rate_base);
++
++    /* Move the palette over to the new format before the old one is changed */
++    new_fmt.p_palette = dec->fmt_out.video.p_palette;
++    dec->fmt_out.video.p_palette = NULL;
++
++    es_format_Change(&dec->fmt_out, VIDEO_ES, new_fmt.i_chroma);
++    dec->fmt_out.video = new_fmt;
++
++    /* Properties that are taken over from the input format */
++    dec->fmt_out.video.orientation     = dec->fmt_in.video.orientation;
++    dec->fmt_out.video.projection_mode = dec->fmt_in.video.projection_mode;
++    dec->fmt_out.video.multiview_mode  = dec->fmt_in.video.multiview_mode;
++    dec->fmt_out.video.pose            = dec->fmt_in.video.pose;
++    if ( dec->fmt_in.video.mastering.max_luminance )
++        dec->fmt_out.video.mastering   = dec->fmt_in.video.mastering;
++    dec->fmt_out.video.lighting        = dec->fmt_in.video.lighting;
++
++    rv = decoder_UpdateVideoFormat(dec);
++#if TRACE_ALL
++    msg_Dbg(dec, ">>> %s: rv=%d", __func__, rv);
++#endif
++    return rv;
++}
++
++/* Configure the codec context from the input format and open the codec.
++ *
++ * Returns 1 without opening anything when a VC1 or Theora stream has no
++ * extradata yet, a negative value if ZcFfmpeg_OpenCodec() fails, and 0 once
++ * the codec is open. The per-stream decoder state (pix_fmt, profile, level,
++ * closed captions) is reset before the codec is opened. */
++static int OpenVideoCodec( decoder_t *p_dec )
++{
++    decoder_sys_t *p_sys = p_dec->p_sys;
++    AVCodecContext *ctx = p_sys->p_context;
++    const AVCodec *codec = p_sys->p_codec;
++    const video_format_t *const in = &p_dec->fmt_in.video;
++
++    /* These codecs are not opened until their extradata has arrived */
++    if( ctx->extradata_size <= 0 &&
++        ( codec->id == AV_CODEC_ID_VC1 || codec->id == AV_CODEC_ID_THEORA ) )
++    {
++        msg_Warn( p_dec, "waiting for extra data for codec %s",
++                  codec->name );
++        return 1;
++    }
++
++    /* Geometry: visible size, then coded size */
++    ctx->width        = in->i_visible_width;
++    ctx->height       = in->i_visible_height;
++    ctx->coded_width  = in->i_width;
++    ctx->coded_height = in->i_height;
++
++    ctx->bits_per_coded_sample = in->i_bits_per_pixel;
++
++    /* Reset what we track about the stream */
++    p_sys->pix_fmt = AV_PIX_FMT_NONE;
++    p_sys->profile = -1;
++    p_sys->level   = -1;
++    cc_Init( &p_sys->cc );
++
++    set_video_color_settings( &p_dec->fmt_in.video, ctx );
++
++    /* Streams with a known frame rate below 6 fps are decoded in low-delay
++     * mode */
++    if( in->i_frame_rate_base && in->i_frame_rate &&
++        (double) in->i_frame_rate / in->i_frame_rate_base < 6 )
++    {
++        ctx->flags |= AV_CODEC_FLAG_LOW_DELAY;
++    }
++
++    post_mt( p_sys );
++    const int ret = ZcFfmpeg_OpenCodec( p_dec, ctx, codec );
++    wait_mt( p_sys );
++    if( ret < 0 )
++        return ret;
++
++    /* Report which threading mode libavcodec ended up with */
++    const int thread_mode = ctx->active_thread_type;
++    if( thread_mode == FF_THREAD_FRAME )
++    {
++        msg_Dbg( p_dec, "using frame thread mode with %d threads",
++                 ctx->thread_count );
++    }
++    else if( thread_mode == FF_THREAD_SLICE )
++    {
++        msg_Dbg( p_dec, "using slice thread mode with %d threads",
++                 ctx->thread_count );
++    }
++    else if( thread_mode == 0 )
++    {
++        if( ctx->thread_count > 1 )
++            msg_Warn( p_dec, "failed to enable threaded decoding" );
++    }
++    else
++    {
++        msg_Warn( p_dec, "using unknown thread mode with %d threads",
++                  ctx->thread_count );
++    }
++    return 0;
++}
++
++/*****************************************************************************
++ * MmalAvcodecOpenDecoder: probe and initialise the zero-copy video decoder
++ *****************************************************************************
++ * The ffmpeg codec will be opened and some memory allocated. The vout is not
++ * yet opened (done after the first decoded frame).
++ *
++ * The module refuses to load (VLC_EGENERIC) when the buffer count resolves to
++ * zero or less, when VCSM cannot be initialised or does not satisfy the
++ * thresholds checked below, when no decoder is found, when the CMA pool
++ * cannot be created or when the codec fails to open. VLC_ENOMEM is returned
++ * if the decoder state cannot be allocated.
++ *
++ * Once p_sys has been set up, every failure goes through the `fail' label so
++ * that MmalAvcodecCloseDecoder() does all of the cleanup.
++ *****************************************************************************/
++static int MmalAvcodecOpenDecoder( vlc_object_t *obj )
++{
++    decoder_t *p_dec = (decoder_t *)obj;
++    const AVCodec *p_codec;
++
++    int extra_buffers = var_InheritInteger(p_dec, MMAL_AVCODEC_BUFFERS);
++
++    /* A negative setting selects a default that depends on the input size */
++    if (extra_buffers < 0)
++    {
++        extra_buffers = p_dec->fmt_in.video.i_height * p_dec->fmt_in.video.i_width >= 1920 * 1088 ?
++            BUFFERS_IN_FLIGHT_UHD : BUFFERS_IN_FLIGHT;
++    }
++
++    if (extra_buffers <= 0)
++    {
++        msg_Dbg(p_dec, "%s: extra_buffers=%d - cannot use module", __func__, extra_buffers);
++        return VLC_EGENERIC;
++    }
++
++    /* From here on a VCSM reference may be held: it has to be dropped with
++     * cma_vcsm_exit() on every failure path */
++    const vcsm_init_type_t vcsm_type = cma_vcsm_init();
++    const int vcsm_size =
++        vcsm_type == VCSM_INIT_LEGACY ? hw_mmal_get_gpu_mem() : 512 << 20;
++
++#if 1
++    {
++        char buf1[5], buf2[5], buf2a[5];
++        char buf3[5], buf4[5];
++        uint32_t in_fcc = 0;
++        msg_Dbg(p_dec, "%s: <<< (%s/%s)[%s] %dx%d -> (%s/%s) %dx%d [%s/%d] xb:%d", __func__,
++                str_fourcc(buf1, p_dec->fmt_in.i_codec),
++                str_fourcc(buf2, p_dec->fmt_in.video.i_chroma),
++                str_fourcc(buf2a, in_fcc),
++                p_dec->fmt_in.video.i_width, p_dec->fmt_in.video.i_height,
++                str_fourcc(buf3, p_dec->fmt_out.i_codec),
++                str_fourcc(buf4, p_dec->fmt_out.video.i_chroma),
++                p_dec->fmt_out.video.i_width, p_dec->fmt_out.video.i_height,
++                cma_vcsm_init_str(vcsm_type), vcsm_size, extra_buffers);
++    }
++#endif
++
++    if( vcsm_type == VCSM_INIT_NONE )
++        return VLC_EGENERIC;
++#if 1
++    /* Non-HEVC input is refused when VCSM is CMA-backed or vcsm_size is below
++     * 96 << 20; HEVC input is refused when vcsm_size is below 128 << 20 */
++    if( (p_dec->fmt_in.i_codec != VLC_CODEC_HEVC &&
++         (vcsm_type == VCSM_INIT_CMA || vcsm_size < (96 << 20))) ||
++        (p_dec->fmt_in.i_codec == VLC_CODEC_HEVC &&
++         vcsm_size < (128 << 20)))
++    {
++        cma_vcsm_exit(vcsm_type);
++        return VLC_EGENERIC;
++    }
++#endif
++
++    AVCodecContext *p_context = ZcFfmpeg_AllocContext( p_dec, &p_codec );
++    if( p_context == NULL )
++    {
++        cma_vcsm_exit(vcsm_type);
++        return VLC_EGENERIC;
++    }
++
++    int i_val;
++
++    /* Allocate the memory needed to store the decoder's structure */
++    decoder_sys_t *p_sys = calloc( 1, sizeof(*p_sys) );
++    if( unlikely(p_sys == NULL) )
++    {
++        avcodec_free_context( &p_context );
++        cma_vcsm_exit(vcsm_type);
++        return VLC_ENOMEM;
++    }
++
++    p_dec->p_sys = p_sys;
++    p_sys->p_context = p_context;
++    p_sys->p_codec = p_codec;
++    p_sys->p_dec = p_dec;
++//    p_sys->p_va = NULL;
++    p_sys->cma_in_flight_max = extra_buffers;
++    p_sys->vcsm_init_type = vcsm_type;
++    vlc_sem_init( &p_sys->sem_mt, 0 );
++
++    /* ***** Fill p_context with init values ***** */
++    p_context->codec_tag = ffmpeg_CodecTag( p_dec->fmt_in.i_original_fourcc ?
++                                p_dec->fmt_in.i_original_fourcc : p_dec->fmt_in.i_codec );
++
++    /* ***** Get configuration of ffmpeg plugin ***** */
++    p_context->workaround_bugs =
++        var_InheritInteger( p_dec, "avcodec-workaround-bugs" );
++    p_context->err_recognition =
++        var_InheritInteger( p_dec, "avcodec-error-resilience" );
++
++    if( var_CreateGetBool( p_dec, "grayscale" ) )
++        p_context->flags |= AV_CODEC_FLAG_GRAY;
++
++    /* ***** Output always the frames ***** */
++    p_context->flags |= AV_CODEC_FLAG_OUTPUT_CORRUPT;
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skiploopfilter" );
++    if( i_val >= 4 ) p_context->skip_loop_filter = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_context->skip_loop_filter = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_context->skip_loop_filter = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_context->skip_loop_filter = AVDISCARD_NONREF;
++    else p_context->skip_loop_filter = AVDISCARD_DEFAULT;
++
++    if( var_CreateGetBool( p_dec, "avcodec-fast" ) )
++        p_context->flags2 |= AV_CODEC_FLAG2_FAST;
++
++    /* ***** libavcodec frame skipping ***** */
++    p_sys->b_hurry_up = var_CreateGetBool( p_dec, "avcodec-hurry-up" );
++    p_sys->b_show_corrupted = var_CreateGetBool( p_dec, "avcodec-corrupted" );
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skip-frame" );
++    if( i_val >= 4 ) p_sys->i_skip_frame = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_sys->i_skip_frame = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_sys->i_skip_frame = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_sys->i_skip_frame = AVDISCARD_NONREF;
++    else if( i_val == -1 ) p_sys->i_skip_frame = AVDISCARD_NONE;
++    else p_sys->i_skip_frame = AVDISCARD_DEFAULT;
++    p_context->skip_frame = p_sys->i_skip_frame;
++
++    i_val = var_CreateGetInteger( p_dec, "avcodec-skip-idct" );
++    if( i_val >= 4 ) p_context->skip_idct = AVDISCARD_ALL;
++    else if( i_val == 3 ) p_context->skip_idct = AVDISCARD_NONKEY;
++    else if( i_val == 2 ) p_context->skip_idct = AVDISCARD_BIDIR;
++    else if( i_val == 1 ) p_context->skip_idct = AVDISCARD_NONREF;
++    else if( i_val == -1 ) p_context->skip_idct = AVDISCARD_NONE;
++    else p_context->skip_idct = AVDISCARD_DEFAULT;
++
++    /* ***** libavcodec direct rendering ***** */
++    p_sys->b_direct_rendering = false;
++    atomic_init(&p_sys->b_dr_failure, false);
++    if( var_CreateGetBool( p_dec, "avcodec-dr" ) &&
++       (p_codec->capabilities & AV_CODEC_CAP_DR1) &&
++        /* No idea why ... but this fixes flickering on some TSCC streams */
++        p_sys->p_codec->id != AV_CODEC_ID_TSCC &&
++        p_sys->p_codec->id != AV_CODEC_ID_CSCD &&
++        p_sys->p_codec->id != AV_CODEC_ID_CINEPAK )
++    {
++        /* Some codecs set pix_fmt only after the 1st frame has been decoded,
++         * so we need to do another check in ffmpeg_GetFrameBuf() */
++        p_sys->b_direct_rendering = true;
++    }
++
++    p_context->get_format = ZcGetFormat;
++#if 0
++    p_context->get_format = ffmpeg_GetFormat;
++    /* Always use our get_buffer wrapper so we can calculate the
++     * PTS correctly */
++    p_context->get_buffer2 = lavc_GetFrame;
++    p_context->opaque = p_dec;
++#endif
++
++    int i_thread_count = var_InheritInteger( p_dec, "avcodec-threads" );
++    if( i_thread_count <= 0 )
++#if 1
++    {
++        // Pick 5 threads for everything on Pi except for HEVC where the h/w
++        // really limits the useful size to 3
++        i_thread_count = p_codec->id == AV_CODEC_ID_HEVC ? 3 : 5;
++    }
++#else
++    {
++        i_thread_count = vlc_GetCPUCount();
++        if( i_thread_count > 1 )
++            i_thread_count++;
++
++        //FIXME: take in count the decoding time
++#if VLC_WINSTORE_APP
++        i_thread_count = __MIN( i_thread_count, 6 );
++#else
++        i_thread_count = __MIN( i_thread_count, p_codec->id == AV_CODEC_ID_HEVC ? 10 : 6 );
++#endif
++    }
++    i_thread_count = __MIN( i_thread_count, p_codec->id == AV_CODEC_ID_HEVC ? 32 : 16 );
++#endif
++    msg_Dbg( p_dec, "allowing %d thread(s) for decoding", i_thread_count );
++    p_context->thread_count = i_thread_count;
++    p_context->thread_safe_callbacks = true;
++
++    switch( p_codec->id )
++    {
++        case AV_CODEC_ID_MPEG4:
++        case AV_CODEC_ID_H263:
++            p_context->thread_type = 0;
++            break;
++        case AV_CODEC_ID_MPEG1VIDEO:
++        case AV_CODEC_ID_MPEG2VIDEO:
++            p_context->thread_type &= ~FF_THREAD_SLICE;
++            /* fall through */
++# if (LIBAVCODEC_VERSION_INT < AV_VERSION_INT(55, 1, 0))
++        case AV_CODEC_ID_H264:
++        case AV_CODEC_ID_VC1:
++        case AV_CODEC_ID_WMV3:
++            p_context->thread_type &= ~FF_THREAD_FRAME;
++# endif
++        default:
++            break;
++    }
++
++    if( p_context->thread_type & FF_THREAD_FRAME )
++        p_dec->i_extra_picture_buffers = 2 * p_context->thread_count;
++
++    /* ***** misc init ***** */
++    date_Init(&p_sys->pts, 1, 30001);
++    date_Set(&p_sys->pts, VLC_TS_INVALID);
++    p_sys->b_first_frame = true;
++    p_sys->i_late_frames = 0;
++    p_sys->b_from_preroll = false;
++
++    /* Set output properties */
++    if( ZcGetVlcChroma( &p_dec->fmt_out.video, p_context->pix_fmt ) != VLC_SUCCESS )
++    {
++        /* we are doomed. but not really, because most codecs set their pix_fmt later on */
++//        p_dec->fmt_out.i_codec = VLC_CODEC_I420;
++        p_dec->fmt_out.i_codec = VLC_CODEC_MMAL_ZC_I420;
++    }
++    /* NOTE(review): this unconditionally overwrites the fallback stored just
++     * above; kept as-is because changing it would alter the advertised
++     * output codec - confirm which of the two is intended. */
++    p_dec->fmt_out.i_codec = p_dec->fmt_out.video.i_chroma;
++
++    p_dec->fmt_out.video.orientation = p_dec->fmt_in.video.orientation;
++
++    if( p_dec->fmt_in.video.p_palette ) {
++        p_sys->palette_sent = false;
++        p_dec->fmt_out.video.p_palette = malloc( sizeof(video_palette_t) );
++        if( p_dec->fmt_out.video.p_palette )
++            *p_dec->fmt_out.video.p_palette = *p_dec->fmt_in.video.p_palette;
++    } else
++        p_sys->palette_sent = true;
++
++    if ((p_sys->cma_pool = cma_buf_pool_new(p_sys->cma_in_flight_max, p_sys->cma_in_flight_max, false, "mmal_avcodec")) == NULL)
++    {
++        msg_Err(p_dec, "CMA pool alloc failure");
++        goto fail;
++    }
++
++    /* ***** init this codec with special data ***** */
++    ffmpeg_InitCodec( p_dec );
++
++    /* ***** Open the codec ***** */
++    if( OpenVideoCodec( p_dec ) < 0 )
++    {
++        /* Take the common failure path. At this point the CMA pool and the
++         * VCSM reference are held in addition to p_sys and the context;
++         * releasing only the latter two here leaked the former. */
++        goto fail;
++    }
++
++    p_dec->pf_decode = DecodeVideo;
++    p_dec->pf_flush = Flush;
++
++    /* XXX: Writing input format makes little sense. */
++    if( p_context->profile != FF_PROFILE_UNKNOWN )
++        p_dec->fmt_in.i_profile = p_context->profile;
++    if( p_context->level != FF_LEVEL_UNKNOWN )
++        p_dec->fmt_in.i_level = p_context->level;
++
++#if 1
++    // Most of the time we have nothing useful by way of a format here
++    // wait till we've decoded something
++#else
++    // Update output format
++    if (lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++                               p_context->pix_fmt) != 0)
++    {
++        msg_Err(p_dec, "Unable to update format: pix_fmt=%d", p_context->pix_fmt);
++//        goto fail;
++    }
++#endif
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s: OK", __func__);
++#endif
++    return VLC_SUCCESS;
++
++fail:
++    /* Single cleanup path for everything set up after p_sys was allocated */
++    MmalAvcodecCloseDecoder(VLC_OBJECT(p_dec));
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s: FAIL", __func__);
++#endif
++
++    return VLC_EGENERIC;
++}
++
++/*****************************************************************************
++ * Flush: drop the decoder's pending state
++ *****************************************************************************
++ * Invalidates the PTS generator, clears the late-frame counter and the
++ * closed-caption state, then flushes libavcodec (if the codec is open) while
++ * CMA buffer allocation is cancelled.
++ *****************************************************************************/
++static void Flush( decoder_t *p_dec )
++{
++    decoder_sys_t *const sys = p_dec->p_sys;
++    AVCodecContext *const avctx = sys->p_context;
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, "<<< %s", __func__);
++#endif
++
++    /* Invalidate the date so that we recover properly afterwards */
++    date_Set(&sys->pts, VLC_TS_INVALID);
++    sys->i_late_frames = 0;
++    cc_Flush( &sys->cc );
++
++    /* Unblock any avcodec worker thread waiting for a buffer so that it
++     * cannot deadlock against avcodec_flush_buffers().
++     * decoder_AbortPictures() would be the usual tool, but it often deadlocks
++     * on close, and as pictures are waited for in the main thread it should
++     * not be needed - whereas CMA buffers are allocated deep inside ffmpeg on
++     * its own threads, hence the pool cancel. */
++    cma_buf_pool_cancel(sys->cma_pool);
++
++    post_mt( sys );
++    /* A codec that was never opened (theora/vorbis/VC1) must not be flushed */
++    if( avcodec_is_open( avctx ) )
++    {
++        avcodec_flush_buffers( avctx );
++    }
++    wait_mt( sys );
++
++    /* Allow buffer allocation again */
++    cma_buf_pool_uncancel(sys->cma_pool);
++
++#if TRACE_ALL
++    msg_Dbg(p_dec, ">>> %s", __func__);
++#endif
++
++}
++
++/* Decide whether an input block may be decoded.
++ * A discontinuity or corrupted flag resets the PTS generator, the
++ * closed-caption state and the late-frame counter. A corrupted block is
++ * released here and false is returned; in every other case (including a
++ * NULL block) the result is true. */
++static bool check_block_validity( decoder_sys_t *p_sys, block_t *block )
++{
++    if( block == NULL )
++        return true;
++
++    const bool b_corrupted = (block->i_flags & BLOCK_FLAG_CORRUPTED) != 0;
++    const bool b_discontinuity =
++        (block->i_flags & BLOCK_FLAG_DISCONTINUITY) != 0;
++
++    if( !b_corrupted && !b_discontinuity )
++        return true;
++
++    /* Invalidate the date so that we recover properly afterwards */
++    date_Set( &p_sys->pts, VLC_TS_INVALID );
++    cc_Flush( &p_sys->cc );
++    p_sys->i_late_frames = 0;
++
++    if( !b_corrupted )
++        return true;
++
++    block_Release( block );
++    return false;
++}
++
++/* Drop an input block once frames have been late for more than
++ * 5 * CLOCK_FREQ.
++ * A preroll block first resets the late-frame tracking. Returns true when
++ * the block has been released (and one late frame accounted for), false when
++ * the caller should carry on with it. */
++static bool check_block_being_late( decoder_sys_t *p_sys, block_t *block, mtime_t current_time)
++{
++    if( block == NULL )
++        return false;
++
++    if( block->i_flags & BLOCK_FLAG_PREROLL )
++    {
++        /* Do not care about late frames when prerolling
++         * TODO avoid decoding of non reference frame
++         * (ie all B except for H264 where it depends only on nal_ref_idc) */
++        p_sys->i_late_frames = 0;
++        p_sys->b_from_preroll = true;
++        p_sys->i_last_late_delay = INT64_MAX;
++    }
++
++    /* Nothing is late, or it has not been late for long enough yet */
++    if( p_sys->i_late_frames <= 0 ||
++        current_time - p_sys->i_late_frames_start <= (5*CLOCK_FREQ) )
++        return false;
++
++    /* Invalidate the date so that we recover properly afterwards */
++    date_Set( &p_sys->pts, VLC_TS_INVALID );
++    block_Release( block );
++    p_sys->i_late_frames--;
++    return true;
++}
++
++/* Apply the late-frame policy before decoding.
++ *  - up to 4 late frames: nothing changes, returns false;
++ *  - 5 to 11 late frames: no output picture is needed and the codec's
++ *    skip_frame is raised to at least AVDISCARD_NONREF, returns false;
++ *  - 12 or more: no output picture is needed, the counter is decremented and
++ *    true is returned (the frame is not to be decoded). */
++static bool check_frame_should_be_dropped( decoder_sys_t *p_sys, AVCodecContext *p_context, bool *b_need_output_picture )
++{
++    if( p_sys->i_late_frames <= 4)
++        return false;
++
++    *b_need_output_picture = false;
++
++    if( p_sys->i_late_frames >= 12 )
++    {
++        /* picture too late, won't decode
++         * but break picture until a new I, and for mpeg4 ...*/
++        p_sys->i_late_frames--; /* otherwise it would never decrease */
++        return true;
++    }
++
++    /* Keep the configured skip level unless it is weaker than NONREF */
++    if( p_sys->i_skip_frame <= AVDISCARD_NONREF )
++        p_context->skip_frame = AVDISCARD_NONREF;
++    else
++        p_context->skip_frame = p_sys->i_skip_frame;
++
++    return false;
++}
++
++/* Advance the PTS generator past `frame' and return the resulting date.
++ * The increment is ticks_per_frame (at least 1) plus the frame's repeat_pict.
++ * Returns VLC_TS_INVALID, leaving the generator untouched, when the current
++ * date is invalid or the generator has a zero divider. */
++static mtime_t interpolate_next_pts( decoder_t *p_dec, AVFrame *frame )
++{
++    decoder_sys_t *const sys = p_dec->p_sys;
++
++    if( date_Get( &sys->pts ) == VLC_TS_INVALID ||
++        sys->pts.i_divider_num == 0 )
++        return VLC_TS_INVALID;
++
++    const int ticks_per_frame = sys->p_context->ticks_per_frame;
++    const int i_tick = ticks_per_frame > 0 ? ticks_per_frame : 1;
++
++    /* interpolate the next PTS */
++    return date_Increment( &sys->pts, i_tick + frame->repeat_pict );
++}
++
++/* Maintain the late-frame counter for the frame with timestamp i_pts.
++ *
++ * The frame counts as late when its display date (not looked up for preroll
++ * blocks) plus a threshold is not after current_time; the threshold is half
++ * the distance to i_next_pts, or 20000 if that is unknown. A frame that is
++ * not late resets the counter. Right after preroll, late frames are ignored
++ * for as long as the measured delay keeps shrinking. */
++static void update_late_frame_count( decoder_t *p_dec, block_t *p_block,
++                                     mtime_t current_time, mtime_t i_pts,
++                                     mtime_t i_next_pts )
++{
++    decoder_sys_t *p_sys = p_dec->p_sys;
++
++    /* Update frame late count (except when doing preroll) */
++    const bool b_preroll =
++        p_block != NULL && (p_block->i_flags & BLOCK_FLAG_PREROLL) != 0;
++    const mtime_t i_display_date =
++        b_preroll ? VLC_TS_INVALID : decoder_GetDisplayDate( p_dec, i_pts );
++
++    mtime_t i_threshold = 20000;
++    if( i_next_pts != VLC_TS_INVALID )
++        i_threshold = (i_next_pts - i_pts) / 2;
++
++    const bool b_late = i_display_date > VLC_TS_INVALID
++                     && i_display_date + i_threshold <= current_time;
++    if( !b_late )
++    {
++        p_sys->i_late_frames = 0;
++        return;
++    }
++
++    /* Out of preroll, consider only late frames on rising delay */
++    if( p_sys->b_from_preroll )
++    {
++        const mtime_t i_delay = current_time - i_display_date;
++        if( p_sys->i_last_late_delay > i_delay )
++        {
++            p_sys->i_last_late_delay = i_delay;
++            return;
++        }
++        p_sys->b_from_preroll = false;
++    }
++
++    /* Remember when the current run of late frames started */
++    if( ++p_sys->i_late_frames == 1 )
++        p_sys->i_late_frames_start = current_time;
++}
++
++
++ /**
++  * Propagate per-frame side data from an AVFrame to the output picture.
++  *
++  * Mastering-display and content-light-level metadata (when the libavutil
++  * version provides them) are copied into p_pic->format. If either differs
++  * from what p_dec->fmt_out.video currently holds, fmt_out is updated and
++  * decoder_UpdateVideoFormat() is called. A53 closed-caption side data, if
++  * present, is extracted and queued with decoder_QueueCc().
++  *
++  * \param p_dec decoder instance
++  * \param frame decoded frame whose side data is inspected
++  * \param p_pic picture receiving the metadata; its date stamps reordered CCs
++  * \return 0 on success, -1 if decoder_UpdateVideoFormat() failed
++  */
++ static int DecodeSidedata( decoder_t *p_dec, const AVFrame *frame, picture_t *p_pic )
++ {
++     decoder_sys_t *p_sys = p_dec->p_sys;
++     bool format_changed = false;
++
++ #if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 16, 101 ) )
++ /* Scale a rational by default_factor. A zero denominator (damaged or
++  * synthetic side data) yields 0 rather than an integer division by zero. */
++ #define FROM_AVRAT(default_factor, avrat) \
++ ((avrat).den == 0 ? (uint64_t)0 : \
++  (uint64_t)(default_factor) * (avrat).num / (avrat).den)
++     const AVFrameSideData *metadata =
++         av_frame_get_side_data( frame,
++                                 AV_FRAME_DATA_MASTERING_DISPLAY_METADATA );
++     if ( metadata )
++     {
++         const AVMasteringDisplayMetadata *hdr_meta =
++             (const AVMasteringDisplayMetadata *) metadata->data;
++         if ( hdr_meta->has_luminance )
++         {
++ #define ST2086_LUMA_FACTOR 10000
++             p_pic->format.mastering.max_luminance =
++                 FROM_AVRAT(ST2086_LUMA_FACTOR, hdr_meta->max_luminance);
++             p_pic->format.mastering.min_luminance =
++                 FROM_AVRAT(ST2086_LUMA_FACTOR, hdr_meta->min_luminance);
++         }
++         if ( hdr_meta->has_primaries )
++         {
++ /* Destination slot order (ST2086_*) differs from libavutil's (LAV_*) */
++ #define ST2086_RED 2
++ #define ST2086_GREEN 0
++ #define ST2086_BLUE 1
++ #define LAV_RED 0
++ #define LAV_GREEN 1
++ #define LAV_BLUE 2
++ #define ST2086_PRIM_FACTOR 50000
++             p_pic->format.mastering.primaries[ST2086_RED*2 + 0] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_RED][0]);
++             p_pic->format.mastering.primaries[ST2086_RED*2 + 1] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_RED][1]);
++             p_pic->format.mastering.primaries[ST2086_GREEN*2 + 0] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_GREEN][0]);
++             p_pic->format.mastering.primaries[ST2086_GREEN*2 + 1] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_GREEN][1]);
++             p_pic->format.mastering.primaries[ST2086_BLUE*2 + 0] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_BLUE][0]);
++             p_pic->format.mastering.primaries[ST2086_BLUE*2 + 1] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->display_primaries[LAV_BLUE][1]);
++             p_pic->format.mastering.white_point[0] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->white_point[0]);
++             p_pic->format.mastering.white_point[1] =
++                 FROM_AVRAT(ST2086_PRIM_FACTOR, hdr_meta->white_point[1]);
++         }
++
++         /* Only request a format update when the metadata really changed */
++         if ( memcmp( &p_dec->fmt_out.video.mastering,
++                      &p_pic->format.mastering,
++                      sizeof(p_pic->format.mastering) ) )
++         {
++             p_dec->fmt_out.video.mastering = p_pic->format.mastering;
++             format_changed = true;
++         }
++ #undef FROM_AVRAT
++     }
++ #endif
++ #if (LIBAVUTIL_VERSION_MICRO >= 100 && LIBAVUTIL_VERSION_INT >= AV_VERSION_INT( 55, 60, 100 ) )
++     const AVFrameSideData *metadata_lt =
++         av_frame_get_side_data( frame,
++                                 AV_FRAME_DATA_CONTENT_LIGHT_LEVEL );
++     if ( metadata_lt )
++     {
++         const AVContentLightMetadata *light_meta =
++             (const AVContentLightMetadata *) metadata_lt->data;
++         p_pic->format.lighting.MaxCLL = light_meta->MaxCLL;
++         p_pic->format.lighting.MaxFALL = light_meta->MaxFALL;
++         if ( memcmp( &p_dec->fmt_out.video.lighting,
++                      &p_pic->format.lighting,
++                      sizeof(p_pic->format.lighting) ) )
++         {
++             p_dec->fmt_out.video.lighting = p_pic->format.lighting;
++             format_changed = true;
++         }
++     }
++ #endif
++
++     if (format_changed && decoder_UpdateVideoFormat( p_dec ))
++         return -1;
++
++     const AVFrameSideData *p_avcc = av_frame_get_side_data( frame, AV_FRAME_DATA_A53_CC );
++     if( p_avcc )
++     {
++         cc_Extract( &p_sys->cc, CC_PAYLOAD_RAW, true, p_avcc->data, p_avcc->size );
++         if( p_sys->cc.b_reorder || p_sys->cc.i_data )
++         {
++             block_t *p_cc = block_Alloc( p_sys->cc.i_data );
++             if( p_cc )
++             {
++                 memcpy( p_cc->p_buffer, p_sys->cc.p_data, p_sys->cc.i_data );
++                 /* Reordered captions take the picture date; otherwise the
++                  * block's own dts is mirrored into its pts */
++                 if( p_sys->cc.b_reorder )
++                     p_cc->i_dts = p_cc->i_pts = p_pic->date;
++                 else
++                     p_cc->i_pts = p_cc->i_dts;
++                 decoder_cc_desc_t desc;
++                 desc.i_608_channels = p_sys->cc.i_608channels;
++                 desc.i_708_channels = p_sys->cc.i_708channels;
++                 desc.i_reorder_depth = 4;
++                 decoder_QueueCc( p_dec, p_cc, &desc );
++             }
++             /* The extracted data is dropped even if the block alloc failed */
++             cc_Flush( &p_sys->cc );
++         }
++     }
++     return 0;
++ }
++
++/*****************************************************************************
++ * DecodeBlock: Called to decode one or more frames
++ *****************************************************************************/
++
++static picture_t *DecodeBlock( decoder_t *p_dec, block_t **pp_block, bool *error )
++{
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ AVCodecContext *p_context = p_sys->p_context;
++ /* Boolean if we assume that we should get valid pic as result */
++ bool b_need_output_picture = true;
++
++ /* Boolean for END_OF_SEQUENCE */
++ bool eos_spotted = false;
++
++#if TRACE_ALL
++ msg_Dbg(p_dec, "<<< %s: (buf_size=%d)", __func__, pp_block == NULL || *pp_block == NULL ? 0 : (*pp_block)->i_buffer);
++#endif
++
++ block_t *p_block;
++ mtime_t current_time;
++ picture_t *p_pic = NULL;
++ AVFrame *frame = NULL;
++
++ // By default we are OK
++ *error = false;
++
++ if( !p_context->extradata_size && p_dec->fmt_in.i_extra )
++ {
++ ffmpeg_InitCodec( p_dec );
++ if( !avcodec_is_open( p_context ) )
++ OpenVideoCodec( p_dec );
++ }
++
++ p_block = pp_block ? *pp_block : NULL;
++ if(!p_block && !(p_sys->p_codec->capabilities & AV_CODEC_CAP_DELAY) )
++ return NULL;
++
++ if( !avcodec_is_open( p_context ) )
++ {
++ if( p_block )
++ block_Release( p_block );
++ return NULL;
++ }
++
++ if( !check_block_validity( p_sys, p_block ) )
++ return NULL;
++
++ current_time = mdate();
++ if( p_dec->b_frame_drop_allowed && check_block_being_late( p_sys, p_block, current_time) )
++ {
++ msg_Err( p_dec, "more than 5 seconds of late video -> "
++ "dropping frame (computer too slow ?)" );
++ return NULL;
++ }
++
++
++ /* A good idea could be to decode all I pictures and see for the other */
++
++ /* Defaults that if we aren't in prerolling, we want output picture
++ same for if we are flushing (p_block==NULL) */
++ if( !p_block || !(p_block->i_flags & BLOCK_FLAG_PREROLL) )
++ b_need_output_picture = true;
++ else
++ b_need_output_picture = false;
++
++ /* Change skip_frame config only if hurry_up is enabled */
++ if( p_sys->b_hurry_up )
++ {
++ p_context->skip_frame = p_sys->i_skip_frame;
++
++ /* Check also if we should/can drop the block and move to next block
++ as trying to catchup the speed*/
++ if( p_dec->b_frame_drop_allowed &&
++ check_frame_should_be_dropped( p_sys, p_context, &b_need_output_picture ) )
++ {
++ if( p_block )
++ block_Release( p_block );
++ msg_Warn( p_dec, "More than 11 late frames, dropping frame" );
++ return NULL;
++ }
++ }
++ if( !b_need_output_picture )
++ {
++ p_context->skip_frame = __MAX( p_context->skip_frame,
++ AVDISCARD_NONREF );
++ }
++
++ /*
++ * Do the actual decoding now */
++
++ /* Don't forget that libavcodec requires a little more bytes
++ * that the real frame size */
++ if( p_block && p_block->i_buffer > 0 )
++ {
++ eos_spotted = ( p_block->i_flags & BLOCK_FLAG_END_OF_SEQUENCE ) != 0;
++
++ p_block = block_Realloc( p_block, 0,
++ p_block->i_buffer + FF_INPUT_BUFFER_PADDING_SIZE );
++ if( !p_block )
++ return NULL;
++ p_block->i_buffer -= FF_INPUT_BUFFER_PADDING_SIZE;
++ *pp_block = p_block;
++ memset( p_block->p_buffer + p_block->i_buffer, 0,
++ FF_INPUT_BUFFER_PADDING_SIZE );
++ }
++
++ while( !p_block || p_block->i_buffer > 0 || eos_spotted )
++ {
++ int i_used;
++ AVPacket pkt;
++
++ post_mt( p_sys );
++
++ av_init_packet( &pkt );
++ if( p_block && p_block->i_buffer > 0 )
++ {
++ pkt.data = p_block->p_buffer;
++ pkt.size = p_block->i_buffer;
++ pkt.pts = p_block->i_pts > VLC_TS_INVALID ? p_block->i_pts : AV_NOPTS_VALUE;
++ pkt.dts = p_block->i_dts > VLC_TS_INVALID ? p_block->i_dts : AV_NOPTS_VALUE;
++ }
++ else
++ {
++ /* Return delayed frames if codec has CODEC_CAP_DELAY */
++ pkt.data = NULL;
++ pkt.size = 0;
++ }
++
++ if( !p_sys->palette_sent )
++ {
++ uint8_t *pal = av_packet_new_side_data(&pkt, AV_PKT_DATA_PALETTE, AVPALETTE_SIZE);
++ if (pal) {
++ memcpy(pal, p_dec->fmt_in.video.p_palette->palette, AVPALETTE_SIZE);
++ p_sys->palette_sent = true;
++ }
++ }
++
++ /* Make sure we don't reuse the same timestamps twice */
++ if( p_block )
++ {
++ p_block->i_pts =
++ p_block->i_dts = VLC_TS_INVALID;
++ }
++
++ int ret = avcodec_send_packet(p_context, &pkt);
++ if( ret != 0 && ret != AVERROR(EAGAIN) )
++ {
++ if (ret == AVERROR(ENOMEM) || ret == AVERROR(EINVAL))
++ {
++ msg_Err(p_dec, "avcodec_send_packet critical error");
++ *error = true;
++ }
++ av_packet_unref( &pkt );
++ break;
++ }
++ i_used = ret != AVERROR(EAGAIN) ? pkt.size : 0;
++ av_packet_unref( &pkt );
++
++ frame = av_frame_alloc();
++ if (unlikely(frame == NULL))
++ {
++ *error = true;
++ break;
++ }
++
++ ret = avcodec_receive_frame(p_context, frame);
++ if( ret != 0 && ret != AVERROR(EAGAIN) )
++ {
++ msg_Dbg(p_dec, "No receive");
++ if (ret == AVERROR(ENOMEM) || ret == AVERROR(EINVAL))
++ {
++ msg_Err(p_dec, "avcodec_receive_frame critical error");
++ *error = true;
++ }
++ av_frame_free(&frame);
++ /* After draining, we need to reset decoder with a flush */
++ if( ret == AVERROR_EOF )
++ avcodec_flush_buffers( p_sys->p_context );
++ break;
++ }
++ bool not_received_frame = ret;
++
++ wait_mt( p_sys );
++
++ if( eos_spotted )
++ p_sys->b_first_frame = true;
++
++ if( p_block )
++ {
++ if( p_block->i_buffer <= 0 )
++ eos_spotted = false;
++
++ /* Consumed bytes */
++ p_block->p_buffer += i_used;
++ p_block->i_buffer -= i_used;
++ }
++
++ /* Nothing to display */
++ if( not_received_frame )
++ {
++// msg_Dbg(p_dec, "No rx: used=%d", i_used);
++ av_frame_free(&frame);
++ if( i_used == 0 ) break;
++ continue;
++ }
++
++ /* Compute the PTS */
++#ifdef FF_API_PKT_PTS
++ mtime_t i_pts = frame->pts;
++#else
++ mtime_t i_pts = frame->pkt_pts;
++#endif
++ if (i_pts == AV_NOPTS_VALUE )
++ i_pts = frame->pkt_dts;
++
++ if( i_pts == AV_NOPTS_VALUE )
++ i_pts = date_Get( &p_sys->pts );
++
++ /* Interpolate the next PTS */
++ if( i_pts > VLC_TS_INVALID )
++ date_Set( &p_sys->pts, i_pts );
++
++ const mtime_t i_next_pts = interpolate_next_pts(p_dec, frame);
++
++ update_late_frame_count( p_dec, p_block, current_time, i_pts, i_next_pts);
++
++ if( !b_need_output_picture ||
++// ( !p_sys->p_va && !frame->linesize[0] ) ||
++ ( !frame->linesize[0] ) ||
++ ( p_dec->b_frame_drop_allowed && (frame->flags & AV_FRAME_FLAG_CORRUPT) &&
++ !p_sys->b_show_corrupted ) )
++ {
++ av_frame_free(&frame);
++// msg_Dbg(p_dec, "Bad frame");
++ continue;
++ }
++
++ if( p_context->pix_fmt == AV_PIX_FMT_PAL8
++ && !p_dec->fmt_out.video.p_palette )
++ {
++ /* See AV_PIX_FMT_PAL8 comment in avc_GetVideoFormat(): update the
++ * fmt_out palette and change the fmt_out chroma to request a new
++ * vout */
++ assert( p_dec->fmt_out.video.i_chroma != VLC_CODEC_RGBP );
++
++ video_palette_t *p_palette;
++ p_palette = p_dec->fmt_out.video.p_palette
++ = malloc( sizeof(video_palette_t) );
++ if( !p_palette )
++ {
++ *error = true;
++ av_frame_free(&frame);
++ break;
++ }
++ static_assert( sizeof(p_palette->palette) == AVPALETTE_SIZE,
++ "Palette size mismatch between vlc and libavutil" );
++ assert( frame->data[1] != NULL );
++ memcpy( p_palette->palette, frame->data[1], AVPALETTE_SIZE );
++ p_palette->i_entries = AVPALETTE_COUNT;
++ p_dec->fmt_out.video.i_chroma = VLC_CODEC_RGBP;
++ if( decoder_UpdateVideoFormat( p_dec ) )
++ {
++ av_frame_free(&frame);
++ continue;
++ }
++ }
++
++#if 1
++ {
++ cma_buf_t * const cb = av_rpi_zc_buf_v(frame->buf[0]);
++
++ if (cb == NULL)
++ {
++ msg_Err(p_dec, "Frame has no attached CMA buffer");
++ goto fail;
++ }
++
++ if (lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++ p_context->pix_fmt) != 0)
++ {
++ msg_Err(p_dec, "Failed to update format");
++ goto fail;
++ }
++
++ if ((p_pic = decoder_NewPicture(p_dec)) == NULL)
++ {
++ msg_Err(p_dec, "Failed to allocate pic");
++ goto fail;
++ }
++
++ if (cma_buf_pic_attach(cma_buf_ref(cb), p_pic) != 0)
++ {
++ cma_buf_unref(cb); // Undo the in_flight
++ char dbuf0[5];
++ msg_Err(p_dec, "Failed to attach bufs to pic: fmt=%s", str_fourcc(dbuf0, p_pic->format.i_chroma));
++ goto fail;
++ }
++
++ // ****** Set planes etc.
++ set_pic_from_frame(p_pic, frame);
++ }
++#else
++ picture_t *p_pic = frame->opaque;
++ if( p_pic == NULL )
++ { /* When direct rendering is not used, get_format() and get_buffer()
++ * might not be called. The output video format must be set here
++ * then picture buffer can be allocated. */
++ if (p_sys->p_va == NULL
++ && lavc_UpdateVideoFormat(p_dec, p_context, p_context->pix_fmt,
++ p_context->pix_fmt) == 0)
++ p_pic = decoder_NewPicture(p_dec);
++
++ if( !p_pic )
++ {
++ av_frame_free(&frame);
++ break;
++ }
++
++ /* Fill picture_t from AVFrame */
++ if( lavc_CopyPicture( p_dec, p_pic, frame ) != VLC_SUCCESS )
++ {
++ av_frame_free(&frame);
++ picture_Release( p_pic );
++ break;
++ }
++ }
++ else
++ {
++ /* Some codecs can return the same frame multiple times. By the
++ * time that the same frame is returned a second time, it will be
++ * too late to clone the underlying picture. So clone proactively.
++ * A single picture CANNOT be queued multiple times.
++ */
++ p_pic = picture_Clone( p_pic );
++ if( unlikely(p_pic == NULL) )
++ {
++ av_frame_free(&frame);
++ break;
++ }
++ }
++#endif
++
++ if( !p_dec->fmt_in.video.i_sar_num || !p_dec->fmt_in.video.i_sar_den )
++ {
++ /* Fetch again the aspect ratio in case it changed */
++ p_dec->fmt_out.video.i_sar_num
++ = p_context->sample_aspect_ratio.num;
++ p_dec->fmt_out.video.i_sar_den
++ = p_context->sample_aspect_ratio.den;
++
++ if( !p_dec->fmt_out.video.i_sar_num || !p_dec->fmt_out.video.i_sar_den )
++ {
++ p_dec->fmt_out.video.i_sar_num = 1;
++ p_dec->fmt_out.video.i_sar_den = 1;
++ }
++ }
++
++ p_pic->date = i_pts;
++ /* Hack to force display of still pictures */
++ p_pic->b_force = p_sys->b_first_frame;
++ p_pic->i_nb_fields = 2 + frame->repeat_pict;
++ p_pic->b_progressive = !frame->interlaced_frame;
++ p_pic->b_top_field_first = frame->top_field_first;
++
++ if (DecodeSidedata(p_dec, frame, p_pic))
++ i_pts = VLC_TS_INVALID;
++
++ av_frame_free(&frame);
++
++ /* Send decoded frame to vout */
++ if (i_pts > VLC_TS_INVALID)
++ {
++ p_sys->b_first_frame = false;
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: Got pic", __func__);
++#endif
++ return p_pic;
++ }
++ else
++ picture_Release( p_pic );
++ }
++
++ if( p_block )
++ block_Release( p_block );
++
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: NULL", __func__);
++#endif
++ return NULL;
++
++fail:
++#if TRACE_ALL
++ msg_Dbg(p_dec, ">>> %s: FAIL", __func__);
++#endif
++ av_frame_free(&frame);
++ if (p_pic != NULL)
++ picture_Release(p_pic);
++ if (p_block != NULL)
++ block_Release(p_block);
++ *error = true;
++ return NULL;
++}
++
++ /* Decoder entry point: decode p_block (NULL means drain) and queue every
++  * picture DecodeBlock() yields for it. Always reports VLCDEC_SUCCESS. */
++ static int DecodeVideo( decoder_t *p_dec, block_t *p_block )
++ {
++     bool error = false;
++     block_t **const pp_block = ( p_block != NULL ) ? &p_block : NULL;
++
++     for( ;; )
++     {
++         picture_t *const p_pic = DecodeBlock( p_dec, pp_block, &error );
++         if( p_pic == NULL )
++             break;
++         decoder_QueueVideo( p_dec, p_pic );
++     }
++
++     // The error flag is deliberately ignored: returning a real error
++     // (VLCDEC_ECRITICAL) seems to kill output forever
++     return VLCDEC_SUCCESS;
++ }
++
++/*****************************************************************************
++ * EndVideo: decoder destruction
++ *****************************************************************************
++ * This function is called when the thread ends after a successful
++ * initialization.
++ *****************************************************************************/
++static void MmalAvcodecCloseDecoder( vlc_object_t *obj )
++{
++ decoder_t *p_dec = (decoder_t *)obj;
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ AVCodecContext *ctx = p_sys->p_context;
++// void *hwaccel_context;
++
++ msg_Dbg(obj, "<<< %s", __func__);
++
++ post_mt( p_sys );
++
++ cma_buf_pool_cancel(p_sys->cma_pool); // Abort any pending frame allocs
++
++ /* do not flush buffers if codec hasn't been opened (theora/vorbis/VC1) */
++ if( avcodec_is_open( ctx ) )
++ avcodec_flush_buffers( ctx );
++
++ av_rpi_zc_uninit2(ctx);
++
++ wait_mt( p_sys );
++
++ cc_Flush( &p_sys->cc );
++
++// hwaccel_context = ctx->hwaccel_context;
++ avcodec_free_context( &ctx );
++
++// if( p_sys->p_va )
++// vlc_va_Delete( p_sys->p_va, &hwaccel_context );
++
++ cma_vcsm_exit(p_sys->vcsm_init_type);
++
++ vlc_sem_destroy( &p_sys->sem_mt );
++ free( p_sys );
++}
++
++/*****************************************************************************
++ * ffmpeg_InitCodec: setup codec extra initialization data for ffmpeg
++ *****************************************************************************/
++static void ffmpeg_InitCodec( decoder_t *p_dec )
++{
++ decoder_sys_t *p_sys = p_dec->p_sys;
++ size_t i_size = p_dec->fmt_in.i_extra;
++
++ if( !i_size ) return;
++
++ if( p_sys->p_codec->id == AV_CODEC_ID_SVQ3 )
++ {
++ uint8_t *p;
++
++ p_sys->p_context->extradata_size = i_size + 12;
++ p = p_sys->p_context->extradata =
++ av_malloc( p_sys->p_context->extradata_size +
++ FF_INPUT_BUFFER_PADDING_SIZE );
++ if( !p )
++ return;
++
++ memcpy( &p[0], "SVQ3", 4 );
++ memset( &p[4], 0, 8 );
++ memcpy( &p[12], p_dec->fmt_in.p_extra, i_size );
++
++ /* Now remove all atoms before the SMI one */
++ if( p_sys->p_context->extradata_size > 0x5a &&
++ strncmp( (char*)&p[0x56], "SMI ", 4 ) )
++ {
++ uint8_t *psz = &p[0x52];
++
++ while( psz < &p[p_sys->p_context->extradata_size - 8] )
++ {
++ uint_fast32_t atom_size = GetDWBE( psz );
++ if( atom_size <= 1 )
++ {
++ /* FIXME handle 1 as long size */
++ break;
++ }
++ if( !strncmp( (char*)&psz[4], "SMI ", 4 ) )
++ {
++ memmove( &p[0x52], psz,
++ &p[p_sys->p_context->extradata_size] - psz );
++ break;
++ }
++
++ psz += atom_size;
++ }
++ }
++ }
++ else
++ {
++ p_sys->p_context->extradata_size = i_size;
++ p_sys->p_context->extradata =
++ av_malloc( i_size + FF_INPUT_BUFFER_PADDING_SIZE );
++ if( p_sys->p_context->extradata )
++ {
++ memcpy( p_sys->p_context->extradata,
++ p_dec->fmt_in.p_extra, i_size );
++ memset( p_sys->p_context->extradata + i_size,
++ 0, FF_INPUT_BUFFER_PADDING_SIZE );
++ }
++ }
++}
++
++
++ // VLC module descriptor: a "video decoder" capability with score 80,
++ // selectable by the shortcut "mmal_avcodec"
++ vlc_module_begin()
++ set_category( CAT_INPUT )
++ set_subcategory( SUBCAT_INPUT_VCODEC )
++ set_shortname(N_("MMAL avcodec"))
++ set_description(N_("MMAL buffered avcodec "))
++ set_capability("video decoder", 80)
++ add_shortcut("mmal_avcodec")
++ // Integer option named by MMAL_AVCODEC_BUFFERS, default -1
++ add_integer(MMAL_AVCODEC_BUFFERS, -1, MMAL_AVCODEC_BUFFERS_TEXT,
++ MMAL_AVCODEC_BUFFERS_LONGTEXT, true)
++ // Open / close entry points of the decoder
++ set_callbacks(MmalAvcodecOpenDecoder, MmalAvcodecCloseDecoder)
++ vlc_module_end()
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_cma.c
+@@ -0,0 +1,668 @@
++#ifdef HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <stdatomic.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++
++#include "mmal_cma.h"
++#include "mmal_picture.h"
++
++#include <assert.h>
++
++#define TRACE_ALL 0
++
++//-----------------------------------------------------------------------------
++//
++// Generic pool functions
++// Knows nothing about pool entries
++
++ // Element allocator: v is the pool's alloc_v, size the requested element
++ // size; returns the new element or NULL
++ typedef void * cma_pool_alloc_fn(void * v, size_t size);
++ // Element destructor: v is the pool's alloc_v, el the element to free and
++ // size the element size it was pooled under
++ typedef void cma_pool_free_fn(void * v, void * el, size_t size);
++
++#if TRACE_ALL
++static atomic_int pool_seq;
++#endif
++
++ // Pool structure
++ // Ref count is held by pool owner and pool els that have been got
++ // Els in the pool do not count towards its ref count
++ struct cma_pool_fixed_s
++ {
++ atomic_int ref_count;
++
++ vlc_mutex_t lock; // Taken around accesses to the fields below
++ unsigned int n_in; // Slot index the next get reads from
++ unsigned int n_out; // Slot index the next put writes to
++ unsigned int pool_size; // Number of slots in pool[]
++ int flight_size; // Flight allowance; in_flight starts at -flight_size
++ size_t el_size; // Size of the elements currently pooled
++ void ** pool; // Slot array, created lazily by put; NULL slot = empty
++
++ bool cancel; // When set, get returns NULL instead of waiting/allocating
++ int in_flight; // get waits on flight_cond while this is > 0
++ vlc_cond_t flight_cond; // Signalled on put, flight decrease, resize, cancel
++
++ void * alloc_v; // Opaque value passed to the callbacks below
++ cma_pool_alloc_fn * el_alloc_fn;
++ cma_pool_free_fn * el_free_fn;
++ cma_pool_on_delete_fn * on_delete_fn; // Optional; called after the pool is freed
++
++ const char * name; // strdup()ed copy of the creator's name (may be NULL)
++ #if TRACE_ALL
++ int seq; // Creation sequence number, for trace output only
++ #endif
++ };
++
++ // Advance ring index n by one, wrapping to 0 once it would reach m
++ static inline unsigned int inc_mod(const unsigned int n, const unsigned int m)
++ {
++     const unsigned int next = n + 1;
++
++     if (next >= m)
++         return 0;
++     return next;
++ }
++
++ // Free every element still held in a slot array of pool_size entries via
++ // the pool's el_free_fn, then free the array itself. NULL pool is a no-op.
++ static void free_pool(const cma_pool_fixed_t * const p, void ** const pool,
++     const unsigned int pool_size, const size_t el_size)
++ {
++     if (pool != NULL)
++     {
++         void ** const end = pool + pool_size;
++
++         for (void ** slot = pool; slot != end; ++slot)
++         {
++             void * const el = *slot;
++             if (el == NULL)
++                 continue;   // Empty slot
++             p->el_free_fn(p->alloc_v, el, el_size);
++         }
++         free(pool);
++     }
++ }
++
++ // Destroy the pool unconditionally - the caller is responsible for the
++ // ref count having dropped
++ static void cma_pool_fixed_delete(cma_pool_fixed_t * const p)
++ {
++     // Captured up front: both are needed after p itself has been freed
++     void *const container = p->alloc_v;
++     cma_pool_on_delete_fn *const notify = p->on_delete_fn;
++
++     free_pool(p, p->pool, p->pool_size, p->el_size);
++
++     // free(NULL) is a no-op so an unnamed pool needs no special case
++     free((void *)p->name); // Discard const
++
++     vlc_cond_destroy(&p->flight_cond);
++     vlc_mutex_destroy(&p->lock);
++     free(p);
++
++     // Tell the container (if it registered a callback) that we are gone
++     if (notify != NULL)
++         notify(container);
++ }
++
++ // Drop one reference; the pool is deleted when the last one goes
++ static void cma_pool_fixed_unref(cma_pool_fixed_t * const p)
++ {
++     const int refs_before = atomic_fetch_sub(&p->ref_count, 1);
++
++     if (refs_before > 1)
++         return;
++     cma_pool_fixed_delete(p);
++ }
++
++ // Take an additional reference on the pool
++ static void cma_pool_fixed_ref(cma_pool_fixed_t * const p)
++ {
++     (void)atomic_fetch_add(&p->ref_count, 1);
++ }
++
++ // Count one more element as in flight (under the pool lock)
++ static void cma_pool_fixed_inc_in_flight(cma_pool_fixed_t * const p)
++ {
++     vlc_mutex_lock(&p->lock);
++     p->in_flight += 1;
++     vlc_mutex_unlock(&p->lock);
++ }
++
++ // Count one element fewer as in flight; a waiter on flight_cond is woken
++ // when the count lands exactly on zero
++ static void cma_pool_fixed_dec_in_flight(cma_pool_fixed_t * const p)
++ {
++     vlc_mutex_lock(&p->lock);
++     p->in_flight -= 1;
++     if (p->in_flight == 0)
++         vlc_cond_signal(&p->flight_cond);
++     vlc_mutex_unlock(&p->lock);
++ }
++
++ // Get an element of req_el_size from the pool, allocating one if needed.
++ //
++ // If req_el_size differs from the pool's current element size, the pooled
++ // elements are discarded (freed outside the lock) and the pool switches to
++ // the new size. req_el_size == 0 therefore acts as a flush and returns
++ // NULL; a cancelled pool returns NULL too.
++ // Unless no_pool is set, a pooled element is used when one is available.
++ // Otherwise a new element is allocated with el_alloc_fn once in_flight <= 0;
++ // while in_flight > 0 the call waits on flight_cond and retries.
++ // inc_flight: count the returned element as in flight (undone on failure).
++ // A successful get takes a reference on the pool.
++ // Returns the element, or NULL on flush, cancel or allocation failure.
++ static void * cma_pool_fixed_get(cma_pool_fixed_t * const p, const size_t req_el_size, const bool inc_flight, const bool no_pool)
++ {
++ void * v = NULL;
++
++ vlc_mutex_lock(&p->lock);
++
++ for (;;)
++ {
++ if (req_el_size != p->el_size)
++ {
++ // Size change: detach the old slot array and restart with an empty pool
++ void ** const deadpool = p->pool;
++ const size_t dead_size = p->el_size;
++ const unsigned int dead_n = p->pool_size;
++
++ p->pool = NULL;
++ p->n_in = 0;
++ p->n_out = 0;
++ p->el_size = req_el_size;
++
++ if (deadpool != NULL)
++ {
++ vlc_mutex_unlock(&p->lock);
++ // Do the free old op outside the mutex in case the free is slow
++ free_pool(p, deadpool, dead_n, dead_size);
++ vlc_mutex_lock(&p->lock);
++ // State may have changed while unlocked: re-evaluate from the top
++ continue;
++ }
++ }
++
++ // Late abort if flush or cancel so we can still kill the pool
++ if (req_el_size == 0 || p->cancel)
++ {
++ vlc_mutex_unlock(&p->lock);
++ return NULL;
++ }
++
++ if (p->pool != NULL && !no_pool)
++ {
++ // Take the element at the read index, if that slot is occupied
++ v = p->pool[p->n_in];
++ if (v != NULL)
++ {
++ p->pool[p->n_in] = NULL;
++ p->n_in = inc_mod(p->n_in, p->pool_size);
++ break;
++ }
++ }
++
++ // Nothing pooled: allocate below unless too many are in flight
++ if (p->in_flight <= 0)
++ break;
++
++ vlc_cond_wait(&p->flight_cond, &p->lock);
++ }
++
++ if (inc_flight)
++ ++p->in_flight;
++
++ vlc_mutex_unlock(&p->lock);
++
++ // Allocate outside the lock when the pool had nothing to offer
++ if (v == NULL && req_el_size != 0)
++ v = p->el_alloc_fn(p->alloc_v, req_el_size);
++
++ // Tag ref
++ if (v != NULL)
++ cma_pool_fixed_ref(p);
++ // Remove flight if we set it and error
++ else if (inc_flight)
++ cma_pool_fixed_dec_in_flight(p);
++
++ return v;
++ }
++
++ // Return an element obtained with cma_pool_fixed_get.
++ //
++ // v is stored in the slot at the write index when its size matches the
++ // pool's current element size and that slot is empty; otherwise it is freed
++ // with el_free_fn. The slot array is created lazily here; if that
++ // allocation fails, or the pool has no slots at all (pool_size == 0), the
++ // element is simply freed rather than stored.
++ // was_in_flight: the element was counted in in_flight and is removed from it.
++ // Always signals flight_cond and drops the pool reference taken by get
++ // (which may delete the pool).
++ static void cma_pool_fixed_put(cma_pool_fixed_t * const p, void * v, const size_t el_size, const bool was_in_flight)
++ {
++     vlc_mutex_lock(&p->lock);
++
++     if (el_size == p->el_size && p->pool_size != 0)
++     {
++         if (p->pool == NULL)
++             p->pool = calloc(p->pool_size, sizeof(void*));
++
++         // pool may still be NULL here if calloc failed - then v is freed below
++         if (p->pool != NULL && p->pool[p->n_out] == NULL)
++         {
++             p->pool[p->n_out] = v;
++             p->n_out = inc_mod(p->n_out, p->pool_size);
++             v = NULL;
++         }
++     }
++
++     if (was_in_flight)
++         --p->in_flight;
++
++     vlc_mutex_unlock(&p->lock);
++
++     vlc_cond_signal(&p->flight_cond);
++
++     // Not pooled (wrong size, pool full or no slot array): free it
++     if (v != NULL)
++         p->el_free_fn(p->alloc_v, v, el_size);
++
++     cma_pool_fixed_unref(p);
++ }
++
++ // Change the number of pool slots and/or the flight allowance.
++ //
++ // Pooled elements are moved, oldest first, into a new slot array of
++ // new_pool_size entries; elements that no longer fit are freed. A
++ // new_pool_size of 0 leaves the pool without a slot array.
++ // in_flight is adjusted by the change in flight size and waiters are woken
++ // when the allowance grows.
++ // Returns 0 on success, -1 if the new slot array cannot be allocated (the
++ // pool is then left unchanged).
++ static int cma_pool_fixed_resize(cma_pool_fixed_t * const p,
++     const unsigned int new_pool_size, const int new_flight_size)
++ {
++     void ** dead_pool = NULL;
++     size_t dead_size = 0;
++     unsigned int dead_n = 0;
++
++     // This makes this non-reentrant but saves us a lot of time in the normal
++     // "nothing happens" case
++     if (p->pool_size == new_pool_size && p->flight_size == new_flight_size)
++         return 0;
++
++     vlc_mutex_lock(&p->lock);
++
++     if (p->pool != NULL && new_pool_size != p->pool_size)
++     {
++         // calloc(0, ..) may legitimately return NULL, so a zero-slot pool
++         // is represented by having no array at all
++         void ** const new_pool = (new_pool_size == 0) ? NULL :
++             calloc(new_pool_size, sizeof(void*));
++         unsigned int d, s;
++
++         if (new_pool == NULL && new_pool_size != 0)
++         {
++             vlc_mutex_unlock(&p->lock);
++             return -1;
++         }
++
++         dead_pool = p->pool;
++         dead_size = p->el_size;
++         dead_n = p->pool_size;
++
++         // Copy from the read index until the new array is full or the old
++         // one runs out; moved slots are cleared so free_pool skips them
++         for (d = 0, s = p->n_in; d != new_pool_size && (new_pool[d] = dead_pool[s]) != NULL; ++d, s = inc_mod(s, dead_n))
++             dead_pool[s] = NULL;
++
++         // Elements now occupy [0, d): the next get reads slot 0 and the next
++         // put writes slot d (wrapping to 0 when the array is full)
++         p->n_in = 0;
++         p->n_out = (d != new_pool_size) ? d : 0;
++         p->pool = new_pool;
++     }
++
++     p->pool_size = new_pool_size;
++     if (new_flight_size > p->flight_size)
++         vlc_cond_broadcast(&p->flight_cond); // Lock still active so nothing happens till we release it
++     p->in_flight += p->flight_size - new_flight_size;
++     p->flight_size = new_flight_size;
++
++     vlc_mutex_unlock(&p->lock);
++
++     // Free what was left behind in the old array outside the lock
++     free_pool(p, dead_pool, dead_n, dead_size);
++     return 0;
++ }
++
++ // Pre-populate the pool with elements of el_size until the slot at the
++ // write index is occupied (i.e. the pool is full).
++ // A pool without slots (pool_size == 0) has nothing to fill and succeeds
++ // immediately.
++ // Returns 0 on success, -ENOMEM if an element or the slot array could not
++ // be obtained.
++ static int cma_pool_fixed_fill(cma_pool_fixed_t * const p, const size_t el_size)
++ {
++     for (;;)
++     {
++         vlc_mutex_lock(&p->lock);
++         const bool done = p->pool_size == 0 ||
++             (el_size == p->el_size && p->pool != NULL && p->pool[p->n_out] != NULL);
++         vlc_mutex_unlock(&p->lock);
++         if (done)
++             break;
++
++         // no_pool: always allocate a fresh element rather than recycling one
++         void * buf = cma_pool_fixed_get(p, el_size, false, true);
++         if (buf == NULL)
++             return -ENOMEM;
++         cma_pool_fixed_put(p, buf, el_size, false);
++
++         // If the size still matches but no slot array exists, the put could
++         // not store anything: give up instead of looping forever
++         vlc_mutex_lock(&p->lock);
++         const bool no_array = el_size == p->el_size && p->pool == NULL;
++         vlc_mutex_unlock(&p->lock);
++         if (no_array)
++             return -ENOMEM;
++     }
++     return 0;
++ }
++
++ // Put the pool into the cancelled state and wake every thread waiting in
++ // get so that it can observe the flag
++ static void cma_pool_fixed_cancel(cma_pool_fixed_t * const p)
++ {
++     vlc_mutex_t * const lock = &p->lock;
++
++     vlc_mutex_lock(lock);
++     p->cancel = true;
++     vlc_cond_broadcast(&p->flight_cond);
++     vlc_mutex_unlock(lock);
++ }
++
++ // Clear the cancelled state so that get can hand out elements again
++ static void cma_pool_fixed_uncancel(cma_pool_fixed_t * const p)
++ {
++     vlc_mutex_t * const lock = &p->lock;
++
++     vlc_mutex_lock(lock);
++     p->cancel = false;
++     vlc_mutex_unlock(lock);
++ }
++
++
++ // Owner is finished with the pool: flush pooled elements and drop the
++ // owner's reference. NULL is accepted and ignored.
++ static void cma_pool_fixed_kill(cma_pool_fixed_t * const p)
++ {
++     if (p != NULL)
++     {
++         // A zero-size get is a flush. Not strictly needed, but it reclaims
++         // what memory we can as early as possible
++         (void)cma_pool_fixed_get(p, 0, false, false);
++         cma_pool_fixed_unref(p);
++     }
++ }
++
++ // Create a pool with pool_size slots and a flight allowance of flight_size.
++ // alloc_v is handed to alloc_fn / free_fn / on_delete_fn; name is copied.
++ // The caller owns the single initial reference. Returns NULL if the pool
++ // structure cannot be allocated.
++ static cma_pool_fixed_t*
++ cma_pool_fixed_new(const unsigned int pool_size,
++     const int flight_size,
++     void * const alloc_v,
++     cma_pool_alloc_fn * const alloc_fn, cma_pool_free_fn * const free_fn,
++     cma_pool_on_delete_fn * const on_delete_fn,
++     const char * const name)
++ {
++     cma_pool_fixed_t* const p = calloc(1, sizeof(cma_pool_fixed_t));
++
++     if (p != NULL)
++     {
++         atomic_store(&p->ref_count, 1);
++         vlc_mutex_init(&p->lock);
++         vlc_cond_init(&p->flight_cond);
++
++         // Sizing; in_flight starts negative by the flight allowance
++         p->pool_size = pool_size;
++         p->flight_size = flight_size;
++         p->in_flight = -flight_size;
++
++         // Callbacks and their context
++         p->alloc_v = alloc_v;
++         p->el_alloc_fn = alloc_fn;
++         p->el_free_fn = free_fn;
++         p->on_delete_fn = on_delete_fn;
++
++         if (name != NULL)
++             p->name = strdup(name);
++         else
++             p->name = NULL;
++ #if TRACE_ALL
++         p->seq = atomic_fetch_add(&pool_seq, 1);
++ #endif
++     }
++
++     return p;
++ }
++
++// ---------------------------------------------------------------------------
++//
++// CMA buffer functions - uses cma_pool_fixed for pooling
++
++ // A pool of CMA buffers layered on a cma_pool_fixed
++ struct cma_buf_pool_s {
++ cma_pool_fixed_t * pool; // Underlying generic pool (NULL only if creation failed)
++ vcsm_init_type_t init_type; // Result of cma_vcsm_init(); passed to cma_vcsm_exit()
++
++ bool all_in_flight; // If set every allocated buffer is counted as in flight
++ #if TRACE_ALL
++ size_t alloc_n; // Number of live buffers (trace only)
++ size_t alloc_size; // Total bytes of live buffers (trace only)
++ #endif
++ };
++
++ // One CMA buffer; ref counted, returned to its pool when the count drops
++ typedef struct cma_buf_s {
++ atomic_int ref_count; // 0 while the buffer sits in the pool
++ cma_buf_pool_t * cbp; // Owning buffer pool
++ bool in_flight; // Currently counted in the pool's in_flight total
++ size_t size; // Allocation size in bytes
++ unsigned int vcsm_h; // VCSM handle from initial alloc
++ unsigned int vc_h; // VC handle for ZC mmal buffers
++ unsigned int vc_addr; // VC addr - unused by us but wanted by FFmpeg
++ int fd; // dmabuf handle for GL
++ void * mmap; // ARM mapped address
++ picture_context_t *ctx2; // Optional extra context, destroyed with the buffer
++ } cma_buf_t;
++
++ // Destroy a CMA buffer whose ref count is 0: destroy its attached context,
++ // drop the ARM mapping, close the dmabuf fd and free the VCSM handle.
++ // Also copes with partially constructed buffers.
++ static void cma_pool_delete(cma_buf_t * const cb)
++ {
++     assert(atomic_load(&cb->ref_count) == 0);
++ #if TRACE_ALL
++     cb->cbp->alloc_size -= cb->size;
++     --cb->cbp->alloc_n;
++     fprintf(stderr, "%s[%d:%s]: N=%d, Total=%d\n", __func__, cb->cbp->pool->seq, cb->cbp->pool->name, cb->cbp->alloc_n, cb->cbp->alloc_size);
++ #endif
++
++     picture_context_t * const extra_ctx = cb->ctx2;
++     if (extra_ctx != NULL)
++         extra_ctx->destroy(extra_ctx);
++
++     // How the mapping is undone depends on how VCSM was initialised
++     if (cb->mmap == MAP_FAILED)
++     {
++         // Never mapped - nothing to undo
++     }
++     else if (cb->cbp->init_type == VCSM_INIT_CMA)
++     {
++         munmap(cb->mmap, cb->size);
++     }
++     else
++     {
++         vcsm_unlock_hdl(cb->vcsm_h);
++     }
++
++     if (cb->fd != -1)
++         close(cb->fd);
++
++     if (cb->vcsm_h != 0)
++         vcsm_free(cb->vcsm_h);
++
++     free(cb);
++ }
++
++ // cma_pool_free_fn for the buffer pool: el is a cma_buf_t to destroy.
++ // The context and size arguments are not needed.
++ static void cma_pool_free_cb(void * v, void * el, size_t size)
++ {
++     cma_buf_t * const cb = el;
++
++     VLC_UNUSED(v);
++     VLC_UNUSED(size);
++
++     cma_pool_delete(cb);
++ }
++
++ // cma_pool_alloc_fn for the buffer pool: v is the owning cma_buf_pool_t.
++ // Allocates a VCSM buffer of size bytes, obtains its VC handle and maps it
++ // for the ARM side (dmabuf export + mmap in CMA mode, vcsm_lock otherwise).
++ // Returns the new cma_buf_t with ref count 0, or NULL on any failure, in
++ // which case everything acquired so far is released by cma_pool_delete.
++ static void * cma_pool_alloc_cb(void * v, size_t size)
++ {
++ cma_buf_pool_t * const cbp = v;
++
++ cma_buf_t * const cb = malloc(sizeof(cma_buf_t));
++ if (cb == NULL)
++ return NULL;
++
++ // Start from "nothing acquired" values so that cma_pool_delete can tell
++ // which resources need releasing
++ *cb = (cma_buf_t){
++ .ref_count = ATOMIC_VAR_INIT(0),
++ .cbp = cbp,
++ .in_flight = 0,
++ .size = size,
++ .vcsm_h = 0,
++ .vc_h = 0,
++ .fd = -1,
++ .mmap = MAP_FAILED,
++ .ctx2 = NULL
++ };
++ #if TRACE_ALL
++ cb->cbp->alloc_size += cb->size;
++ ++cb->cbp->alloc_n;
++ fprintf(stderr, "%s[%d:%s]: N=%d, Total=%d\n", __func__, cbp->pool->seq, cbp->pool->name, cbp->alloc_n, cbp->alloc_size);
++ #endif
++
++ // 0x80 is magic value to force full ARM-side mapping - otherwise
++ // cache requests can cause kernel crashes
++ if ((cb->vcsm_h = vcsm_malloc_cache(size, VCSM_CACHE_TYPE_HOST | 0x80, "VLC frame")) == 0)
++ {
++ #if TRACE_ALL
++ fprintf(stderr, "vcsm_malloc_cache fail\n");
++ #endif
++ goto fail;
++ }
++
++ if ((cb->vc_h = vcsm_vc_hdl_from_hdl(cb->vcsm_h)) == 0)
++ {
++ #if TRACE_ALL
++ fprintf(stderr, "vcsm_vc_hdl_from_hdl fail\n");
++ #endif
++ goto fail;
++ }
++
++ if (cbp->init_type == VCSM_INIT_CMA)
++ {
++ // CMA mode: export a dmabuf fd and map the buffer through it
++ if ((cb->fd = vcsm_export_dmabuf(cb->vcsm_h)) == -1)
++ {
++ #if TRACE_ALL
++ fprintf(stderr, "vcsm_export_dmabuf fail\n");
++ #endif
++ goto fail;
++ }
++
++ if ((cb->mmap = mmap(NULL, cb->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, cb->fd, 0)) == MAP_FAILED)
++ goto fail;
++ }
++ else
++ {
++ // Otherwise the ARM address comes from locking the handle; cb->mmap
++ // stays MAP_FAILED on failure so that delete does not unlock it
++ void * arm_addr;
++ if ((arm_addr = vcsm_lock(cb->vcsm_h)) == NULL)
++ {
++ #if TRACE_ALL
++ fprintf(stderr, "vcsm_lock fail\n");
++ #endif
++ goto fail;
++ }
++ cb->mmap = arm_addr;
++ }
++
++ cb->vc_addr = vcsm_vc_addr_from_hdl(cb->vcsm_h);
++
++ return cb;
++
++ fail:
++ cma_pool_delete(cb);
++ return NULL;
++ }
++
++ // on_delete callback of the underlying pool: once the pool has died it is
++ // safe to leave vcsm and release the buffer-pool wrapper itself
++ static void cma_buf_pool_on_delete_cb(void * v)
++ {
++     cma_buf_pool_t * const cbp = v;
++     const vcsm_init_type_t init_type = cbp->init_type;
++
++     cma_vcsm_exit(init_type);
++     free(cbp);
++ }
++
++ // Cancel the pool: pending and future buffer allocations return NULL.
++ // A NULL pool (or one without an underlying pool) is ignored.
++ void cma_buf_pool_cancel(cma_buf_pool_t * const cbp)
++ {
++     if (cbp != NULL && cbp->pool != NULL)
++         cma_pool_fixed_cancel(cbp->pool);
++ }
++
++ // Undo cma_buf_pool_cancel so that buffers can be allocated again.
++ // A NULL pool (or one without an underlying pool) is ignored.
++ void cma_buf_pool_uncancel(cma_buf_pool_t * const cbp)
++ {
++     if (cbp != NULL && cbp->pool != NULL)
++         cma_pool_fixed_uncancel(cbp->pool);
++ }
++
++ // User finished with pool. The wrapper itself is freed by
++ // cma_buf_pool_on_delete_cb, either now or when the last outstanding
++ // buffer has been returned. NULL is accepted and ignored.
++ void cma_buf_pool_delete(cma_buf_pool_t * const cbp)
++ {
++     if (cbp == NULL)
++     {
++         return;
++     }
++     else if (cbp->pool == NULL)
++     {
++         // Had no pool for some reason (error) but must still finish cleanup
++         cma_buf_pool_on_delete_cb(cbp);
++     }
++     else
++     {
++         // cma_buf_pool_on_delete_cb runs when the pool finally dies
++         // (might be now) and frees up our env
++         cma_pool_fixed_kill(cbp->pool);
++     }
++ }
++
++ // Pre-allocate buffers of el_size until the pool is full.
++ // Returns the result of cma_pool_fixed_fill (0 or a negative errno).
++ int cma_buf_pool_fill(cma_buf_pool_t * const cbp, const size_t el_size)
++ {
++     cma_pool_fixed_t * const pool = cbp->pool;
++
++     return cma_pool_fixed_fill(pool, el_size);
++ }
++
++ // Change the pool's slot count and flight allowance.
++ // Returns the result of cma_pool_fixed_resize (0 on success, -1 on failure).
++ int cma_buf_pool_resize(cma_buf_pool_t * const cbp,
++     const unsigned int new_pool_size, const int new_flight_size)
++ {
++     cma_pool_fixed_t * const pool = cbp->pool;
++
++     return cma_pool_fixed_resize(pool, new_pool_size, new_flight_size);
++ }
++
++ // Create a CMA buffer pool.
++ // pool_size: number of buffers kept for reuse; flight_size: flight
++ // allowance handed to the underlying pool; all_in_flight: count every
++ // allocated buffer as in flight; name: label copied by the underlying pool.
++ // Initialises vcsm; the matching cma_vcsm_exit happens when the pool dies,
++ // or right here if construction fails.
++ // Returns the pool, or NULL on failure.
++ cma_buf_pool_t * cma_buf_pool_new(const unsigned int pool_size, const unsigned int flight_size, const bool all_in_flight, const char * const name)
++ {
++     vcsm_init_type_t const init_type = cma_vcsm_init();
++     if (init_type == VCSM_INIT_NONE)
++         return NULL;
++
++     cma_buf_pool_t * const cbp = calloc(1, sizeof(cma_buf_pool_t));
++     if (cbp == NULL)
++     {
++         // Nobody else will balance the successful cma_vcsm_init() above
++         cma_vcsm_exit(init_type);
++         return NULL;
++     }
++
++     cbp->init_type = init_type;
++     cbp->all_in_flight = all_in_flight;
++
++     if ((cbp->pool = cma_pool_fixed_new(pool_size, flight_size, cbp, cma_pool_alloc_cb, cma_pool_free_cb, cma_buf_pool_on_delete_cb, name)) == NULL)
++         goto fail;
++     return cbp;
++
++ fail:
++     // With no underlying pool this exits vcsm and frees cbp
++     cma_buf_pool_delete(cbp);
++     return NULL;
++ }
++
++
++ // Mark a buffer as in flight and count it in its pool. Does nothing for
++ // pools that already count every buffer (all_in_flight).
++ void cma_buf_in_flight(cma_buf_t * const cb)
++ {
++     if (cb->cbp->all_in_flight)
++         return;
++
++     assert(!cb->in_flight);
++     cb->in_flight = true;
++     cma_pool_fixed_inc_in_flight(cb->cbp->pool);
++ }
++
++ // Clear a buffer's in-flight mark and remove it from its pool's count.
++ // No-op for NULL, for all_in_flight pools and for buffers not in flight.
++ void cma_buf_end_flight(cma_buf_t * const cb)
++ {
++     if (cb == NULL || cb->cbp->all_in_flight || !cb->in_flight)
++         return;
++
++     cb->in_flight = false;
++     cma_pool_fixed_dec_in_flight(cb->cbp->pool);
++ }
++
++
++ // Return vcsm handle (the one obtained when the buffer was allocated)
++ unsigned int cma_buf_vcsm_handle(const cma_buf_t * const cb)
++ {
++ return cb->vcsm_h;
++ }
++
++ // Return the buffer's allocation size in bytes
++ size_t cma_buf_size(const cma_buf_t * const cb)
++ {
++ return cb->size;
++ }
++
++ // Attach an extra picture context to the buffer; it is destroyed together
++ // with the buffer. Only one may be attached.
++ // Returns VLC_SUCCESS, or VLC_EGENERIC if a context is already attached.
++ int cma_buf_add_context2(cma_buf_t *const cb, picture_context_t * const ctx2)
++ {
++     if (cb->ctx2 == NULL)
++     {
++         cb->ctx2 = ctx2;
++         return VLC_SUCCESS;
++     }
++
++     return VLC_EGENERIC;
++ }
++
++ // Return the VC handle derived from the vcsm handle at allocation
++ unsigned int cma_buf_vc_handle(const cma_buf_t *const cb)
++ {
++ return cb->vc_h;
++ }
++
++ // Return the dmabuf fd, or -1 if none was exported (it is only exported
++ // when vcsm was initialised in CMA mode)
++ int cma_buf_fd(const cma_buf_t *const cb)
++ {
++ return cb->fd;
++ }
++
++ // Return the ARM-side mapped address of the buffer
++ void * cma_buf_addr(const cma_buf_t *const cb)
++ {
++ return cb->mmap;
++ }
++
++ // Return the VC address recorded for the buffer at allocation
++ unsigned int cma_buf_vc_addr(const cma_buf_t *const cb)
++ {
++ return cb->vc_addr;
++ }
++
++
++ // Return the extra context attached with cma_buf_add_context2, or NULL
++ picture_context_t * cma_buf_context2(const cma_buf_t *const cb)
++ {
++ return cb->ctx2;
++ }
++
++
++ // Drop a reference. When the last one goes the buffer is handed back to
++ // its pool, together with any in-flight count it held. NULL is ignored.
++ void cma_buf_unref(cma_buf_t * const cb)
++ {
++     if (cb == NULL)
++         return;
++
++     const int refs_before = atomic_fetch_sub(&cb->ref_count, 1);
++     if (refs_before > 1)
++         return;
++
++     // Clear the flag before the put: the buffer may be reused or freed
++     const bool was_in_flight = cb->in_flight;
++     cb->in_flight = false;
++     cma_pool_fixed_put(cb->cbp->pool, cb, cb->size, was_in_flight);
++ }
++
++ // Take a reference on the buffer and return it, so the call can be used
++ // inline. NULL is passed straight through.
++ cma_buf_t * cma_buf_ref(cma_buf_t * const cb)
++ {
++     if (cb != NULL)
++         (void)atomic_fetch_add(&cb->ref_count, 1);
++
++     return cb;
++ }
++
++ // Allocate a buffer of size bytes from the pool (recycled or new).
++ // Returns the buffer holding one reference, or NULL on failure / cancel.
++ cma_buf_t * cma_buf_pool_alloc_buf(cma_buf_pool_t * const cbp, const size_t size)
++ {
++     cma_buf_t *const cb = cma_pool_fixed_get(cbp->pool, size, cbp->all_in_flight, false);
++
++     if (cb != NULL)
++     {
++         cb->in_flight = cbp->all_in_flight;
++         // A fresh or recycled buffer arrives with a ref count of 0, so the
++         // caller's reference is taken here
++         cma_buf_ref(cb);
++     }
++
++     return cb;
++ }
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_cma.h
+@@ -0,0 +1,71 @@
++#ifndef VLC_MMAL_MMAL_CMA_H_
++#define VLC_MMAL_MMAL_CMA_H_
++
++
++// Opaque fixed-size pool type; the struct is not defined in this header.
++struct cma_pool_fixed_s;
++typedef struct cma_pool_fixed_s cma_pool_fixed_t;
++
++// Callback types handed to cma_pool_fixed_new(). The first argument "v" is
++// presumably the alloc_v pointer given at pool creation - confirm against
++// the pool implementation.
++typedef void * cma_pool_alloc_fn(void * v, size_t size);
++typedef void cma_pool_free_fn(void * v, void * el, size_t size);
++typedef void cma_pool_on_delete_fn(void * v);
++
++// Disabled (#if 0) prototypes for the low-level pool API - nothing below is
++// visible to includers of this header.
++// NOTE(review): cma_pool_fixed_get is declared here with 3 parameters but
++// cma_buf_pool_alloc_buf() calls it with 4, so this list looks out of date.
++#if 0
++void cma_pool_fixed_unref(cma_pool_fixed_t * const p);
++void cma_pool_fixed_ref(cma_pool_fixed_t * const p);
++void * cma_pool_fixed_get(cma_pool_fixed_t * const p, const size_t req_el_size, const bool in_flight);
++void cma_pool_fixed_put(cma_pool_fixed_t * const p, void * v, const size_t el_size, const bool was_in_flight);
++void cma_pool_fixed_inc_in_flight(cma_pool_fixed_t * const p);
++void cma_pool_fixed_dec_in_flight(cma_pool_fixed_t * const p);
++void cma_pool_fixed_cancel(cma_pool_fixed_t * const p);
++void cma_pool_fixed_uncancel(cma_pool_fixed_t * const p);
++void cma_pool_fixed_kill(cma_pool_fixed_t * const p);
++int cma_pool_fixed_resize(cma_pool_fixed_t * const p,
++ const unsigned int new_pool_size, const int new_flight_size);
++cma_pool_fixed_t * cma_pool_fixed_new(const unsigned int pool_size,
++ const int flight_size,
++ void * const alloc_v,
++ cma_pool_alloc_fn * const alloc_fn, cma_pool_free_fn * const free_fn,
++ cma_pool_on_delete_fn * const on_delete_fn,
++ const char * const name);
++#endif
++
++struct cma_buf_s;
++typedef struct cma_buf_s cma_buf_t;
++
++void cma_buf_in_flight(cma_buf_t * const cb);
++void cma_buf_end_flight(cma_buf_t * const cb);
++unsigned int cma_buf_vcsm_handle(const cma_buf_t * const cb);
++size_t cma_buf_size(const cma_buf_t * const cb);
++int cma_buf_add_context2(cma_buf_t *const cb, picture_context_t * const ctx2);
++unsigned int cma_buf_vc_handle(const cma_buf_t *const cb);
++int cma_buf_fd(const cma_buf_t *const cb);
++void * cma_buf_addr(const cma_buf_t *const cb);
++unsigned int cma_buf_vc_addr(const cma_buf_t *const cb);
++picture_context_t * cma_buf_context2(const cma_buf_t *const cb);
++
++void cma_buf_unref(cma_buf_t * const cb);
++cma_buf_t * cma_buf_ref(cma_buf_t * const cb);
++
++struct cma_buf_pool_s;
++typedef struct cma_buf_pool_s cma_buf_pool_t;
++
++cma_buf_t * cma_buf_pool_alloc_buf(cma_buf_pool_t * const p, const size_t size);
++void cma_buf_pool_cancel(cma_buf_pool_t * const cbp);
++void cma_buf_pool_uncancel(cma_buf_pool_t * const cbp);
++void cma_buf_pool_delete(cma_buf_pool_t * const p);
++int cma_buf_pool_fill(cma_buf_pool_t * const cbp, const size_t el_size);
++int cma_buf_pool_resize(cma_buf_pool_t * const cbp,
++ const unsigned int new_pool_size, const int new_flight_size);
++cma_buf_pool_t * cma_buf_pool_new(const unsigned int pool_size, const unsigned int flight_size,
++ const bool all_in_flight, const char * const name);
++
++// Delete the pool that *pp points at and zero the caller's pointer.
++// The pointer is cleared before the delete call; a NULL *pp is a no-op.
++static inline void cma_buf_pool_deletez(cma_buf_pool_t ** const pp)
++{
++    cma_buf_pool_t * const doomed = *pp;
++
++    if (doomed == NULL)
++        return;
++
++    *pp = NULL;
++    cma_buf_pool_delete(doomed);
++}
++
++#endif // VLC_MMAL_MMAL_CMA_H_
+--- /dev/null
++++ b/modules/hw/mmal/mmal_gl.h
+@@ -0,0 +1,45 @@
++// Trim this include list!
++
++#include <libdrm/drm.h>
++#include <libdrm/drm_mode.h>
++#include <libdrm/drm_fourcc.h>
++//#include <xf86drm.h>
++//#include <xf86drmMode.h>
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++#include <X11/Xlib-xcb.h>
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++#include <xcb/xcb.h>
++#include <xcb/dri3.h>
++
++struct mmal_gl_converter_s;
++
++// Buffer descriptor used by the GL path.
++// NOTE(review): mmal_cma.h also declares "struct cma_buf_s" / cma_buf_t as an
++// opaque type, and the accessors in mmal_cma.c read differently named fields
++// (vcsm_h, vc_h, mmap). Confirm this header is never combined with that
++// implementation in one translation unit.
++typedef struct cma_buf_s {
++ struct mmal_gl_converter_s * sys;
++
++ size_t size;
++ __u32 h_dumb;
++ int fd;
++ unsigned int h_vcsm; // value returned by hw_mmal_h_vcsm()
++ void * mapped_addr;
++ GLuint texture;
++} cma_buf_t;
++
++// Per-picture private data: hw_mmal_h_vcsm() casts picture_t::p_sys to this.
++typedef struct cma_pic_sys_s {
++ cma_buf_t * cmabuf; // may be NULL - checked by hw_mmal_h_vcsm()
++} cma_pic_sys_t;
++
++// Return the vcsm handle behind a picture, or 0 when the picture is not
++// VLC_CODEC_MMAL_GL_RGB32 or carries no CMA buffer.
++static inline unsigned int
++hw_mmal_h_vcsm(const picture_t * const pic)
++{
++    const cma_pic_sys_t * sys;
++
++    if (pic->format.i_chroma != VLC_CODEC_MMAL_GL_RGB32)
++        return 0;
++
++    sys = (cma_pic_sys_t *)pic->p_sys;
++    if (sys == NULL || sys->cmabuf == NULL)
++        return 0;
++
++    return sys->cmabuf->h_vcsm;
++}
++
+--- /dev/null
++++ b/modules/hw/mmal/mmal_piccpy_neon.S
+@@ -0,0 +1,105 @@
++// Copy pix
++//
++// NEON helpers that copy picture lines while narrowing 16-bit samples to 8 bits.
++
++ .syntax unified
++ .arm
++// .thumb
++ .text
++// On ARM targets ".align n" requests 2^n bytes, so ".align 16" asked for
++// 64KiB alignment. Use .balign to get the 16-byte alignment directly.
++ .balign 16
++ .arch armv7-a
++ .fpu neon-vfpv4
++
++
++// Open a globally visible function: exports \name, tags it as a function
++// symbol on ELF targets and emits its label. The body follows the macro use.
++.macro function name
++ .global \name
++#ifdef __ELF__
++ .type \name, %function
++#endif
++\name:
++.endm
++
++
++// Body of a "narrow 16-bit samples to 8-bit" copy routine.
++//   r0 = destination (bytes), r1 = source (16-bit samples), r2 = sample count
++// Each sample is shifted right by (bit_depth - 8) with rounding and unsigned
++// saturation (vqrshrn.u16). Samples are handled in chunks of 128, then
++// 64/32/16/8/4; a final remainder of 1-3 samples is not converted.
++//
++// r2 is kept as "remaining - chunk" so that the sign after each adds tells
++// whether a full chunk is left. After a tail chunk has been processed the
++// chunk size must be taken off again; without that every smaller step also
++// ran once the first tail step had fired, copying past the requested length.
++.macro piccpy_to_8, bit_depth
++    subs r2, #128
++    vpush {q4-q7}                  // q4-q7 are callee saved
++    blt 2f
++1:
++    // 128 samples per pass: two 64-sample loads, one 128-byte store
++    vldm r1!, {q0-q7}
++    subs r2, #128                  // flags survive to the bge below
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vldm r1!, {q8-q15}
++    vqrshrn.u16 d4, q4, #\bit_depth - 8
++    vqrshrn.u16 d5, q5, #\bit_depth - 8
++    vqrshrn.u16 d6, q6, #\bit_depth - 8
++    vqrshrn.u16 d7, q7, #\bit_depth - 8
++    vqrshrn.u16 d8, q8, #\bit_depth - 8
++    vqrshrn.u16 d9, q9, #\bit_depth - 8
++    vqrshrn.u16 d10, q10, #\bit_depth - 8
++    vqrshrn.u16 d11, q11, #\bit_depth - 8
++    vqrshrn.u16 d12, q12, #\bit_depth - 8
++    vqrshrn.u16 d13, q13, #\bit_depth - 8
++    vqrshrn.u16 d14, q14, #\bit_depth - 8
++    vqrshrn.u16 d15, q15, #\bit_depth - 8
++    vstm r0!, {q0-q7}
++    bge 1b
++2:
++    // r2 = remaining - 128 here; test for a 64-sample chunk
++    adds r2, #64
++    blt 1f
++
++    vldm r1!, {q0-q7}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vqrshrn.u16 d4, q4, #\bit_depth - 8
++    vqrshrn.u16 d5, q5, #\bit_depth - 8
++    vqrshrn.u16 d6, q6, #\bit_depth - 8
++    vqrshrn.u16 d7, q7, #\bit_depth - 8
++    vstm r0!, {q0-q3}
++    sub r2, #64                    // account for the 64 samples just done
++1:
++    adds r2, #32
++    blt 1f
++
++    vldm r1!, {q0-q3}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vqrshrn.u16 d2, q2, #\bit_depth - 8
++    vqrshrn.u16 d3, q3, #\bit_depth - 8
++    vstm r0!, {q0-q1}
++    sub r2, #32
++1:
++    adds r2, #16
++    blt 1f
++
++    vldm r1!, {q0-q1}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vqrshrn.u16 d1, q1, #\bit_depth - 8
++    vstm r0!, {q0}
++    sub r2, #16
++1:
++    adds r2, #8
++    blt 1f
++
++    vldm r1!, {q0}
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vstr d0, [r0]
++    add r0, #8
++    sub r2, #8
++1:
++    adds r2, #4
++    blt 1f
++
++    // Last 4 samples: only the low half of q0 is loaded and only s0 stored
++    vldr d0, [r1]
++    vqrshrn.u16 d0, q0, #\bit_depth - 8
++    vstr s0, [r0]
++1:
++    vpop {q4-q7}
++    bx lr
++.endm
++
++
++@ Narrowing copy for 10-bit samples: expands piccpy_to_8 with bit_depth 10,
++@ i.e. a rounding, saturating right shift by 2 on every 16-bit sample.
++@ [r0] Dest
++@ [r1] Src
++@ r2 Pels
++function mmal_piccpy_10_to_8_neon
++ piccpy_to_8 10
++
+--- a/modules/hw/mmal/mmal_picture.c
++++ b/modules/hw/mmal/mmal_picture.c
+@@ -21,25 +21,1542 @@
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
++// We would really like to use vlc_thread.h but the detach thread stuff can't be
++// used here :-(
++#include <pthread.h>
++
++#include <stdatomic.h>
++#include <unistd.h>
++#include <fcntl.h>
++
+ #include <vlc_common.h>
++#include <vlc_cpu.h>
+ #include <vlc_picture.h>
++
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wbad-function-cast"
++#include <bcm_host.h>
++#pragma GCC diagnostic pop
+ #include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++#include <interface/vmcs_host/vcgencmd.h>
++#include <interface/vcsm/user-vcsm.h>
+
++#include "mmal_cma.h"
+ #include "mmal_picture.h"
++#include "transform_ops.h"
++
++#define TRACE_TRANSFORMS 0
++
++#define UINT64_SIZE(s) (((s) + sizeof(uint64_t) - 1)/sizeof(uint64_t))
++
++// Reduce a value to its low byte and make it printable: bytes strictly
++// between ' ' and 0x7f are returned as-is, everything else becomes '.'.
++static inline char safe_char(const unsigned int c0)
++{
++    const unsigned int byte = c0 & 0xff;
++
++    if (byte <= ' ' || byte >= 0x7f)
++        return '.';
++
++    return byte;
++}
++
++// Render a fourcc as a printable 4-character string, least significant
++// byte first. buf must hold at least 5 chars. A zero fourcc returns the
++// constant "----" and leaves buf untouched; otherwise buf is returned.
++const char * str_fourcc(char * const buf, const unsigned int fcc)
++{
++    unsigned int i;
++
++    if (fcc == 0)
++        return "----";
++
++    for (i = 0; i != 4; ++i)
++        buf[i] = safe_char(fcc >> (8 * i));
++    buf[4] = 0;
++
++    return buf;
++}
++
++// WB + Inv
++// Write back and invalidate the CPU cache for [start, start + len) by
++// issuing a single-block vcsm_clean_invalid2() request.
++static inline void flush_range(void * const start, const size_t len)
++{
++ // Storage for the request header plus exactly one block descriptor,
++ // declared as uint64_t and sized with UINT64_SIZE (rounds up to whole
++ // uint64_t units)
++ uint64_t buf[UINT64_SIZE(sizeof(struct vcsm_user_clean_invalid2_s) + sizeof(struct vcsm_user_clean_invalid2_block_s))];
++ struct vcsm_user_clean_invalid2_s * const b = (struct vcsm_user_clean_invalid2_s *)buf;
++
++ // Header first (zeroes the remaining header fields) ...
++ *b = (struct vcsm_user_clean_invalid2_s){
++ .op_count = 1
++ };
++
++ // ... then the single block that follows it
++ b->s[0] = (struct vcsm_user_clean_invalid2_block_s){
++ .invalidate_mode = 3, // wb + invalidate
++ .block_count = 1,
++ .start_address = start, // Rely on clean inv to fix up align & size boundaries
++ .block_size = len,
++ .inter_block_stride = 0
++ };
++
++ vcsm_clean_invalid2(b);
++}
++
++// Map a VLC colour space to the MMAL equivalent.
++// Only BT.601 and BT.709 are translated; anything else is reported as
++// MMAL_COLOR_SPACE_UNKNOWN.
++MMAL_FOURCC_T vlc_to_mmal_color_space(const video_color_space_t vlc_cs)
++{
++    if (vlc_cs == COLOR_SPACE_BT601)
++        return MMAL_COLOR_SPACE_ITUR_BT601;
++
++    if (vlc_cs == COLOR_SPACE_BT709)
++        return MMAL_COLOR_SPACE_ITUR_BT709;
++
++    return MMAL_COLOR_SPACE_UNKNOWN;
++}
++
++// Translate a VLC video format into an MMAL encoding fourcc.
++// For the mask-defined RGB chromas (RV32 / RV16) the channel masks decide
++// the encoding. Returns 0 when there is no MMAL equivalent.
++MMAL_FOURCC_T vlc_to_mmal_video_fourcc(const video_frame_format_t * const vf_vlc)
++{
++    // Only meaningful for the RV32 / RV16 cases below
++    const uint32_t rm = vf_vlc->i_rmask;
++    const uint32_t gm = vf_vlc->i_gmask;
++    const uint32_t bm = vf_vlc->i_bmask;
++
++    switch (vf_vlc->i_chroma) {
++        case VLC_CODEC_MMAL_ZC_RGB32:
++        case VLC_CODEC_RGB32:
++            // VLC RGB32 aka RV32: the layout is defined by the masks
++            if (rm == 0xff0000 && gm == 0xff00 && bm == 0xff)
++                return MMAL_ENCODING_BGRA;
++            if (rm == 0xff && gm == 0xff00 && bm == 0xff0000)
++                return MMAL_ENCODING_RGBA;
++            if (rm == 0xff000000 && gm == 0xff0000 && bm == 0xff00)
++                return MMAL_ENCODING_ABGR;
++            if (rm == 0xff00 && gm == 0xff0000 && bm == 0xff000000)
++                return MMAL_ENCODING_ARGB;
++            return 0;
++
++        case VLC_CODEC_RGB16:
++            // VLC RGB16 aka RV16: only the 5-6-5 layout is supported
++            if (rm == 0xf800 && gm == 0x7e0 && bm == 0x1f)
++                return MMAL_ENCODING_RGB16;
++            return 0;
++
++        case VLC_CODEC_I420:
++        case VLC_CODEC_MMAL_ZC_I420:
++            return MMAL_ENCODING_I420;
++
++        case VLC_CODEC_RGBA:
++            return MMAL_ENCODING_RGBA;
++        case VLC_CODEC_BGRA:
++            return MMAL_ENCODING_BGRA;
++        case VLC_CODEC_ARGB:
++            return MMAL_ENCODING_ARGB;
++        // VLC_CODEC_ABGR does not exist in VLC
++
++        case VLC_CODEC_MMAL_OPAQUE:
++            return MMAL_ENCODING_OPAQUE;
++        case VLC_CODEC_MMAL_ZC_SAND8:
++            return MMAL_ENCODING_YUVUV128;
++        case VLC_CODEC_MMAL_ZC_SAND10:
++            return MMAL_ENCODING_YUVUV64_10;
++        case VLC_CODEC_MMAL_ZC_SAND30:
++            return MMAL_ENCODING_YUV10_COL;
++
++        default:
++            return 0;
++    }
++}
++
++// Fill an MMAL video format from a VLC one.
++// Width is rounded up to a multiple of 32 for I420 (plain or zero-copy) and
++// of 16 otherwise; height is rounded up to a multiple of 16. An undefined
++// sample aspect ratio (zero numerator or denominator) is replaced by 1:1.
++static void vlc_fmt_to_video_format(MMAL_VIDEO_FORMAT_T *const vf_mmal, const video_frame_format_t * const vf_vlc)
++{
++    const bool planar420 = (vf_vlc->i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++                            vf_vlc->i_chroma == VLC_CODEC_I420);
++    const unsigned int wmask = planar420 ? 31 : 15;
++    const bool sar_undefined = (vf_vlc->i_sar_num == 0 || vf_vlc->i_sar_den == 0);
++
++    vf_mmal->width = (vf_vlc->i_width + wmask) & ~wmask;
++    vf_mmal->height = (vf_vlc->i_height + 15) & ~15;
++
++    vf_mmal->crop.x = vf_vlc->i_x_offset;
++    vf_mmal->crop.y = vf_vlc->i_y_offset;
++    vf_mmal->crop.width = vf_vlc->i_visible_width;
++    vf_mmal->crop.height = vf_vlc->i_visible_height;
++
++    vf_mmal->par.num = sar_undefined ? 1 : vf_vlc->i_sar_num;
++    vf_mmal->par.den = sar_undefined ? 1 : vf_vlc->i_sar_den;
++
++    vf_mmal->frame_rate.num = vf_vlc->i_frame_rate;
++    vf_mmal->frame_rate.den = vf_vlc->i_frame_rate_base;
++    vf_mmal->color_space = vlc_to_mmal_color_space(vf_vlc->space);
++}
++
++
++// Public wrapper: fill the video part of an MMAL ES format from a VLC
++// format. Only es_fmt->es->video is written; encoding and type are untouched.
++void hw_mmal_vlc_fmt_to_mmal_fmt(MMAL_ES_FORMAT_T *const es_fmt, const video_frame_format_t * const vf_vlc)
++{
++ vlc_fmt_to_video_format(&es_fmt->es->video, vf_vlc);
++}
++
++// Bring the video part of es_fmt in line with the format of pic.
++// Returns true if es_fmt was changed, false if it already matched.
++// The frame rate does not take part in the comparison, but it is copied
++// along with everything else when an update happens.
++bool hw_mmal_vlc_pic_to_mmal_fmt_update(MMAL_ES_FORMAT_T *const es_fmt, const picture_t * const pic)
++{
++    MMAL_VIDEO_FORMAT_T * const cur = &es_fmt->es->video;
++    MMAL_VIDEO_FORMAT_T wanted;
++    bool same;
++
++    vlc_fmt_to_video_format(&wanted, &pic->format);
++
++    // If we have a format that might have come from ffmpeg then rework for
++    // a better guess as to layout. All sand stuff is "special" with regards
++    // to width/height vs real layout, so it is left alone.
++    if (pic->p[0].i_pixel_pitch != 0 &&
++        (pic->format.i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++         pic->format.i_chroma == VLC_CODEC_MMAL_ZC_RGB32))
++    {
++        // Derive width/height from the actual plane layout
++        wanted.height = pic->p[0].i_lines;
++        wanted.width = pic->p[0].i_pitch / pic->p[0].i_pixel_pitch;
++    }
++
++    same = wanted.width == cur->width &&
++           wanted.height == cur->height &&
++           wanted.crop.x == cur->crop.x &&
++           wanted.crop.y == cur->crop.y &&
++           wanted.crop.width == cur->crop.width &&
++           wanted.crop.height == cur->crop.height &&
++           wanted.par.num == cur->par.num &&
++           wanted.par.den == cur->par.den &&
++           // Frame rate ignored
++           wanted.color_space == cur->color_space;
++
++    if (same)
++        return false;
++
++    *cur = wanted;
++    return true;
++}
++
++
++// Create a reference-counted wrapper around a new MMAL port pool.
++// Returns the wrapper holding one reference, or NULL if either the wrapper
++// or the pool could not be allocated.
++hw_mmal_port_pool_ref_t * hw_mmal_port_pool_ref_create(MMAL_PORT_T * const port,
++    const unsigned int headers, const uint32_t payload_size)
++{
++    hw_mmal_port_pool_ref_t * const ref = calloc(1, sizeof(hw_mmal_port_pool_ref_t));
++
++    if (ref == NULL)
++        return NULL;
++
++    ref->pool = mmal_port_pool_create(port, headers, payload_size);
++    if (ref->pool == NULL)
++    {
++        free(ref);
++        return NULL;
++    }
++
++    ref->port = port;
++    atomic_store(&ref->refs, 1);
++    return ref;
++}
++
++// Run fn(v) on a freshly created, detached thread.
++// If the thread cannot be created fn is NOT run: the work is dropped rather
++// than executed on the calling thread, because callers pick this path
++// precisely when the work must not run in their own context.
++static void do_detached(void *(*fn)(void *), void * v)
++{
++    pthread_t dothread;
++
++    // The thread id is undefined when pthread_create fails, so detaching
++    // it unconditionally would act on an indeterminate value
++    if (pthread_create(&dothread, NULL, fn, v) != 0)
++        return;
++
++    pthread_detach(dothread);
++}
++
++// Destroy a ppr - arranged s.t. it has the correct prototype for a pthread
++// Disables the port if it is still enabled, destroys the pool and frees the
++// wrapper. v is the hw_mmal_port_pool_ref_t to destroy; always returns NULL.
++static void * kill_ppr(void * v)
++{
++ hw_mmal_port_pool_ref_t * const ppr = v;
++ if (ppr->port->is_enabled)
++ mmal_port_disable(ppr->port); // Avoid annoyed messages from MMAL when we kill the pool
++ mmal_port_pool_destroy(ppr->port, ppr->pool);
++ free(ppr);
++ return NULL;
++}
++
++// Drop one reference to a port pool wrapper; NULL is ignored.
++// When the last reference goes the wrapper is destroyed - on a detached
++// thread if in_cb is set, otherwise directly on the calling thread.
++void hw_mmal_port_pool_ref_release(hw_mmal_port_pool_ref_t * const ppr, const bool in_cb)
++{
++    if (ppr == NULL)
++        return;
++
++    // atomic_fetch_sub returns the previous count: 1 means we held the last ref
++    if (atomic_fetch_sub(&ppr->refs, 1) == 1)
++    {
++        if (in_cb)
++            do_detached(kill_ppr, ppr);
++        else
++            kill_ppr(ppr);
++    }
++}
++
++// Reset a buffer header and try to hand it straight back to the port.
++// If the port refuses it, the header is released instead.
++// Returns true if sent to the port, false if released.
++bool hw_mmal_port_pool_ref_recycle(hw_mmal_port_pool_ref_t * const ppr, MMAL_BUFFER_HEADER_T * const buf)
++{
++    bool sent;
++
++    mmal_buffer_header_reset(buf);
++    buf->user_data = NULL;
++
++    sent = (mmal_port_send_buffer(ppr->port, buf) == MMAL_SUCCESS);
++    if (!sent)
++        mmal_buffer_header_release(buf);
++
++    return sent;
++}
++
++// Send every buffer currently queued in the pool to the port.
++// Stops at the first buffer the port refuses, puts that buffer back on the
++// queue and returns the failing status; returns MMAL_SUCCESS once the queue
++// has been drained.
++MMAL_STATUS_T hw_mmal_port_pool_ref_fill(hw_mmal_port_pool_ref_t * const ppr)
++{
++    for (;;)
++    {
++        MMAL_BUFFER_HEADER_T * const hdr = mmal_queue_get(ppr->pool->queue);
++        MMAL_STATUS_T rc;
++
++        if (hdr == NULL)
++            return MMAL_SUCCESS;
++
++        rc = mmal_port_send_buffer(ppr->port, hdr);
++        if (rc != MMAL_SUCCESS)
++        {
++            mmal_queue_put_back(ppr->pool->queue, hdr);
++            return rc;
++        }
++    }
++}
++
++
++// Configure an output port for zero-copy opaque buffers, create its
++// reference-counted pool and enable the port.
++//   obj           - VLC object used for logging; also stored as port userdata
++//   pppr          - receives the new pool wrapper (NULL on failure)
++//   port          - the MMAL output port to set up
++//   extra_buffers - value for MMAL_PARAMETER_EXTRA_BUFFERS
++//   callback      - buffer-header callback passed to mmal_port_enable()
++// Returns MMAL_SUCCESS, or the failing MMAL status. A pool allocation
++// failure is reported as MMAL_ENOMEM.
++MMAL_STATUS_T hw_mmal_opaque_output(vlc_object_t * const obj,
++    hw_mmal_port_pool_ref_t ** pppr,
++    MMAL_PORT_T * const port,
++    const unsigned int extra_buffers, MMAL_PORT_BH_CB_T callback)
++{
++    MMAL_STATUS_T status;
++
++    port->userdata = (struct MMAL_PORT_USERDATA_T *)obj;
++
++    status = port_parameter_set_uint32(port, MMAL_PARAMETER_EXTRA_BUFFERS, extra_buffers);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(obj, "Failed to set MMAL_PARAMETER_EXTRA_BUFFERS on output port (status=%"PRIx32" %s)",
++                status, mmal_status_to_string(status));
++        return status;
++    }
++
++    status = port_parameter_set_bool(port, MMAL_PARAMETER_ZERO_COPY, 1);
++    if (status != MMAL_SUCCESS) {
++        msg_Err(obj, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    port->format->encoding = MMAL_ENCODING_OPAQUE;
++    port->format->encoding_variant = 0;
++    if ((status = mmal_port_format_commit(port)) != MMAL_SUCCESS)
++    {
++        msg_Err(obj, "Failed to commit format on port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    port->buffer_num = 30;
++    port->buffer_size = port->buffer_size_recommended;
++
++    if ((*pppr = hw_mmal_port_pool_ref_create(port, port->buffer_num, port->buffer_size)) == NULL) {
++        msg_Err(obj, "Failed to create output pool");
++        // status still holds MMAL_SUCCESS from the format commit above, so
++        // an explicit error code is needed here
++        return MMAL_ENOMEM;
++    }
++
++    status = mmal_port_enable(port, callback);
++    if (status != MMAL_SUCCESS) {
++        hw_mmal_port_pool_ref_release(*pppr, false);
++        *pppr = NULL;
++        msg_Err(obj, "Failed to enable output port %s (status=%"PRIx32" %s)",
++                port->name, status, mmal_status_to_string(status));
++        return status;
++    }
++
++    return MMAL_SUCCESS;
++}
++
++
++// picture_context_t destroy hook: releases every buffer header held by the
++// context, ends the flight of and unrefs its CMA buffer, then frees the
++// context itself.
++void hw_mmal_pic_ctx_destroy(picture_context_t * pic_ctx_cmn)
++{
++    pic_ctx_mmal_t * const mctx = (pic_ctx_mmal_t *)pic_ctx_cmn;
++    unsigned int n = 0;
++
++    while (n != mctx->buf_count) {
++        MMAL_BUFFER_HEADER_T * const hdr = mctx->bufs[n++];
++        if (hdr != NULL)
++            mmal_buffer_header_release(hdr);
++    }
++
++    // Both calls accept a NULL buffer
++    cma_buf_end_flight(mctx->cb);
++    cma_buf_unref(mctx->cb);
++
++    free(mctx);
++}
++
++// picture_context_t copy hook: builds a new context that shares the CMA
++// buffer and buffer headers of the source, taking one extra reference on
++// each. Returns NULL if the new context cannot be allocated.
++picture_context_t * hw_mmal_pic_ctx_copy(picture_context_t * pic_ctx_cmn)
++{
++    const pic_ctx_mmal_t * const from = (pic_ctx_mmal_t *)pic_ctx_cmn;
++    pic_ctx_mmal_t * const to = calloc(1, sizeof(*to));
++
++    if (to == NULL)
++        return NULL;
++
++    to->cmn = from->cmn;
++    to->cb = cma_buf_ref(from->cb);
++    to->buf_count = from->buf_count;
++
++    for (unsigned int n = 0; n != from->buf_count; ++n) {
++        MMAL_BUFFER_HEADER_T * const hdr = from->bufs[n];
++
++        to->bufs[n] = hdr;
++        if (hdr != NULL)
++            mmal_buffer_header_acquire(hdr);
++    }
++
++    return &to->cmn;
++}
++
++// Pre-release hook installed by hw_mmal_gen_context() on buffers that have
++// an associated port pool ref. Instead of letting the buffer be released it
++// is re-acquired and recycled to its port, and the pool reference taken in
++// hw_mmal_gen_context() is dropped. userdata is that pool ref.
++// Always returns MMAL_TRUE.
++static MMAL_BOOL_T
++buf_pre_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
++{
++ hw_mmal_port_pool_ref_t * const ppr = userdata;
++
++ // Kill the callback - otherwise we will go in circles!
++ mmal_buffer_header_pre_release_cb_set(buf, (MMAL_BH_PRE_RELEASE_CB_T)0, NULL);
++ mmal_buffer_header_acquire(buf); // Ref it again
++
++ // As we have re-acquired the buffer we need a full release
++ // (not continue) to zap the ref count back to zero
++ // This is "safe" 'cos we have already reset the cb
++ hw_mmal_port_pool_ref_recycle(ppr, buf);
++ hw_mmal_port_pool_ref_release(ppr, true); // Assume in callback
++
++ return MMAL_TRUE;
++}
++
++// Wrap an MMAL buffer header in a new picture context.
++// On success the buffer belongs to the returned context; on failure (NULL
++// return) the buffer is untouched and still owned by the caller.
++// When ppr is non-NULL a reference is taken on it and a pre-release hook is
++// installed so the buffer is recycled to its port when finally released.
++picture_context_t *
++hw_mmal_gen_context(MMAL_BUFFER_HEADER_T * buf, hw_mmal_port_pool_ref_t * const ppr)
++{
++    pic_ctx_mmal_t * const mctx = calloc(1, sizeof(pic_ctx_mmal_t));
++
++    if (mctx == NULL)
++        return NULL;
++
++    mctx->cmn.copy = hw_mmal_pic_ctx_copy;
++    mctx->cmn.destroy = hw_mmal_pic_ctx_destroy;
++    mctx->buf_count = 1;
++    mctx->bufs[0] = buf;
++
++    if (ppr != NULL) {
++        hw_mmal_port_pool_ref_acquire(ppr);
++        mmal_buffer_header_pre_release_cb_set(buf, buf_pre_release_cb, ppr);
++        buf->user_data = NULL;
++    }
++
++    return &mctx->cmn;
++}
++
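++// Pixel-copy function type: n is a count of elements (pels), not bytes.
++// A NEON implementation is declared below; a plain C fallback follows it.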
++typedef void piccpy_fn(void * dest, const void * src, size_t n);
++
++extern piccpy_fn mmal_piccpy_10_to_8_neon;
++
++// Portable fallback: narrow n 16-bit samples to 8 bits by dropping the two
++// least significant bits (plain shift, no rounding) and keeping the low
++// byte of the result.
++static void piccpy_10_to_8_c(void * dest, const void * src, size_t n)
++{
++    uint8_t * const out = dest;
++    const uint16_t * const in = src;
++
++    for (size_t i = 0; i != n; ++i)
++        out[i] = in[i] >> 2;
++}
++
++// Copy a 2D block of bytes between buffers with possibly different strides.
++// When both strides match and the stride is within 32 bytes of line_len the
++// whole area (stride * lines, padding included) goes in one memcpy - the
++// destination padding is not expected to need preserving. Otherwise
++// line_len bytes are copied per line.
++static void mem_copy_2d(uint8_t * d_ptr, const size_t d_stride,
++    const uint8_t * s_ptr, const size_t s_stride,
++    size_t lines, const size_t line_len)
++{
++    const bool one_block = (s_stride == d_stride && d_stride < line_len + 32);
++
++    if (one_block)
++    {
++        memcpy(d_ptr, s_ptr, d_stride * lines);
++        return;
++    }
++
++    for (; lines != 0; --lines)
++    {
++        memcpy(d_ptr, s_ptr, line_len);
++        d_ptr += d_stride;
++        s_ptr += s_stride;
++    }
++}
++
++// 2D copy that narrows 16-bit source samples to 8-bit destination bytes.
++// line_len and d_stride are in destination units (bytes, one per sample);
++// s_stride is in source bytes. Uses the NEON routine when the CPU has NEON,
++// the C fallback otherwise. If the source stride is exactly twice the
++// destination stride and the stride is within 32 of line_len, the whole
++// area is converted in one call.
++static void mem_copy_2d_10_to_8(uint8_t * d_ptr, const size_t d_stride,
++    const uint8_t * s_ptr, const size_t s_stride,
++    size_t lines, const size_t line_len)
++{
++    piccpy_fn * const narrow = vlc_CPU_ARM_NEON() ? mmal_piccpy_10_to_8_neon : piccpy_10_to_8_c;
++    const bool one_block = (s_stride == d_stride * 2 && d_stride < line_len + 32);
++
++    if (one_block)
++    {
++        narrow(d_ptr, s_ptr, d_stride * lines);
++        return;
++    }
++
++    for (; lines != 0; --lines)
++    {
++        narrow(d_ptr, s_ptr, line_len);
++        d_ptr += d_stride;
++        s_ptr += s_stride;
++    }
++}
++
++
++// Copy the three planes of a VLC picture into a contiguous I420 buffer laid
++// out according to fmt (Y, then U, then V, strides taken from video->width).
++//   buf_data - destination memory
++//   pLength  - optional; receives the number of bytes written (0 on failure)
++//   fmt      - destination format; its encoding must be MMAL_ENCODING_I420
++//   pic      - source; VLC_CODEC_I420 is copied as-is, VLC_CODEC_I420_10L is
++//              narrowed to 8 bits
++// Returns VLC_SUCCESS, or VLC_EBADVAR for any other source chroma.
++int hw_mmal_copy_pic_to_buf(void * const buf_data,
++    uint32_t * const pLength,
++    const MMAL_ES_FORMAT_T * const fmt,
++    const picture_t * const pic)
++{
++    const MMAL_VIDEO_FORMAT_T *const video = &fmt->es->video;
++    uint8_t * const dest = buf_data;
++    // Per-plane copier chosen from the source chroma
++    void (*plane_copy)(uint8_t *, size_t, const uint8_t *, size_t, size_t, size_t);
++
++    //**** Worry about x/y_offsets
++
++    assert(fmt->encoding == MMAL_ENCODING_I420);
++
++    switch (pic->format.i_chroma) {
++        case VLC_CODEC_I420:
++            plane_copy = mem_copy_2d;
++            break;
++
++        case VLC_CODEC_I420_10L:
++            plane_copy = mem_copy_2d_10_to_8;
++            break;
++
++        default:
++            if (pLength != NULL)
++                *pLength = 0;
++            return VLC_EBADVAR;
++    }
++
++    const size_t y_size = video->width * video->height;
++    const size_t length = y_size + y_size / 2;
++
++    // Y
++    plane_copy(dest, video->width,
++               pic->p[0].p_pixels, pic->p[0].i_pitch,
++               video->crop.height,
++               video->crop.width);
++
++    // U
++    plane_copy(dest + y_size, video->width / 2,
++               pic->p[1].p_pixels, pic->p[1].i_pitch,
++               video->crop.height / 2,
++               video->crop.width / 2);
++
++    // V
++    plane_copy(dest + y_size + y_size / 4, video->width / 2,
++               pic->p[2].p_pixels, pic->p[2].i_pitch,
++               video->crop.height / 2,
++               video->crop.width / 2);
++
++    // And make sure it is actually in memory
++    if (cma_vcsm_type() == VCSM_INIT_LEGACY) { // ** CMA is currently always uncached
++        flush_range(dest, length);
++    }
++
++    if (pLength != NULL)
++        *pLength = (uint32_t)length;
++
++    return VLC_SUCCESS;
++}
++
++
++// Pre-release hook installed by cma_buf_buf_attach(): drops the CMA buffer
++// reference that was taken when the buffer was attached to the header.
++// userdata is the cma_buf_t. Returns MMAL_FALSE (presumably letting the
++// normal release continue - confirm against the MMAL documentation).
++static MMAL_BOOL_T rep_buf_free_cb(MMAL_BUFFER_HEADER_T *header, void *userdata)
++{
++ cma_buf_t * const cb = userdata;
++ VLC_UNUSED(header);
++
++ cma_buf_unref(cb);
++ return MMAL_FALSE;
++}
++
++// Point an MMAL buffer header at a CMA buffer.
++// The header is reset, its data pointer set to the buffer's VC handle and
++// its alloc_size / length to the buffer size. A reference on cb is taken
++// and dropped again by rep_buf_free_cb when the header is released.
++// Returns VLC_EGENERIC (header untouched) if the buffer has no VC handle.
++static int cma_buf_buf_attach(MMAL_BUFFER_HEADER_T * const buf, cma_buf_t * const cb)
++{
++    const uintptr_t handle = cma_buf_vc_handle(cb);
++
++    if (handle != 0)
++    {
++        mmal_buffer_header_reset(buf);
++        buf->data = (uint8_t *)handle;
++        buf->alloc_size = cma_buf_size(cb);
++        buf->length = buf->alloc_size;
++        // Ensure cb remains valid for the duration of this buffer
++        mmal_buffer_header_pre_release_cb_set(buf, rep_buf_free_cb, cma_buf_ref(cb));
++        return VLC_SUCCESS;
++    }
++
++    return VLC_EGENERIC;
++}
++
++// Build an MMAL buffer holding a *copy* of a software picture.
++// Takes a header from rep_pool (mmal_queue_wait), allocates a CMA buffer of
++// port->buffer_size from cbp, attaches it to the header and copies the
++// picture data and properties in.
++// Returns the filled header, or NULL on any failure (everything acquired so
++// far is released again).
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_copied(const picture_t *const pic,
++ MMAL_POOL_T * const rep_pool,
++ MMAL_PORT_T * const port,
++ cma_buf_pool_t * const cbp)
++{
++ MMAL_BUFFER_HEADER_T *const buf = mmal_queue_wait(rep_pool->queue);
++ if (buf == NULL)
++ goto fail0;
++
++ cma_buf_t * const cb = cma_buf_pool_alloc_buf(cbp, port->buffer_size);
++ if (cb == NULL)
++ goto fail1;
++
++ // On success the header holds its own reference on cb
++ if (cma_buf_buf_attach(buf, cb) != VLC_SUCCESS)
++ goto fail2;
++
++ pic_to_buf_copy_props(buf, pic);
++
++ if (hw_mmal_copy_pic_to_buf(cma_buf_addr(cb), &buf->length, port->format, pic) != VLC_SUCCESS)
++ goto fail2;
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++
++ // Drop the allocation reference - the header keeps the buffer alive
++ cma_buf_unref(cb);
++ return buf;
++
++fail2:
++ cma_buf_unref(cb);
++fail1:
++ mmal_buffer_header_release(buf);
++fail0:
++ return NULL;
++}
++
++// Build an MMAL buffer that *shares* the storage of a hardware picture.
++// Takes a header from rep_pool (mmal_queue_wait) and either replicates the
++// first MMAL buffer of the picture context or, failing that, attaches its
++// CMA buffer. Picture properties are copied onto the header.
++// Returns the header, or NULL if no header is available, the context has
++// neither kind of buffer, or the replicate/attach step fails.
++// pic->context must be a valid pic_ctx_mmal_t.
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_replicated(const picture_t *const pic, MMAL_POOL_T * const rep_pool)
++{
++    pic_ctx_mmal_t *const mctx = (pic_ctx_mmal_t *)pic->context;
++    MMAL_BUFFER_HEADER_T *const copy = mmal_queue_wait(rep_pool->queue);
++    bool ok;
++
++    if (copy == NULL)
++        return NULL;
++
++    if (mctx->bufs[0] != NULL)
++        // Existing buffer - replicate it
++        ok = (mmal_buffer_header_replicate(copy, mctx->bufs[0]) == MMAL_SUCCESS);
++    else if (mctx->cb != NULL)
++        // Just a CMA buffer - fill in new buffer
++        ok = (cma_buf_buf_attach(copy, mctx->cb) == 0);
++    else
++        ok = false;
++
++    if (!ok)
++    {
++        mmal_buffer_header_release(copy);
++        return NULL;
++    }
++
++    pic_to_buf_copy_props(copy, pic);
++    return copy;
++}
++
++
++
++
++// Ask the firmware ("get_mem gpu" gencmd) how much memory the GPU has.
++// Returns the size in bytes, or -1 if it cannot be determined. The answer
++// (including failure) is cached in a static after the first call; the cache
++// is not synchronised between threads.
++// The reply must have the form "gpu=<n>M"; a value whose byte count would
++// not fit in an int is treated as a failure rather than overflowing.
++int hw_mmal_get_gpu_mem(void) {
++    static int stashed_val = -2;    // -2: not yet queried
++    VCHI_INSTANCE_T vchi_instance;
++    VCHI_CONNECTION_T *vchi_connection = NULL;
++    char rbuf[1024] = { 0 };
++    char *p;
++    unsigned long m;
++
++    if (stashed_val >= -1)
++        return stashed_val;
++
++    if (vchi_initialise(&vchi_instance) != 0)
++        goto fail0;
++
++    //create a vchi connection
++    if (vchi_connect(NULL, 0, vchi_instance) != 0)
++        goto fail0;
++
++    vc_vchi_gencmd_init(vchi_instance, &vchi_connection, 1);
++
++    //send the gencmd for the argument
++    if (vc_gencmd_send("get_mem gpu") != 0)
++        goto fail;
++
++    // One byte is held back so rbuf always stays NUL terminated
++    if (vc_gencmd_read_response(rbuf, sizeof(rbuf) - 1) != 0)
++        goto fail;
++
++    if (strncmp(rbuf, "gpu=", 4) != 0)
++        goto fail;
++
++    m = strtoul(rbuf + 4, &p, 10);
++
++    // Need an exact "M" suffix and a megabyte count that still fits in a
++    // positive int once shifted up by 20 bits
++    if (p[0] != 'M' || p[1] != '\0' || m > (0x7fffffffUL >> 20))
++        stashed_val = -1;
++    else
++        stashed_val = (int)(m << 20);
++
++    vc_gencmd_stop();
++
++    //close the vchi connection
++    vchi_disconnect(vchi_instance);
++
++    return stashed_val;
++
++fail:
++    vc_gencmd_stop();
++    vchi_disconnect(vchi_instance);
++fail0:
++    stashed_val = -1;
++    return -1;
++}
++
++// ===========================================================================
++
++// One vcsm-backed allocation, kept on a doubly linked list (ent_list_hdr_t)
++// and reference counted.
++typedef struct pool_ent_s
++{
++ struct pool_ent_s * next;
++ struct pool_ent_s * prev;
++
++ atomic_int ref_count; // set to 1 by pool_best_fit(); pool_recycle() returns the entry to the pool at 0
++ unsigned int seq; // assigned from the pool's counter in pool_best_fit(); never 0
++
++ size_t size; // allocated size, rounded up to POOL_ENT_ALLOC_BLOCK
++
++ int vcsm_hdl; // handle from vcsm_malloc_cache()
++ int vc_hdl; // from vcsm_vc_hdl_from_hdl()
++ void * buf; // address returned by vcsm_lock()
++
++ // Reported by the hw_mmal_vzc_buf_* accessors
++ unsigned int width;
++ unsigned int height;
++ MMAL_FOURCC_T enc_type;
++
++ picture_t * pic; // held reference, released on recycle/free; may be NULL
++} pool_ent_t;
++
++
++// Header of a doubly linked list of pool entries.
++typedef struct ent_list_hdr_s
++{
++ pool_ent_t * ents; // head (NULL when empty)
++ pool_ent_t * tail; // last element (NULL when empty)
++ unsigned int n; // number of elements
++} ent_list_hdr_t;
++
++// Compound literal for an empty list
++#define ENT_LIST_HDR_INIT (ent_list_hdr_t){ \
++ .ents = NULL, \
++ .tail = NULL, \
++ .n = 0 \
++}
++
++// Control block for the pool of pool_ent_t allocations.
++struct vzc_pool_ctl_s
++{
++ atomic_int ref_count;
++
++ ent_list_hdr_t ent_pool; // free entries, most recently recycled at the head; accessed under lock
++ ent_list_hdr_t ents_cur; // presumably entries of the current frame - usage not visible here
++ ent_list_hdr_t ents_prev; // presumably entries of the previous frame - usage not visible here
++
++ unsigned int max_n; // pool_recycle() evicts the tail once ent_pool holds this many
++ unsigned int seq; // source of pool_ent_t::seq values
++
++ vlc_mutex_t lock; // taken around ent_pool access in pool_recycle() / pool_best_fit()
++
++ MMAL_POOL_T * buf_pool;
++
++ vcsm_init_type_t vcsm_init_type;
++};
++
++// Per-buffer data: the hw_mmal_vzc_buf_* accessors read this from
++// MMAL_BUFFER_HEADER_T::user_data.
++typedef struct vzc_subbuf_ent_s
++{
++ pool_ent_t * ent; // backing allocation; supplies width/height/encoding
++ MMAL_RECT_T pic_rect;
++ MMAL_RECT_T orig_dest_rect;
++ MMAL_DISPLAYREGION_T dreg; // returned by hw_mmal_vzc_buf_region()
++} vzc_subbuf_ent_t;
++
++
++// Unlink ent from the list elh and return it with cleared links.
++// A NULL ent is tolerated: nothing is changed and NULL is returned.
++// ent must be a member of elh - this is not checked.
++static pool_ent_t * ent_extract(ent_list_hdr_t * const elh, pool_ent_t * const ent)
++{
++    if (ent == NULL)
++        return NULL;
++
++    pool_ent_t * const before = ent->prev;
++    pool_ent_t * const after = ent->next;
++
++    // Patch the neighbour, or the list header when at an end
++    if (after != NULL)
++        after->prev = before;
++    else
++        elh->tail = before;
++
++    if (before != NULL)
++        before->next = after;
++    else
++        elh->ents = after;
++
++    ent->next = NULL;
++    ent->prev = NULL;
++    elh->n -= 1;
++
++    return ent; // For convenience
++}
++
++// Remove and return the last element of the list, or NULL if it is empty.
++static inline pool_ent_t * ent_extract_tail(ent_list_hdr_t * const elh)
++{
++ return ent_extract(elh, elh->tail);
++}
++
++// Insert ent at the front of the list elh. ent must not be on any list.
++static void ent_add_head(ent_list_hdr_t * const elh, pool_ent_t * const ent)
++{
++    pool_ent_t * const old_head = elh->ents;
++
++    ent->prev = NULL;
++    ent->next = old_head;
++
++    if (old_head != NULL)
++        old_head->prev = ent;
++    else
++        elh->tail = ent;    // list was empty: ent is also the tail
++
++    elh->ents = ent;
++    elh->n += 1;
++}
++
++// Destroy a pool entry: release any picture it still references, unlock and
++// free its vcsm allocation, then free the entry. NULL is ignored.
++static void ent_free(pool_ent_t * const ent)
++{
++    if (ent == NULL)
++        return;
++
++    // If we still have a ref to a pic - kill it now
++    if (ent->pic != NULL)
++        picture_Release(ent->pic);
++
++    // Free contents
++    vcsm_unlock_hdl(ent->vcsm_hdl);
++    vcsm_free(ent->vcsm_hdl);
++
++    free(ent);
++}
++
++// Free every entry on the list. The header is reset to empty before the
++// entries themselves are destroyed.
++static void ent_free_list(ent_list_hdr_t * const elh)
++{
++    pool_ent_t * cur = elh->ents;
++    pool_ent_t * following;
++
++    *elh = ENT_LIST_HDR_INIT;
++
++    for (; cur != NULL; cur = following) {
++        // Fetch the link first - cur is gone after ent_free()
++        following = cur->next;
++        ent_free(cur);
++    }
++}
++
++// Move the whole contents of src to dst and leave src empty.
++// Whatever dst held before is overwritten without being freed.
++static void ent_list_move(ent_list_hdr_t * const dst, ent_list_hdr_t * const src)
++{
++// printf("Move %p->%p\n", src, dst);
++
++ *dst = *src;
++ *src = ENT_LIST_HDR_INIT;
++}
++
++// Find the entry that references pic, unlink it from the list and return
++// it; NULL if there is none.
++// The scan runs from tail to head as that should give the fastest match
++// when pics are presented in the same order each time.
++static pool_ent_t * ent_list_extract_pic_ent(ent_list_hdr_t * const elh, picture_t * const pic)
++{
++    for (pool_ent_t * cand = elh->tail; cand != NULL; cand = cand->prev) {
++        if (cand->pic == pic)
++            return ent_extract(elh, cand);
++    }
++
++    return NULL;
++}
++
++#define POOL_ENT_ALLOC_BLOCK 0x10000
++
++// Allocate a new pool entry backed by host-cached vcsm memory.
++// req_size is rounded up to a multiple of POOL_ENT_ALLOC_BLOCK. The memory
++// is locked, so ent->buf is usable on return.
++// Returns the entry (not on any list, ref count 0), or NULL on failure.
++static pool_ent_t * pool_ent_alloc_new(size_t req_size)
++{
++    // calloc leaves the list links NULL and the ref count 0
++    pool_ent_t * ent = calloc(1, sizeof(*ent));
++    const size_t alloc_size = (req_size + POOL_ENT_ALLOC_BLOCK - 1) & ~(POOL_ENT_ALLOC_BLOCK - 1);
++
++    if (ent == NULL)
++        return NULL;
++
++    // Alloc from vcsm
++    // vcsm_malloc_cache reports failure with a 0 handle; -1 is still
++    // rejected too so that nothing the old check caught gets through
++    ent->vcsm_hdl = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST, (char *)"vlc-subpic");
++    if (ent->vcsm_hdl == 0 || ent->vcsm_hdl == -1)
++        goto fail1;
++    if ((ent->vc_hdl = vcsm_vc_hdl_from_hdl(ent->vcsm_hdl)) == 0)
++        goto fail2;
++    if ((ent->buf = vcsm_lock(ent->vcsm_hdl)) == NULL)
++        goto fail2;
++
++    ent->size = alloc_size;
++    return ent;
++
++fail2:
++    vcsm_free(ent->vcsm_hdl);
++fail1:
++    free(ent);
++    return NULL;
++}
++
++// Take an additional reference on a pool entry and return it.
++// ent must be non-NULL.
++static inline pool_ent_t * pool_ent_ref(pool_ent_t * const ent)
++{
++// int n = atomic_fetch_add(&ent->ref_count, 1) + 1;
++// printf("Ref: %p: %d\n", ent, n);
++ atomic_fetch_add(&ent->ref_count, 1);
++ return ent;
++}
++
++// Drop one reference to ent; NULL is ignored. When the count reaches zero
++// the entry's picture reference is released and the entry is put at the
++// head of the free pool. If the pool is already at max_n, its tail (least
++// recently recycled) entry is evicted and freed.
++static void pool_recycle(vzc_pool_ctl_t * const pc, pool_ent_t * const ent)
++{
++ pool_ent_t * xs = NULL;
++ int n;
++
++ if (ent == NULL)
++ return;
++
++ // fetch_sub returns the old value, so n is the count after the decrement
++ n = atomic_fetch_sub(&ent->ref_count, 1) - 1;
++
++// printf("%s: Pool: %p: Ent: %p: %d\n", __func__, &pc->ent_pool, ent, n);
++
++ if (n != 0)
++ return;
++
++ if (ent->pic != NULL) {
++ picture_Release(ent->pic);
++ ent->pic = NULL;
++ }
++
++ vlc_mutex_lock(&pc->lock);
++
++ // If we have a full pool then extract the LRU and free it
++ // Free done outside mutex
++ if (pc->ent_pool.n >= pc->max_n)
++ xs = ent_extract_tail(&pc->ent_pool);
++
++ ent_add_head(&pc->ent_pool, ent);
++
++ vlc_mutex_unlock(&pc->lock);
++
++ // xs is NULL unless something was evicted; ent_free accepts NULL
++ ent_free(xs);
++}
++
++// Empty the list elh, passing each entry to pool_recycle() (tail first).
++// * This could be made more efficient, but this is easy
++static void pool_recycle_list(vzc_pool_ctl_t * const pc, ent_list_hdr_t * const elh)
++{
++    for (pool_ent_t * e = ent_extract_tail(elh); e != NULL; e = ent_extract_tail(elh))
++        pool_recycle(pc, e);
++}
++
++// Get an entry able to hold req_size bytes.
++// The free pool is searched for the smallest entry whose size lies in
++// [req_size, 2 * req_size + POOL_ENT_ALLOC_BLOCK]; if none qualifies a new
++// entry is allocated. The result has a ref count of 1 and a fresh, non-zero
++// sequence number.
++// Returns NULL if a new entry was needed and could not be allocated.
++static pool_ent_t * pool_best_fit(vzc_pool_ctl_t * const pc, size_t req_size)
++{
++    pool_ent_t * best = NULL;
++
++    vlc_mutex_lock(&pc->lock);
++
++    {
++        pool_ent_t * ent = pc->ent_pool.ents;
++
++        // Simple scan
++        while (ent != NULL) {
++            if (ent->size >= req_size && ent->size <= req_size * 2 + POOL_ENT_ALLOC_BLOCK &&
++                (best == NULL || best->size > ent->size))
++                best = ent;
++            ent = ent->next;
++        }
++
++        // extract best from chain if we've found it (no-op for NULL)
++        ent_extract(&pc->ent_pool, best);
++    }
++
++    vlc_mutex_unlock(&pc->lock);
++
++    if (best == NULL)
++    {
++        best = pool_ent_alloc_new(req_size);
++        // Allocation can fail - do not touch the entry in that case
++        if (best == NULL)
++            return NULL;
++    }
++
++    // NOTE(review): seq is advanced outside pc->lock - confirm callers
++    // serialise access
++    if ((best->seq = ++pc->seq) == 0)
++        best->seq = ++pc->seq; // Never allow to be zero
++
++    atomic_store(&best->ref_count, 1);
++    return best;
++}
++
++
++const vlc_fourcc_t hw_mmal_vzc_subpicture_chromas[] = { VLC_CODEC_RGBA, VLC_CODEC_BGRA, VLC_CODEC_ARGB, 0 };
++
++void hw_mmal_vzc_buf_get_wh(MMAL_BUFFER_HEADER_T * const buf, int * const pW, int * const pH)
++{
++ const pool_ent_t *const ent = ((vzc_subbuf_ent_t *)buf->user_data)->ent;
++ *pW = ent->width;
++ *pH = ent->height;
++}
++
++bool hw_mmal_vzc_buf_set_format(MMAL_BUFFER_HEADER_T * const buf, MMAL_ES_FORMAT_T * const es_fmt)
++{
++ const pool_ent_t *const ent = ((vzc_subbuf_ent_t *)buf->user_data)->ent;
++ MMAL_VIDEO_FORMAT_T * const v_fmt = &es_fmt->es->video;
++
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = ent->enc_type;
++ es_fmt->encoding_variant = 0;
++
++ v_fmt->width = ent->width;
++ v_fmt->height = ent->height;
++ v_fmt->crop.x = 0;
++ v_fmt->crop.y = 0;
++ v_fmt->crop.width = ent->width;
++ v_fmt->crop.height = ent->height;
++
++ return true;
++}
++
++void hw_mmal_vzc_buf_frame_size(MMAL_BUFFER_HEADER_T * const buf,
++ uint32_t * const pWidth, uint32_t * const pHeight)
++{
++ const pool_ent_t *const ent = ((vzc_subbuf_ent_t *)buf->user_data)->ent;
++ *pWidth = ent->width;
++ *pHeight = ent->height;
++}
++
++
++MMAL_DISPLAYREGION_T * hw_mmal_vzc_buf_region(MMAL_BUFFER_HEADER_T * const buf)
++{
++ vzc_subbuf_ent_t * sb = buf->user_data;
++ return &sb->dreg;
++}
++
++static inline int rescale_x(int x, int mul, int div)
++{
++ return div == 0 ? x * mul : (x * mul + div/2) / div;
++}
++
++static void rescale_rect(MMAL_RECT_T * const d, const MMAL_RECT_T * const s, const MMAL_RECT_T * mul_rect, const MMAL_RECT_T * div_rect)
++{
++ d->x = rescale_x(s->x - div_rect->x, mul_rect->width, div_rect->width) + mul_rect->x;
++ d->y = rescale_x(s->y - div_rect->y, mul_rect->height, div_rect->height) + mul_rect->y;
++ d->width = rescale_x(s->width, mul_rect->width, div_rect->width);
++ d->height = rescale_x(s->height, mul_rect->height, div_rect->height);
++#if TRACE_TRANSFORMS
++ fprintf(stderr, "(%d,%d %dx%d) * (%d,%d %dx%d) / (%d,%d %dx%d) -> (%d,%d %dx%d)\n",
++ s->x, s->y, s->width, s->height,
++ mul_rect->x, mul_rect->y, mul_rect->width, mul_rect->height,
++ div_rect->x, div_rect->y, div_rect->width, div_rect->height,
++ d->x, d->y, d->width, d->height);
++#endif
++}
++
++static MMAL_RECT_T
++rect_untransform(MMAL_RECT_T s, const MMAL_RECT_T c, const MMAL_DISPLAYTRANSFORM_T t)
++{
++#if TRACE_TRANSFORMS
++ fprintf(stderr, "t=%d, s=%d,%d:%dx%d, c=%d,%d:%dx%d -> ", (int)t,
++ s.x,s.y,s.width,s.height,
++ c.x,c.y,c.width,c.height);
++#endif
++ if (is_transform_hflip(t))
++ s = rect_hflip(s, c);
++ if (is_transform_vflip(t) != 0)
++ s = rect_vflip(s, c);
++ if (is_transform_transpose(t) != 0)
++ s = rect_transpose(s);
++#if TRACE_TRANSFORMS
++ fprintf(stderr, "s=%d,%d:%dx%d\n",
++ s.x,s.y,s.width,s.height);
++#endif
++ return s;
++}
++
++void hw_mmal_vzc_buf_scale_dest_rect(MMAL_BUFFER_HEADER_T * const buf, const MMAL_RECT_T * const scale_rect, const MMAL_DISPLAYTRANSFORM_T scale_transform)
++{
++ vzc_subbuf_ent_t * sb = buf->user_data;
++ if (scale_rect == NULL) {
++ sb->dreg.dest_rect = sb->orig_dest_rect;
++ sb->dreg.transform = MMAL_DISPLAY_ROT0;
++ }
++ else
++ {
++ // The scale rect has been transposed if we have a transposing
++ // transform - untranspose so we are the same way up as the source
++ const MMAL_RECT_T c = (scale_transform & 4) == 0 ? *scale_rect : rect_transpose(*scale_rect);
++ rescale_rect(&sb->dreg.dest_rect, &sb->orig_dest_rect,
++ &c, &sb->pic_rect);
++ sb->dreg.dest_rect = rect_untransform(sb->dreg.dest_rect, c, scale_transform);
++ sb->dreg.transform = scale_transform;
++ }
++}
++
++unsigned int hw_mmal_vzc_buf_seq(MMAL_BUFFER_HEADER_T * const buf)
++{
++ vzc_subbuf_ent_t * sb = buf->user_data;
++ return sb->ent->seq;
++}
++
++
++// The intent with the ents_cur & ents_prev stuff is to remember the buffers
++// we used on the last frame and reuse them on the current one if they are the
++// same. Unfortunately detection of "is_first" is only a heuristic (there are
++// no rules governing the order in which things are blended) so we must deal
++// (fairly) gracefully with it never (or always) being set.
++
++// dst_pic_rect gives the number space (coordinate system) in which the destination pixels are specified
++
++MMAL_BUFFER_HEADER_T * hw_mmal_vzc_buf_from_pic(vzc_pool_ctl_t * const pc,
++    picture_t * const pic,
++    const MMAL_RECT_T dst_pic_rect,
++    const int x_offset, const int y_offset,
++    const unsigned int alpha,
++    const bool is_first)
++{
++    MMAL_BUFFER_HEADER_T * const buf = mmal_queue_get(pc->buf_pool->queue);
++    vzc_subbuf_ent_t * sb;
++
++    if (buf == NULL)
++        return NULL;
++
++    if ((sb = calloc(1, sizeof(*sb))) == NULL)
++        goto fail1;
++
++    // If first or we've had a lot of stuff move everything to the prev list
++    // (we could deal more gracefully with the "too many" case but it shouldn't
++    // really happen)
++    if (is_first || pc->ents_cur.n >= CTX_BUFS_MAX) {
++        pool_recycle_list(pc, &pc->ents_prev);
++        ent_list_move(&pc->ents_prev, &pc->ents_cur);
++    }
++
++    sb->dreg.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
++    sb->dreg.hdr.size = sizeof(sb->dreg);
++    buf->user_data = sb;
++
++    {
++        // ?? Round start offset as well as length
++        const video_format_t *const fmt = &pic->format;
++
++        const unsigned int bpp = (fmt->i_bits_per_pixel + 7) >> 3;
++        const unsigned int xl = (fmt->i_x_offset & ~15);
++        const unsigned int xr = (fmt->i_x_offset + fmt->i_visible_width + 15) & ~15;
++        const size_t dst_stride = (xr - xl) * bpp;
++        const size_t dst_lines = ((fmt->i_visible_height + 15) & ~15);
++        const size_t dst_size = dst_stride * dst_lines;
++
++        pool_ent_t * ent = ent_list_extract_pic_ent(&pc->ents_prev, pic);
++        bool needs_copy = false;
++
++        // If we didn't find ent in prev then look in cur in case is_first
++        // isn't working
++        if (ent == NULL)
++            ent = ent_list_extract_pic_ent(&pc->ents_cur, pic);
++
+
+-int mmal_picture_lock(picture_t *picture)
++        if (ent == NULL)
++        {
++            // Need a new ent
++            needs_copy = true;
++            // Map the format before taking an ent so a failure cannot leak one
++            const MMAL_FOURCC_T enc_type = vlc_to_mmal_video_fourcc(&pic->format);
++            if (enc_type == 0 || (ent = pool_best_fit(pc, dst_size)) == NULL)
++                goto fail2;
++            ent->enc_type = enc_type;
++
++            ent->pic = picture_Hold(pic);
++        }
++
++        ent_add_head(&pc->ents_cur, ent);
++
++        sb->ent = pool_ent_ref(ent);
++        hw_mmal_vzc_pool_ref(pc);
++
++        // Copy data
++        buf->next = NULL;
++        buf->cmd = 0;
++        buf->data = (uint8_t *)(ent->vc_hdl);
++        buf->alloc_size = buf->length = dst_size;
++        buf->offset = 0;
++        buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++        buf->pts = buf->dts = pic->date != VLC_TICK_INVALID ? pic->date : MMAL_TIME_UNKNOWN;
++        buf->type->video = (MMAL_BUFFER_HEADER_VIDEO_SPECIFIC_T){
++            .planes = 1,
++            .pitch = { dst_stride }
++        };
++
++        // Remember offsets
++        sb->dreg.set = MMAL_DISPLAY_SET_SRC_RECT |
++            MMAL_DISPLAY_SET_DEST_RECT |
++            MMAL_DISPLAY_SET_FULLSCREEN |
++            MMAL_DISPLAY_SET_TRANSFORM |
++            MMAL_DISPLAY_SET_ALPHA;
++
++        sb->dreg.fullscreen = 0;
++
++        // Will be set later - zero now to avoid any confusion
++        sb->dreg.transform = MMAL_DISPLAY_ROT0;
++        sb->dreg.dest_rect = (MMAL_RECT_T){0, 0, 0, 0};
++
++        sb->dreg.alpha = (uint32_t)(alpha & 0xff) | MMAL_DISPLAY_ALPHA_FLAGS_MIX;
++
++// printf("+++ bpp:%d, vis:%dx%d wxh:%dx%d, d:%dx%d\n", bpp, fmt->i_visible_width, fmt->i_visible_height, fmt->i_width, fmt->i_height, dst_stride, dst_lines);
++
++        sb->dreg.src_rect = (MMAL_RECT_T){
++            .x = (fmt->i_x_offset - xl),
++            .y = 0,
++            .width = fmt->i_visible_width,
++            .height = fmt->i_visible_height
++        };
++
++        sb->pic_rect = dst_pic_rect;
++
++        sb->orig_dest_rect = (MMAL_RECT_T){
++            .x = x_offset,
++            .y = y_offset,
++            .width = fmt->i_visible_width,
++            .height = fmt->i_visible_height
++        };
++
++        if (needs_copy)
++        {
++            ent->width = dst_stride / bpp;
++            ent->height = dst_lines;
++
++            // 2D copy
++            {
++                uint8_t *d = ent->buf;
++                const uint8_t *s = pic->p[0].p_pixels + xl * bpp + fmt->i_y_offset * pic->p[0].i_pitch;
++
++                mem_copy_2d(d, dst_stride, s, pic->p[0].i_pitch, fmt->i_visible_height, dst_stride);
++
++                // And make sure it is actually in memory
++                if (pc->vcsm_init_type != VCSM_INIT_CMA) { // ** CMA is currently always uncached
++                    flush_range(ent->buf, dst_stride * fmt->i_visible_height);
++                }
++            }
++        }
++    }
++
++    return buf;
++
++fail2:
++    buf->user_data = NULL;  // Else the pool release callback would use the freed sb
++    free(sb);
++fail1:
++    mmal_buffer_header_release(buf);
++    return NULL;
++}
++
++void hw_mmal_vzc_pool_flush(vzc_pool_ctl_t * const pc)
++{
++ pool_recycle_list(pc, &pc->ents_prev);
++ pool_recycle_list(pc, &pc->ents_cur);
++}
++
++static void hw_mmal_vzc_pool_delete(vzc_pool_ctl_t * const pc)
++{
++
++// printf("<<< %s\n", __func__);
++
++ hw_mmal_vzc_pool_flush(pc);
++
++ ent_free_list(&pc->ent_pool);
++
++ if (pc->buf_pool != NULL)
++ mmal_pool_destroy(pc->buf_pool);
++
++ vlc_mutex_destroy(&pc->lock);
++
++ cma_vcsm_exit(pc->vcsm_init_type);
++
++// memset(pc, 0xba, sizeof(*pc)); // Zap for (hopefully) faster crash
++ free (pc);
++
++ // printf(">>> %s\n", __func__);
++}
++
++void hw_mmal_vzc_pool_release(vzc_pool_ctl_t * const pc)
++{
++ int n;
++
++ if (pc == NULL)
++ return;
++
++ n = atomic_fetch_sub(&pc->ref_count, 1) - 1;
++
++ if (n != 0)
++ return;
++
++ hw_mmal_vzc_pool_delete(pc);
++}
++
++void hw_mmal_vzc_pool_ref(vzc_pool_ctl_t * const pc)
++{
++ atomic_fetch_add(&pc->ref_count, 1);
++}
++
++static MMAL_BOOL_T vcz_pool_release_cb(MMAL_POOL_T * buf_pool, MMAL_BUFFER_HEADER_T *buf, void *userdata)
++{
++ vzc_pool_ctl_t * const pc = userdata;
++ vzc_subbuf_ent_t * const sb = buf->user_data;
++
++ VLC_UNUSED(buf_pool);
++
++// printf("<<< %s\n", __func__);
++
++ if (sb != NULL) {
++ buf->user_data = NULL;
++ pool_recycle(pc, sb->ent);
++ hw_mmal_vzc_pool_release(pc);
++ free(sb);
++ }
++
++// printf(">>> %s\n", __func__);
++
++ return MMAL_TRUE;
++}
++
++vzc_pool_ctl_t * hw_mmal_vzc_pool_new(void)
++{
++    vzc_pool_ctl_t * const pc = calloc(1, sizeof(*pc));
++
++    if (pc == NULL)
++        return NULL;
++
++    if ((pc->vcsm_init_type = cma_vcsm_init()) == VCSM_INIT_NONE)
++    {
++        free(pc);
++        return NULL;
++    }
++
++    pc->max_n = 8;  // Most idle ents pool_recycle() keeps before freeing the LRU one
++    vlc_mutex_init(&pc->lock); // Must init before potential destruction
++
++    if ((pc->buf_pool = mmal_pool_create(64, 0)) == NULL)
++    {
++        hw_mmal_vzc_pool_delete(pc);
++        return NULL;
++    }
++
++    atomic_store(&pc->ref_count, 1);
++
++    mmal_pool_callback_set(pc->buf_pool, vcz_pool_release_cb, pc);
++
++    return pc;
++}
++
++//----------------------------------------------------------------------------
++
++
++static const uint8_t shift_00[] = {0,0,0,0};
++static const uint8_t shift_01[] = {0,1,1,1};
++
++int cma_pic_set_data(picture_t * const pic,
++ const MMAL_ES_FORMAT_T * const mm_esfmt,
++ const MMAL_BUFFER_HEADER_T * const buf)
+ {
+- picture_sys_t *pic_sys = picture->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer = pic_sys->buffer;
++ const MMAL_VIDEO_FORMAT_T * const mm_fmt = &mm_esfmt->es->video;
++ const MMAL_BUFFER_HEADER_VIDEO_SPECIFIC_T *const buf_vid = (buf == NULL) ? NULL : &buf->type->video;
++ cma_buf_t *const cb = cma_buf_pic_get(pic);
++ unsigned int planes = 1;
++
++ uint8_t * const data = cma_buf_addr(cb);
++ if (data == NULL) {
++ return VLC_ENOMEM;
++ }
++
++ const uint8_t * ws = shift_00;
++ const uint8_t * hs = shift_00;
++ int pb = 1;
++
++ switch (mm_esfmt->encoding)
++ {
++ case MMAL_ENCODING_ARGB:
++ case MMAL_ENCODING_ABGR:
++ case MMAL_ENCODING_RGBA:
++ case MMAL_ENCODING_BGRA:
++ case MMAL_ENCODING_RGB32:
++ case MMAL_ENCODING_BGR32:
++ pb = 4;
++ break;
++ case MMAL_ENCODING_RGB16:
++ pb = 2;
++ break;
+
+- int offset = 0;
+- picture->p[0].p_pixels = buffer->data;
+- for (int i = 1; i < picture->i_planes; i++) {
+- offset = offset + picture->p[i - 1].i_pitch * picture->p[i - 1].i_lines;
+- picture->p[i].p_pixels = (ptrdiff_t)buffer->data + offset;
++ case MMAL_ENCODING_I420:
++ ws = shift_01;
++ hs = shift_01;
++ planes = 3;
++ break;
++
++ case MMAL_ENCODING_YUVUV128:
++ hs = shift_01;
++ planes = 2;
++ break;
++
++ default:
++// msg_Err(p_filter, "%s: Unexpected format", __func__);
++ return VLC_EGENERIC;
+ }
+
+- pic_sys->displayed = false;
++ // Fix up SAR if unset
++ if (pic->format.i_sar_den == 0 || pic->format.i_sar_num == 0) {
++ pic->format.i_sar_den = mm_fmt->par.den;
++ pic->format.i_sar_num = mm_fmt->par.num;
++ }
+
++ pic->i_planes = planes;
++ unsigned int offset = 0;
++ for (unsigned int i = 0; i != planes; ++i) {
++ pic->p[i] = (plane_t){
++ .p_pixels = data + (buf_vid != NULL ? buf_vid->offset[i] : offset),
++ .i_lines = mm_fmt->height >> hs[i],
++ .i_pitch = buf_vid != NULL ? buf_vid->pitch[i] : mm_fmt->width * pb,
++ .i_pixel_pitch = pb,
++ .i_visible_lines = mm_fmt->crop.height >> hs[i],
++ .i_visible_pitch = mm_fmt->crop.width >> ws[i]
++ };
++ offset += pic->p[i].i_pitch * pic->p[i].i_lines;
++ }
+ return VLC_SUCCESS;
+ }
++
++int cma_buf_pic_attach(cma_buf_t * const cb, picture_t * const pic)
++{
++ if (!is_cma_buf_pic_chroma(pic->format.i_chroma))
++ return VLC_EGENERIC;
++ if (pic->context != NULL)
++ return VLC_EBADVAR;
++
++ pic_ctx_mmal_t * const ctx = calloc(1, sizeof(pic_ctx_mmal_t));
++
++ if (ctx == NULL)
++ return VLC_ENOMEM;
++
++ ctx->cmn.copy = hw_mmal_pic_ctx_copy;
++ ctx->cmn.destroy = hw_mmal_pic_ctx_destroy;
++ ctx->buf_count = 1; // cb takes the place of the 1st buf
++ ctx->cb = cb;
++
++ cma_buf_in_flight(cb);
++
++ pic->context = &ctx->cmn;
++ return VLC_SUCCESS;
++}
++
++cma_buf_t * cma_buf_pic_get(picture_t * const pic)
++{
++ pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++ return !is_cma_buf_pic_chroma(pic->format.i_chroma) || ctx == NULL ? 0 : ctx->cb;
++}
++
++
++//----------------------------------------------------------------------------
++
++/* Returns true if bcm_host reports that the Pi being used is a Pi 4
++*/
++bool rpi_is_model_pi4(void) {
++ return bcm_host_is_model_pi4();
++}
++
++// Preferred mode - none->cma if fkms is active otherwise legacy
++static volatile vcsm_init_type_t last_vcsm_type = VCSM_INIT_NONE;
++
++vcsm_init_type_t cma_vcsm_type(void)
++{
++ return last_vcsm_type;
++}
++
++vcsm_init_type_t cma_vcsm_init(void)
++{
++ vcsm_init_type_t rv = VCSM_INIT_NONE;
++ // We don't bother locking - taking a copy here should be good enough
++ vcsm_init_type_t try_type = last_vcsm_type;
++
++ if (try_type == VCSM_INIT_NONE) {
++ if (bcm_host_is_fkms_active())
++ try_type = VCSM_INIT_CMA;
++ else
++ try_type = VCSM_INIT_LEGACY;
++ }
++
++ if (try_type == VCSM_INIT_CMA) {
++ if (vcsm_init_ex(1, -1) == 0)
++ rv = VCSM_INIT_CMA;
++ else if (vcsm_init_ex(0, -1) == 0)
++ rv = VCSM_INIT_LEGACY;
++ }
++ else
++ {
++ if (vcsm_init_ex(0, -1) == 0)
++ rv = VCSM_INIT_LEGACY;
++ else if (vcsm_init_ex(1, -1) == 0)
++ rv = VCSM_INIT_CMA;
++ }
++
++ // Just in case this affects vcsm init do after that
++ if (rv != VCSM_INIT_NONE)
++ bcm_host_init();
++
++ last_vcsm_type = rv;
++ return rv;
++}
++
++void cma_vcsm_exit(const vcsm_init_type_t init_mode)
++{
++ if (init_mode != VCSM_INIT_NONE)
++ {
++ vcsm_exit();
++ bcm_host_deinit(); // Does nothing but add in case it ever does
++ }
++}
++
++const char * cma_vcsm_init_str(const vcsm_init_type_t init_mode)
++{
++ switch (init_mode)
++ {
++ case VCSM_INIT_CMA:
++ return "CMA";
++ case VCSM_INIT_LEGACY:
++ return "Legacy";
++ case VCSM_INIT_NONE:
++ return "none";
++ default:
++ break;
++ }
++ return "???";
++}
++
++
+--- a/modules/hw/mmal/mmal_picture.h
++++ b/modules/hw/mmal/mmal_picture.h
+@@ -24,19 +24,298 @@
+ #ifndef VLC_MMAL_MMAL_PICTURE_H_
+ #define VLC_MMAL_MMAL_PICTURE_H_
+
++#include <stdatomic.h>
++
+ #include <vlc_common.h>
+ #include <interface/mmal/mmal.h>
+
++#include "mmal_cma.h"
++
+ /* Think twice before changing this. Incorrect values cause havoc. */
+ #define NUM_ACTUAL_OPAQUE_BUFFERS 30
+
+-struct picture_sys_t {
+- vlc_object_t *owner;
++#ifndef VLC_TICK_INVALID
++#define VLC_TICK_INVALID VLC_TS_INVALID
++#define VLC_VER_3 1
++#else
++#define VLC_VER_3 0
++#endif
++
++typedef struct mmal_port_pool_ref_s
++{
++ atomic_uint refs;
++ MMAL_POOL_T * pool;
++ MMAL_PORT_T * port;
++} hw_mmal_port_pool_ref_t;
++
++typedef struct pic_ctx_subpic_s {
++ picture_t * subpic;
++ int x, y;
++ int alpha;
++} pic_ctx_subpic_t;
++
++
++#define CTX_BUFS_MAX 4
++typedef struct pic_ctx_mmal_s {
++ picture_context_t cmn; // PARENT: Common els at start
++
++ cma_buf_t * cb;
++
++ unsigned int buf_count;
++ MMAL_BUFFER_HEADER_T * bufs[CTX_BUFS_MAX];
++
++} pic_ctx_mmal_t;
++
++const char * str_fourcc(char * const buf, const unsigned int fcc);
++
++MMAL_FOURCC_T vlc_to_mmal_video_fourcc(const video_frame_format_t * const vf_vlc);
++MMAL_FOURCC_T vlc_to_mmal_color_space(const video_color_space_t vlc_cs);
++void hw_mmal_vlc_fmt_to_mmal_fmt(MMAL_ES_FORMAT_T *const es_fmt, const video_frame_format_t * const vf_vlc);
++// Returns true if fmt_changed
++// frame_rate ignored for compare, but is set if something else is updated
++bool hw_mmal_vlc_pic_to_mmal_fmt_update(MMAL_ES_FORMAT_T *const es_fmt, const picture_t * const pic);
++
++// Copy pic contents into an existing buffer
++int hw_mmal_copy_pic_to_buf(void * const buf_data, uint32_t * const pLength,
++ const MMAL_ES_FORMAT_T * const fmt, const picture_t * const pic);
++
++hw_mmal_port_pool_ref_t * hw_mmal_port_pool_ref_create(MMAL_PORT_T * const port,
++ const unsigned int headers, const uint32_t payload_size);
++void hw_mmal_port_pool_ref_release(hw_mmal_port_pool_ref_t * const ppr, const bool in_cb);
++bool hw_mmal_port_pool_ref_recycle(hw_mmal_port_pool_ref_t * const ppr, MMAL_BUFFER_HEADER_T * const buf);
++MMAL_STATUS_T hw_mmal_port_pool_ref_fill(hw_mmal_port_pool_ref_t * const ppr);
++static inline void hw_mmal_port_pool_ref_acquire(hw_mmal_port_pool_ref_t * const ppr)
++{
++ atomic_fetch_add(&ppr->refs, 1);
++}
++MMAL_STATUS_T hw_mmal_opaque_output(vlc_object_t * const obj,
++ hw_mmal_port_pool_ref_t ** pppr,
++ MMAL_PORT_T * const port,
++ const unsigned int extra_buffers, MMAL_PORT_BH_CB_T callback);
++
++static inline int hw_mmal_pic_has_sub_bufs(picture_t * const pic)
++{
++ pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++ return ctx->buf_count > 1;
++}
++
++static inline void hw_mmal_pic_sub_buf_add(picture_t * const pic, MMAL_BUFFER_HEADER_T * const sub)
++{
++ pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++
++ if (ctx->buf_count >= CTX_BUFS_MAX) {
++ mmal_buffer_header_release(sub);
++ return;
++ }
++
++ ctx->bufs[ctx->buf_count++] = sub;
++}
++
++static inline MMAL_BUFFER_HEADER_T * hw_mmal_pic_sub_buf_get(picture_t * const pic, const unsigned int n)
++{
++    pic_ctx_mmal_t * const ctx = (pic_ctx_mmal_t *)pic->context;
++    // Sub-buffer n lives in bufs[n + 1]; only slots below buf_count are in use
++    return n + 1 >= ctx->buf_count ? NULL : ctx->bufs[n + 1];
++}
++
++static inline bool hw_mmal_chroma_is_mmal(const vlc_fourcc_t chroma)
++{
++ return
++ chroma == VLC_CODEC_MMAL_OPAQUE ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND8 ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ chroma == VLC_CODEC_MMAL_ZC_RGB32;
++}
++
++static inline bool hw_mmal_pic_is_mmal(const picture_t * const pic)
++{
++ return hw_mmal_chroma_is_mmal(pic->format.i_chroma);
++}
++
++picture_context_t * hw_mmal_pic_ctx_copy(picture_context_t * pic_ctx_cmn);
++void hw_mmal_pic_ctx_destroy(picture_context_t * pic_ctx_cmn);
++picture_context_t * hw_mmal_gen_context(
++ MMAL_BUFFER_HEADER_T * buf, hw_mmal_port_pool_ref_t * const ppr);
++
++int hw_mmal_get_gpu_mem(void);
++
++
++static inline MMAL_STATUS_T port_parameter_set_uint32(MMAL_PORT_T * port, uint32_t id, uint32_t val)
++{
++ const MMAL_PARAMETER_UINT32_T param = {
++ .hdr = {.id = id, .size = sizeof(MMAL_PARAMETER_UINT32_T)},
++ .value = val
++ };
++ return mmal_port_parameter_set(port, &param.hdr);
++}
++
++static inline MMAL_STATUS_T port_parameter_set_bool(MMAL_PORT_T * const port, const uint32_t id, const bool val)
++{
++ const MMAL_PARAMETER_BOOLEAN_T param = {
++ .hdr = {.id = id, .size = sizeof(MMAL_PARAMETER_BOOLEAN_T)},
++ .enable = val
++ };
++ return mmal_port_parameter_set(port, &param.hdr);
++}
++
++static inline MMAL_STATUS_T port_send_replicated(MMAL_PORT_T * const port, MMAL_POOL_T * const rep_pool,
++    MMAL_BUFFER_HEADER_T * const src_buf,
++    const uint64_t seq)
++{
++    MMAL_STATUS_T err;
++    MMAL_BUFFER_HEADER_T *const rep_buf = mmal_queue_wait(rep_pool->queue);
++
++    if (rep_buf == NULL)
++        return MMAL_ENOSPC;
++
++    // Replicate src_buf into rep_buf, stamp it with seq (carried in pts) and send it
++    if ((err = mmal_buffer_header_replicate(rep_buf, src_buf)) == MMAL_SUCCESS)
++    {
++        rep_buf->pts = seq;
++        err = mmal_port_send_buffer(port, rep_buf);
++    }
++
++    // Whichever step failed, release rep_buf rather than leaking it
++    if (err != MMAL_SUCCESS)
++        mmal_buffer_header_release(rep_buf);
++
++    return err;
++}
++
++
++static inline void pic_to_buf_copy_props(MMAL_BUFFER_HEADER_T * const buf, const picture_t * const pic)
++{
++ if (!pic->b_progressive)
++ {
++ buf->flags |= MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED;
++ buf->type->video.flags |= MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED;
++ }
++ else
++ {
++ buf->flags &= ~MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED;
++ buf->type->video.flags &= ~MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED;
++ }
++ if (pic->b_top_field_first)
++ {
++ buf->flags |= MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++ buf->type->video.flags |= MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++ }
++ else
++ {
++ buf->flags &= ~MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++ buf->type->video.flags &= ~MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST;
++ }
++ buf->pts = pic->date != VLC_TICK_INVALID ? pic->date : MMAL_TIME_UNKNOWN;
++ buf->dts = buf->pts;
++}
++
++static inline void buf_to_pic_copy_props(picture_t * const pic, const MMAL_BUFFER_HEADER_T * const buf)
++{
++ // Contrary to docn the interlace & tff flags turn up in the header flags rather than the
++ // video specific flags (which appear to be currently unused).
++ pic->b_progressive = (buf->flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED) == 0;
++ pic->b_top_field_first = (buf->flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_TOP_FIELD_FIRST) != 0;
++
++ pic->date = buf->pts != MMAL_TIME_UNKNOWN ? buf->pts :
++ buf->dts != MMAL_TIME_UNKNOWN ? buf->dts :
++ VLC_TICK_INVALID;
++}
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_copied(const picture_t *const pic,
++ MMAL_POOL_T * const rep_pool,
++ MMAL_PORT_T * const port,
++ cma_buf_pool_t * const cbp);
++
++MMAL_BUFFER_HEADER_T * hw_mmal_pic_buf_replicated(const picture_t *const pic, MMAL_POOL_T * const rep_pool);
++
++struct vzc_pool_ctl_s;
++typedef struct vzc_pool_ctl_s vzc_pool_ctl_t;
++
++// At the moment we cope with any mono-planar RGBA thing
++// We could cope with many other things but they currently don't occur
++extern const vlc_fourcc_t hw_mmal_vzc_subpicture_chromas[];
++static inline bool hw_mmal_vzc_subpic_fmt_valid(const video_frame_format_t * const vf_vlc)
++{
++ const vlc_fourcc_t vfcc_src = vf_vlc->i_chroma;
++ for (const vlc_fourcc_t * p = hw_mmal_vzc_subpicture_chromas; *p != 0; ++p)
++ if (*p == vfcc_src)
++ return true;
++
++ return false;
++}
++
++bool hw_mmal_vzc_buf_set_format(MMAL_BUFFER_HEADER_T * const buf, MMAL_ES_FORMAT_T * const es_fmt);
++MMAL_DISPLAYREGION_T * hw_mmal_vzc_buf_region(MMAL_BUFFER_HEADER_T * const buf);
++void hw_mmal_vzc_buf_scale_dest_rect(MMAL_BUFFER_HEADER_T * const buf, const MMAL_RECT_T * const scale_rect, const MMAL_DISPLAYTRANSFORM_T scale_transform);
++void hw_mmal_vzc_buf_get_wh(MMAL_BUFFER_HEADER_T * const buf, int * const pW, int * const pH);
++unsigned int hw_mmal_vzc_buf_seq(MMAL_BUFFER_HEADER_T * const buf);
++MMAL_BUFFER_HEADER_T * hw_mmal_vzc_buf_from_pic(vzc_pool_ctl_t * const pc, picture_t * const pic,
++ const MMAL_RECT_T dst_pic_rect,
++ const int x_offset, const int y_offset,
++ const unsigned int alpha, const bool is_first);
++void hw_mmal_vzc_buf_frame_size(MMAL_BUFFER_HEADER_T * const buf,
++ uint32_t * const pWidth, uint32_t * const pHeight);
++
++void hw_mmal_vzc_pool_flush(vzc_pool_ctl_t * const pc);
++void hw_mmal_vzc_pool_release(vzc_pool_ctl_t * const pc);
++void hw_mmal_vzc_pool_ref(vzc_pool_ctl_t * const pc);
++vzc_pool_ctl_t * hw_mmal_vzc_pool_new(void);
++
++
++static inline MMAL_RECT_T vis_mmal_rect(const video_format_t * const fmt)
++{
++ return (MMAL_RECT_T){
++ .x = fmt->i_x_offset,
++ .y = fmt->i_y_offset,
++ .width = fmt->i_visible_width,
++ .height = fmt->i_visible_height
++ };
++}
++
++int cma_pic_set_data(picture_t * const pic,
++ const MMAL_ES_FORMAT_T * const mm_esfmt,
++ const MMAL_BUFFER_HEADER_T * const buf);
++
++// Attaches cma buf to pic
++// Marks in_flight if not all_in_flight anyway
++int cma_buf_pic_attach(cma_buf_t * const cb, picture_t * const pic);
++// Returns a pointer to the cma_buf attached to the pic
++// Just a pointer - doesn't add a ref
++cma_buf_t * cma_buf_pic_get(picture_t * const pic);
++
++static inline bool is_cma_buf_pic_chroma(const uint32_t chroma)
++{
++ return chroma == VLC_CODEC_MMAL_ZC_RGB32 ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND8 ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ chroma == VLC_CODEC_MMAL_ZC_I420;
++}
++
++
++int rpi_get_model_type(void);
++bool rpi_is_model_pi4(void);
++bool rpi_is_fkms_active(void);
++
++typedef enum vcsm_init_type_e {
++ VCSM_INIT_NONE = 0,
++ VCSM_INIT_LEGACY,
++ VCSM_INIT_CMA
++} vcsm_init_type_t;
++
++vcsm_init_type_t cma_vcsm_init(void);
++void cma_vcsm_exit(const vcsm_init_type_t init_mode);
++vcsm_init_type_t cma_vcsm_type(void);
++const char * cma_vcsm_init_str(const vcsm_init_type_t init_mode);
++
+
+- MMAL_BUFFER_HEADER_T *buffer;
+- bool displayed;
+-};
++#define VOUT_DISPLAY_CHANGE_MMAL_BASE 1024
++#define VOUT_DISPLAY_CHANGE_MMAL_HIDE (VOUT_DISPLAY_CHANGE_MMAL_BASE + 0)
+
+-int mmal_picture_lock(picture_t *picture);
++#define MMAL_COMPONENT_DEFAULT_RESIZER "vc.ril.resize"
++#define MMAL_COMPONENT_ISP_RESIZER "vc.ril.isp"
++#define MMAL_COMPONENT_HVS "vc.ril.hvs"
+
+ #endif
+--- /dev/null
++++ b/modules/hw/mmal/rpi_prof.h
+@@ -0,0 +1,110 @@
++#ifndef RPI_PROFILE_H
++#define RPI_PROFILE_H
++
++#include <stdint.h>
++#include <inttypes.h>
++
++#ifndef RPI_PROFILE
++#define RPI_PROFILE 0
++#endif
++
++#if RPI_PROFILE
++
++#include "v7_pmu.h"
++
++#ifdef RPI_PROC_ALLOC
++#define X volatile
++#define Z =0
++#else
++#define X extern volatile
++#define Z
++#endif
++
++X uint64_t av_rpi_prof0_cycles Z;
++X unsigned int av_rpi_prof0_cnt Z;
++#define RPI_prof0_MAX_DURATION 100000
++
++X uint64_t av_rpi_prof1_cycles Z;
++X unsigned int av_rpi_prof1_cnt Z;
++#define RPI_prof1_MAX_DURATION 100000
++
++X uint64_t av_rpi_prof2_cycles Z;
++X unsigned int av_rpi_prof2_cnt Z;
++#define RPI_prof2_MAX_DURATION 10000
++
++X uint64_t av_rpi_prof_n_cycles[128];
++X unsigned int av_rpi_prof_n_cnt[128];
++#define RPI_prof_n_MAX_DURATION 10000
++
++
++#undef X
++#undef Z
++
++#define PROFILE_INIT()\
++do {\
++ enable_pmu();\
++ enable_ccnt();\
++} while (0)
++
++#define PROFILE_START()\
++do {\
++ volatile uint32_t perf_1 = read_ccnt();\
++ volatile uint32_t perf_2
++
++
++#define PROFILE_ACC(x)\
++ perf_2 = read_ccnt();\
++ {\
++ const uint32_t duration = perf_2 - perf_1;\
++ if (duration < RPI_##x##_MAX_DURATION)\
++ {\
++ av_rpi_##x##_cycles += duration;\
++ av_rpi_##x##_cnt += 1;\
++ }\
++ }\
++} while(0)
++
++
++#define PROFILE_ACC_N(n)\
++ if ((n) >= 0) {\
++ perf_2 = read_ccnt();\
++ {\
++ const uint32_t duration = perf_2 - perf_1;\
++ if (duration < RPI_prof_n_MAX_DURATION)\
++ {\
++ av_rpi_prof_n_cycles[n] += duration;\
++ av_rpi_prof_n_cnt[n] += 1;\
++ }\
++ }\
++ }\
++} while(0)
++
++#define PROFILE_PRINTF(x)\
++ printf("%-20s cycles=%14" PRIu64 "; cnt=%8u; avg=%5" PRIu64 "\n", #x, av_rpi_##x##_cycles, av_rpi_##x##_cnt,\
++ av_rpi_##x##_cnt == 0 ? (uint64_t)0 : av_rpi_##x##_cycles / (uint64_t)av_rpi_##x##_cnt)
++
++#define PROFILE_PRINTF_N(n)\
++ printf("prof[%d] cycles=%14" PRIu64 "; cnt=%8u; avg=%5" PRIu64 "\n", (n), av_rpi_prof_n_cycles[n], av_rpi_prof_n_cnt[n],\
++ av_rpi_prof_n_cnt[n] == 0 ? (uint64_t)0 : av_rpi_prof_n_cycles[n] / (uint64_t)av_rpi_prof_n_cnt[n])
++
++#define PROFILE_CLEAR_N(n) \
++do {\
++ av_rpi_prof_n_cycles[n] = 0;\
++ av_rpi_prof_n_cnt[n] = 0;\
++} while(0)
++
++#else
++
++// No profile
++#define PROFILE_INIT()
++#define PROFILE_START()
++#define PROFILE_ACC(x)
++#define PROFILE_ACC_N(x)
++#define PROFILE_PRINTF(x)
++#define PROFILE_PRINTF_N(x)
++#define PROFILE_CLEAR_N(n)
++
++#endif
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/subpic.c
+@@ -0,0 +1,257 @@
++/*****************************************************************************
++ * subpic.c: MMAL subpicture port helpers for Raspberry Pi
++ *****************************************************************************
++ * Authors: jc@kynesim.co.uk
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU Lesser General Public License as published by
++ * the Free Software Foundation; either version 2.1 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public License
++ * along with this program; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
++ *****************************************************************************/
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <stdatomic.h>
++
++#include <vlc_common.h>
++#include <vlc_plugin.h>
++#include <vlc_codec.h>
++#include <vlc_filter.h>
++#include <vlc_threads.h>
++
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++
++#include "mmal_picture.h"
++#include "subpic.h"
++
++
++#define TRACE_ALL 0
++
++static inline bool cmp_rect(const MMAL_RECT_T * const a, const MMAL_RECT_T * const b)
++{
++ return a->x == b->x && a->y == b->y && a->width == b->width && a->height == b->height;
++}
++
++void hw_mmal_subpic_flush(vlc_object_t * const p_filter, subpic_reg_stash_t * const sub)
++{
++ VLC_UNUSED(p_filter);
++ if (sub->port != NULL && sub->port->is_enabled)
++ mmal_port_disable(sub->port);
++ sub->seq = 0;
++}
++
++void hw_mmal_subpic_close(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe)
++{
++ hw_mmal_subpic_flush(p_filter, spe);
++
++ if (spe->pool != NULL)
++ mmal_pool_destroy(spe->pool);
++
++ // Zap to avoid any accidental reuse
++ *spe = (subpic_reg_stash_t){NULL};
++}
++
++MMAL_STATUS_T hw_mmal_subpic_open(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, MMAL_PORT_T * const port,
++ const int display_id, const unsigned int layer)
++{
++ MMAL_STATUS_T err;
++
++ // Start by zapping all to zero
++ *spe = (subpic_reg_stash_t){NULL};
++
++ if ((err = port_parameter_set_bool(port, MMAL_PARAMETER_ZERO_COPY, true)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Failed to set sub port zero copy");
++ return err;
++ }
++
++ if ((spe->pool = mmal_pool_create(30, 0)) == NULL)
++ {
++ msg_Err(p_filter, "Failed to create sub pool");
++ return MMAL_ENOMEM;
++ }
++
++ port->userdata = (void *)p_filter;
++ spe->port = port;
++ spe->display_id = display_id;
++ spe->layer = layer;
++
++ return MMAL_SUCCESS;
++}
++
++static void conv_subpic_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{ // Port callback passed to mmal_port_enable in hw_mmal_subpic_update: just releases the buffer
++#if TRACE_ALL
++ msg_Dbg((filter_t *)port->userdata, "<<< %s cmd=%d, user=%p, buf=%p, flags=%#x, len=%d/%d, pts=%lld",
++ __func__, buf->cmd, buf->user_data, buf, buf->flags, buf->length, buf->alloc_size, (long long)buf->pts);
++#else
++ VLC_UNUSED(port); // port is only needed for tracing
++#endif
++
++ mmal_buffer_header_release(buf); // Will extract & release pic in pool callback
++}
++
++static int
++subpic_send_empty(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, const uint64_t pts)
++{ // Send a data-less buffer to the subpic port; returns 0 on success, -1 on failure
++ MMAL_BUFFER_HEADER_T *const buf = mmal_queue_wait(spe->pool->queue); // NOTE(review): presumably blocks until a header is free - confirm
++ MMAL_STATUS_T err;
++
++ if (buf == NULL) {
++ msg_Err(p_filter, "Buffer get for subpic failed");
++ return -1;
++ }
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Remove pic for sub %d", spe->seq);
++#endif
++ buf->cmd = 0;
++ buf->data = NULL; // No payload: data pointer and alloc_size are both cleared
++ buf->alloc_size = 0;
++ buf->offset = 0;
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++ buf->pts = pts;
++ buf->dts = MMAL_TIME_UNKNOWN;
++ buf->user_data = NULL;
++
++ if ((err = mmal_port_send_buffer(spe->port, buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to subput failed");
++ mmal_buffer_header_release(buf); // Send failed, so hand the header back ourselves
++ return -1;
++ }
++ return 0;
++}
++
++// Returns: < 0 Error (the code below always uses -1)
++// 0 Done & stop (never returned by the code below)
++// 1 Done & continue (also returned when sub_buf is NULL)
++
++int hw_mmal_subpic_update(vlc_object_t * const p_filter,
++ MMAL_BUFFER_HEADER_T * const sub_buf,
++ subpic_reg_stash_t * const spe,
++ const video_format_t * const fmt,
++ const MMAL_RECT_T * const scale_out,
++ const MMAL_DISPLAYTRANSFORM_T transform_out,
++ const uint64_t pts)
++{
++ MMAL_STATUS_T err;
++
++ if (sub_buf == NULL) // No subpicture supplied for this call
++ {
++ if (spe->port->is_enabled && spe->seq != 0) // Something was sent earlier
++ {
++ subpic_send_empty(p_filter, spe, pts); // Result is deliberately not checked here
++ spe->seq = 0;
++ }
++ }
++ else
++ {
++ const unsigned int seq = hw_mmal_vzc_buf_seq(sub_buf);
++ bool needs_update = (spe->seq != seq); // Different seq from the last one sent => resend
++
++ hw_mmal_vzc_buf_scale_dest_rect(sub_buf, scale_out, transform_out);
++
++ if (hw_mmal_vzc_buf_set_format(sub_buf, spe->port->format)) // NOTE(review): presumably true when the port format was (re)written - confirm
++ {
++ MMAL_DISPLAYREGION_T * const dreg = hw_mmal_vzc_buf_region(sub_buf);
++ MMAL_VIDEO_FORMAT_T *const v_fmt = &spe->port->format->es->video;
++
++ v_fmt->frame_rate.den = fmt->i_frame_rate_base;
++ v_fmt->frame_rate.num = fmt->i_frame_rate;
++ v_fmt->par.den = fmt->i_sar_den;
++ v_fmt->par.num = fmt->i_sar_num;
++ v_fmt->color_space = MMAL_COLOR_SPACE_UNKNOWN;
++
++ if (needs_update || dreg->alpha != spe->alpha || !cmp_rect(&dreg->dest_rect, &spe->dest_rect)) {
++
++ spe->alpha = dreg->alpha; // Refresh the shadow copies used by the change test above
++ spe->dest_rect = dreg->dest_rect;
++ needs_update = true;
++
++ if (spe->display_id >= 0) // Negative display_id => leave the display number unset
++ {
++ dreg->display_num = spe->display_id;
++ dreg->set |= MMAL_DISPLAY_SET_NUM;
++ }
++ dreg->layer = spe->layer;
++ dreg->set |= MMAL_DISPLAY_SET_LAYER;
++
++#if TRACE_ALL
++ msg_Dbg(p_filter, "%s: Update region: Set=%x, dest=%dx%d @ (%d,%d), src=%dx%d @ (%d,%d), layer=%d, alpha=%#x",
++ __func__, dreg->set,
++ dreg->dest_rect.width, dreg->dest_rect.height, dreg->dest_rect.x, dreg->dest_rect.y,
++ dreg->src_rect.width, dreg->src_rect.height, dreg->src_rect.x, dreg->src_rect.y,
++ dreg->layer, dreg->alpha);
++#endif
++
++ // If now completely offscreen just flush this & return
++ // We only do -ve as (a) that is easy and (b) it seems to be
++ // something that can confuse mmal
++ if (dreg->dest_rect.y + dreg->dest_rect.height <= 0 ||
++ dreg->dest_rect.x + dreg->dest_rect.width <= 0)
++ {
++ if (spe->port->is_enabled)
++ subpic_send_empty(p_filter, spe, pts);
++ spe->seq = seq; // Remember this seq so the same buffer is not treated as new next time
++ return 1;
++ }
++
++ if ((err = mmal_port_parameter_set(spe->port, &dreg->hdr)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Set display region on subput failed");
++ return -1;
++ }
++
++ if ((err = mmal_port_format_commit(spe->port)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(p_filter, "%s: Subpic commit fail: %d", __func__, err);
++ return -1;
++ }
++ }
++ }
++
++ if (!spe->port->is_enabled) // Enable the port on first use (or after a flush disabled it)
++ {
++ spe->port->buffer_num = 30;
++ spe->port->buffer_size = spe->port->buffer_size_recommended; // Not used but shuts up the error checking
++
++ if ((err = mmal_port_enable(spe->port, conv_subpic_cb)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(p_filter, "%s: Subpic enable fail: %d", __func__, err);
++ return -1;
++ }
++ }
++
++ if (needs_update)
++ {
++#if TRACE_ALL
++ msg_Dbg(p_filter, "Update pic for sub %d", spe->seq);
++#endif
++ if ((err = port_send_replicated(spe->port, spe->pool, sub_buf, pts)) != MMAL_SUCCESS)
++ {
++ msg_Err(p_filter, "Send buffer to subput failed");
++ return -1;
++ }
++
++ spe->seq = seq; // Only recorded once the send has succeeded
++ }
++ }
++ return 1;
++}
++
++
++
+--- /dev/null
++++ b/modules/hw/mmal/subpic.h
+@@ -0,0 +1,33 @@
++#ifndef VLC_HW_MMAL_SUBPIC_H_
++#define VLC_HW_MMAL_SUBPIC_H_
++
++typedef struct subpic_reg_stash_s
++{
++ MMAL_PORT_T * port; // Port the subpics are sent to (set by hw_mmal_subpic_open)
++ MMAL_POOL_T * pool; // Buffer header pool created by hw_mmal_subpic_open
++ int display_id; // -1 (any negative value) => do not set
++ unsigned int layer; // Layer written into the display region on update
++ // Shadow vars so we can tell if stuff has changed
++ MMAL_RECT_T dest_rect; // Last destination rect seen by hw_mmal_subpic_update
++ unsigned int alpha; // Last alpha seen by hw_mmal_subpic_update
++ unsigned int seq; // Seq of the last subpic handled; reset to 0 on flush/removal
++} subpic_reg_stash_t;
++
++int hw_mmal_subpic_update(vlc_object_t * const p_filter,
++ MMAL_BUFFER_HEADER_T * const sub_buf,
++ subpic_reg_stash_t * const spe,
++ const video_format_t * const fmt,
++ const MMAL_RECT_T * const scale_out,
++ const MMAL_DISPLAYTRANSFORM_T transform_out,
++ const uint64_t pts);
++
++void hw_mmal_subpic_flush(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe);
++
++void hw_mmal_subpic_close(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe);
++
++// If display id is -1 it will be unset
++MMAL_STATUS_T hw_mmal_subpic_open(vlc_object_t * const p_filter, subpic_reg_stash_t * const spe, MMAL_PORT_T * const port,
++ const int display_id, const unsigned int layer);
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/transform_ops.h
+@@ -0,0 +1,99 @@
++#ifndef VLC_MMAL_TRANSFORM_OPS_H
++#define VLC_MMAL_TRANSFORM_OPS_H
++
++#include <vlc_common.h>
++#include <vlc_picture.h>
++#include <interface/mmal/mmal.h>
++
++
++// VLC orientation -> MMAL display transform: these are enums with the same order so simply coerce
++static inline MMAL_DISPLAYTRANSFORM_T vlc_to_mmal_transform(const video_orientation_t orientation){
++ return (MMAL_DISPLAYTRANSFORM_T)orientation;
++}
++
++// MMAL headers comment these (getting 2 a bit wrong) but do not give
++// defines
++#define XFORM_H_SHIFT 0 // Hflip
++#define XFORM_V_SHIFT 1 // Vflip
++#define XFORM_T_SHIFT 2 // Transpose
++#define XFORM_H_BIT (1 << XFORM_H_SHIFT)
++#define XFORM_V_BIT (1 << XFORM_V_SHIFT)
++#define XFORM_T_BIT (1 << XFORM_T_SHIFT)
++
++static inline bool
++is_transform_transpose(const MMAL_DISPLAYTRANSFORM_T t)
++{ // True when the transpose bit (XFORM_T_BIT) is set in t
++ return ((unsigned int)t & XFORM_T_BIT) != 0;
++}
++
++static inline bool
++is_transform_hflip(const MMAL_DISPLAYTRANSFORM_T t)
++{ // True when the horizontal-flip bit (XFORM_H_BIT) is set in t
++ return ((unsigned int)t & XFORM_H_BIT) != 0;
++}
++
++static inline bool
++is_transform_vflip(const MMAL_DISPLAYTRANSFORM_T t)
++{ // True when the vertical-flip bit (XFORM_V_BIT) is set in t
++ return ((unsigned int)t & XFORM_V_BIT) != 0;
++}
++
++static inline MMAL_DISPLAYTRANSFORM_T
++swap_transform_hv(const MMAL_DISPLAYTRANSFORM_T x)
++{ // Exchange the H-flip and V-flip bits of x
++ return (((x >> XFORM_H_SHIFT) & 1) << XFORM_V_SHIFT) | // old H bit -> V position
++ (((x >> XFORM_V_SHIFT) & 1) << XFORM_H_SHIFT) | // old V bit -> H position
++ (x & XFORM_T_BIT); // transpose bit is kept as-is
++}
++
++static inline MMAL_DISPLAYTRANSFORM_T
++transform_inverse(const MMAL_DISPLAYTRANSFORM_T x)
++{ // Without transpose x is returned unchanged; with transpose the H & V flip bits are swapped
++ return is_transform_transpose(x) ? swap_transform_hv(x) : x;
++}
++
++// Returns the single transform equivalent to applying a and then b
++// All ops are self inverse so can simply be XORed on their own
++// H & V flips after a transpose need to be swapped (hence swap_transform_hv(b) when a transposes)
++static inline MMAL_DISPLAYTRANSFORM_T
++combine_transform(const MMAL_DISPLAYTRANSFORM_T a, const MMAL_DISPLAYTRANSFORM_T b)
++{
++ return a ^ (is_transform_transpose(a) ? swap_transform_hv(b) : b);
++}
++
++static inline MMAL_RECT_T
++rect_transpose(const MMAL_RECT_T s)
++{ // Swap the axes of s: x <-> y and width <-> height
++ return (MMAL_RECT_T){
++ .x = s.y,
++ .y = s.x,
++ .width = s.height,
++ .height = s.width
++ };
++}
++
++// hflip s in c: mirror rect s left-right inside container rect c (size and y are kept)
++static inline MMAL_RECT_T rect_hflip(const MMAL_RECT_T s, const MMAL_RECT_T c)
++{
++ return (MMAL_RECT_T){
++ .x = c.x + (c.x + c.width) - (s.x + s.width), // new left = c.left + (c.right - s.right)
++ .y = s.y,
++ .width = s.width,
++ .height = s.height
++ };
++}
++
++// vflip s in c: mirror rect s top-bottom inside container rect c (size and x are kept)
++static inline MMAL_RECT_T rect_vflip(const MMAL_RECT_T s, const MMAL_RECT_T c)
++{
++ return (MMAL_RECT_T){
++ .x = s.x,
++ .y = (c.y + c.height) - (s.y - c.y) - s.height, // new top = c.bottom - (s.top - c.top) - s.height
++ .width = s.width,
++ .height = s.height
++ };
++}
++
++
++#endif
++
+--- /dev/null
++++ b/modules/hw/mmal/v7_pmu.S
+@@ -0,0 +1,263 @@
++/*------------------------------------------------------------
++Performance Monitor Block
++------------------------------------------------------------*/
++ .arm @ Make sure we are in ARM mode.
++ .text
++ .align 2
++ .global getPMN @ export this function for the linker
++
++/* Returns the number of programmable counters: uint32_t getPMN(void) */
++
++getPMN:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC Register */
++ MOV r0, r0, LSR #11 /* Shift N field down to bit 0 */
++ AND r0, r0, #0x1F /* Mask to leave just the 5 N bits */
++ BX lr
++
++
++
++ .global pmn_config @ export this function for the linker
++ /* Sets the event for a programmable counter to record */
++ /* void pmn_config(unsigned counter, uint32_t event) */
++ /* counter = r0 = Which counter to program (e.g. 0 for PMN0, 1 for PMN1 */
++ /* event = r1 = The event code */
++pmn_config:
++ AND r0, r0, #0x1F /* Mask to leave only bits 4:0 */
++ MCR p15, 0, r0, c9, c12, 5 /* Write PMNXSEL Register */
++ MCR p15, 0, r1, c9, c13, 1 /* Write EVTSELx Register */
++ BX lr
++
++
++
++ .global ccnt_divider @ export this function for the linker
++ /* Enables/disables the divider (1/64) on CCNT */
++ /* void ccnt_divider(int divider) */
++ /* divider = r0 = If 0 disable divider, else enable divider */
++ccnt_divider:
++ MRC p15, 0, r1, c9, c12, 0 /* Read PMNC */
++
++ CMP r0, #0x0 /* IF (r0 == 0) */
++ BICEQ r1, r1, #0x08 /* THEN: Clear the D bit (disables the divider) */
++ ORRNE r1, r1, #0x08 /* ELSE: Set the D bit (enables the divider) */
++
++ MCR p15, 0, r1, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Enable/Disable */
++ /* --------------------------------------------------------------- */
++
++ .global enable_pmu @ export this function for the linker
++ /* Global PMU enable */
++ /* void enable_pmu(void) */
++enable_pmu:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x01 /* Set E bit */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++
++ .global disable_pmu @ export this function for the linker
++ /* Global PMU disable */
++ /* void disable_pmu(void) */
++disable_pmu:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ BIC r0, r0, #0x01 /* Clear E bit */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++
++ .global enable_ccnt @ export this function for the linker
++ /* Enable the CCNT */
++ /* void enable_ccnt(void) */
++enable_ccnt:
++ MOV r0, #0x80000000 /* C bit (bit 31) selects the CCNT */
++ MCR p15, 0, r0, c9, c12, 1 /* Write CNTENS Register (enable set) */
++ BX lr
++
++
++
++ .global disable_ccnt @ export this function for the linker
++ /* Disable the CCNT */
++ /* void disable_ccnt(void) */
++disable_ccnt:
++ MOV r0, #0x80000000 /* C bit (bit 31) selects the CCNT */
++ MCR p15, 0, r0, c9, c12, 2 /* Write CNTENC Register (enable clear) */
++ BX lr
++
++
++
++ .global enable_pmn @ export this function for the linker
++ /* Enable PMN{n} */
++ /* void enable_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to enable (e.g. 0 for PMN0, 1 for PMN1) */
++enable_pmn:
++ MOV r1, #0x1 /* Use arg (r0) to build a one-bit mask */
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++
++ MCR p15, 0, r1, c9, c12, 1 /* Write CNTENS Register */
++ BX lr
++
++
++
++ .global disable_pmn @ export this function for the linker
++ /* Disable PMN{n} */
++ /* void disable_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to disable (e.g. 0 for PMN0, 1 for PMN1) */
++disable_pmn:
++ MOV r1, #0x1 /* Use arg (r0) to build a one-bit mask */
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++
++ MCR p15, 0, r1, c9, c12, 2 /* Write CNTENC Register (clear enable) */
++ BX lr
++
++
++
++ .global enable_pmu_user_access @ export this function for the linker
++ /* Enables User mode access to the PMU (must be called in a privileged mode) */
++ /* void enable_pmu_user_access(void) */
++enable_pmu_user_access:
++ MRC p15, 0, r0, c9, c14, 0 /* Read PMUSERENR Register */
++ ORR r0, r0, #0x01 /* Set EN bit (bit 0) */
++ MCR p15, 0, r0, c9, c14, 0 /* Write PMUSERENR Register */
++ BX lr
++
++
++
++ .global disable_pmu_user_access @ export this function for the linker
++ /* Disables User mode access to the PMU (must be called in a privileged mode) */
++ /* void disable_pmu_user_access(void) */
++disable_pmu_user_access:
++ MRC p15, 0, r0, c9, c14, 0 /* Read PMUSERENR Register */
++ BIC r0, r0, #0x01 /* Clear EN bit (bit 0) */
++ MCR p15, 0, r0, c9, c14, 0 /* Write PMUSERENR Register */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Counter read registers */
++ /* --------------------------------------------------------------- */
++
++ .global read_ccnt @ export this function for the linker
++ /* Returns the value of CCNT */
++ /* uint32_t read_ccnt(void) */
++read_ccnt:
++ MRC p15, 0, r0, c9, c13, 0 /* Read CCNT Register */
++ BX lr
++
++
++ .global read_pmn @ export this function for the linker
++ /* Returns the value of PMN{n} */
++ /* uint32_t read_pmn(uint32_t counter) */
++ /* counter = r0 = The counter to read (e.g. 0 for PMN0, 1 for PMN1) */
++read_pmn:
++ AND r0, r0, #0x1F /* Mask to leave only bits 4:0 */
++ MCR p15, 0, r0, c9, c12, 5 /* Write PMNXSEL Register */
++ MRC p15, 0, r0, c9, c13, 2 /* Read current PMNx Register */
++ BX lr
++
++
++ /* --------------------------------------------------------------- */
++ /* Software Increment */
++ /* --------------------------------------------------------------- */
++
++ .global pmu_software_increment @ export this function for the linker
++ /* Writes to software increment register */
++ /* void pmu_software_increment(uint32_t counter) */
++ /* counter = r0 = The counter to increment (e.g. 0 for PMN0, 1 for PMN1) */
++pmu_software_increment:
++ MOV r1, #0x01 /* Use arg (r0) to build a one-bit mask */
++ MOV r1, r1, LSL r0 /* r1 = 1 << counter */
++ MCR p15, 0, r1, c9, c12, 4 /* Write SWINCR Register */
++ BX lr
++
++ /* --------------------------------------------------------------- */
++ /* Overflow & Interrupt Generation */
++ /* --------------------------------------------------------------- */
++
++ .global read_flags @ export this function for the linker
++ /* Returns the value of the overflow flags */
++ /* uint32_t read_flags(void) */
++read_flags:
++ MRC p15, 0, r0, c9, c12, 3 /* Read FLAG Register */
++ BX lr
++
++
++ .global write_flags @ export this function for the linker
++ /* Writes the overflow flags */
++ /* void write_flags(uint32_t flags) */
++write_flags:
++ MCR p15, 0, r0, c9, c12, 3 /* Write FLAG Register */
++ BX lr
++
++
++ .global enable_ccnt_irq @ export this function for the linker
++ /* Enables interrupt generation on overflow of the CCNT */
++ /* void enable_ccnt_irq(void) */
++enable_ccnt_irq:
++ MOV r0, #0x80000000 /* Bit 31 selects the CCNT */
++ MCR p15, 0, r0, c9, c14, 1 /* Write INTENS Register */
++ BX lr
++
++ .global disable_ccnt_irq @ export this function for the linker
++ /* Disables interrupt generation on overflow of the CCNT */
++ /* void disable_ccnt_irq(void) */
++disable_ccnt_irq:
++ MOV r0, #0x80000000 /* Bit 31 selects the CCNT */
++ MCR p15, 0, r0, c9, c14, 2 /* Write INTENC Register */
++ BX lr
++
++
++ .global enable_pmn_irq @ export this function for the linker
++ /* Enables interrupt generation on overflow of PMN{x} */
++ /* void enable_pmn_irq(uint32_t counter) */
++ /* counter = r0 = The counter to enable the interrupt for (e.g. 0 for PMN0) */
++enable_pmn_irq:
++ MOV r1, #0x1 /* Use arg (r0) to build a one-bit mask */
++ MOV r0, r1, LSL r0 /* r0 = 1 << counter */
++ MCR p15, 0, r0, c9, c14, 1 /* Write INTENS Register */
++ BX lr
++
++ .global disable_pmn_irq @ export this function for the linker
++ /* Disables interrupt generation on overflow of PMN{x} */
++ /* void disable_pmn_irq(uint32_t counter) */
++ /* counter = r0 = The counter to disable the interrupt for (e.g. 0 for PMN0) */
++disable_pmn_irq:
++ MOV r1, #0x1 /* Use arg (r0) to build a one-bit mask */
++ MOV r0, r1, LSL r0 /* r0 = 1 << counter */
++ MCR p15, 0, r0, c9, c14, 2 /* Write INTENC Register */
++ BX lr
++
++ /* --------------------------------------------------------------- */
++ /* Reset Functions */
++ /* --------------------------------------------------------------- */
++
++ .global reset_pmn @ export this function for the linker
++ /* Resets the programmable counters */
++ /* void reset_pmn(void) */
++reset_pmn:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x02 /* Set P bit (Event Counter Reset) */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ .global reset_ccnt @ export this function for the linker
++ /* Resets the CCNT */
++ /* void reset_ccnt(void) */
++reset_ccnt:
++ MRC p15, 0, r0, c9, c12, 0 /* Read PMNC */
++ ORR r0, r0, #0x04 /* Set C bit (resets the CCNT) */
++ MCR p15, 0, r0, c9, c12, 0 /* Write PMNC */
++ BX lr
++
++
++ .end @end of code, this line is optional.
++/* ------------------------------------------------------------ */
++/* End of v7_pmu.s */
++/* ------------------------------------------------------------ */
++
++
+--- /dev/null
++++ b/modules/hw/mmal/v7_pmu.h
+@@ -0,0 +1,113 @@
++// ------------------------------------------------------------
++// PMU for Cortex-A/R (v7-A/R)
++// ------------------------------------------------------------
++
++#ifndef _V7_PMU_H
++#define _V7_PMU_H
++
++// Returns the number of programmable counters
++unsigned int getPMN(void);
++
++// Sets the event for a programmable counter to record
++// counter = r0 = Which counter to program (e.g. 0 for PMN0, 1 for PMN1)
++// event = r1 = The event code (from appropriate TRM or ARM Architecture Reference Manual)
++void pmn_config(unsigned int counter, unsigned int event);
++
++// Enables/disables the divider (1/64) on CCNT
++// divider = r0 = If 0 disable divider, else enable divider
++void ccnt_divider(int divider);
++
++//
++// Enables and disables
++//
++
++// Global PMU enable
++// On ARM11 this enables the PMU, and the counters start immediately
++// On Cortex this enables the PMU, there are individual enables for the counters
++void enable_pmu(void);
++
++// Global PMU disable
++// On Cortex, this overrides the enable state of the individual counters
++void disable_pmu(void);
++
++// Enable the CCNT
++void enable_ccnt(void);
++
++// Disable the CCNT
++void disable_ccnt(void);
++
++// Enable PMN{n}
++// counter = The counter to enable (e.g. 0 for PMN0, 1 for PMN1)
++void enable_pmn(unsigned int counter);
++
++// Disable PMN{n}
++// counter = The counter to disable (e.g. 0 for PMN0, 1 for PMN1)
++void disable_pmn(unsigned int counter);
++
++//
++// Read counter values
++//
++
++// Returns the value of CCNT
++unsigned int read_ccnt(void);
++
++// Returns the value of PMN{n}
++// counter = The counter to read (e.g. 0 for PMN0, 1 for PMN1)
++unsigned int read_pmn(unsigned int counter);
++
++//
++// Overflow and interrupts
++//
++
++// Returns the value of the overflow flags
++unsigned int read_flags(void);
++
++// Writes the overflow flags
++void write_flags(unsigned int flags);
++
++// Enables interrupt generation on overflow of the CCNT
++void enable_ccnt_irq(void);
++
++// Disables interrupt generation on overflow of the CCNT
++void disable_ccnt_irq(void);
++
++// Enables interrupt generation on overflow of PMN{x}
++// counter = The counter to enable the interrupt for (e.g. 0 for PMN0, 1 for PMN1)
++void enable_pmn_irq(unsigned int counter);
++
++// Disables interrupt generation on overflow of PMN{x}
++// counter = r0 = The counter to disable the interrupt for (e.g. 0 for PMN0, 1 for PMN1)
++void disable_pmn_irq(unsigned int counter);
++
++//
++// Counter reset functions
++//
++
++// Resets the programmable counters
++void reset_pmn(void);
++
++// Resets the CCNT
++void reset_ccnt(void);
++
++//
++// Software Increment
++
++// Writes to software increment register
++// counter = The counter to increment (e.g. 0 for PMN0, 1 for PMN1)
++void pmu_software_increment(unsigned int counter);
++
++//
++// User mode access
++//
++
++// Enables User mode access to the PMU (must be called in a privileged mode)
++void enable_pmu_user_access(void);
++
++// Disables User mode access to the PMU (must be called in a privileged mode)
++void disable_pmu_user_access(void);
++
++#endif
++// ------------------------------------------------------------
++// End of v7_pmu.h
++// ------------------------------------------------------------
++
+--- a/modules/hw/mmal/vout.c
++++ b/modules/hw/mmal/vout.c
+@@ -27,21 +27,28 @@
+ #endif
+
+ #include <math.h>
++#include <stdatomic.h>
+
+ #include <vlc_common.h>
+-#include <vlc_atomic.h>
+ #include <vlc_plugin.h>
+ #include <vlc_threads.h>
+ #include <vlc_vout_display.h>
++#include <vlc_modules.h>
+
+-#include "mmal_picture.h"
+-
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Wbad-function-cast"
+ #include <bcm_host.h>
++#pragma GCC diagnostic pop
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/util/mmal_util.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/vmcs_host/vc_tvservice.h>
+-#include <interface/vmcs_host/vc_dispmanx.h>
++
++#include "mmal_picture.h"
++#include "subpic.h"
++#include "transform_ops.h"
++
++#define TRACE_ALL 0
+
+ #define MAX_BUFFERS_IN_TRANSIT 1
+ #define VC_TV_MAX_MODE_IDS 127
+@@ -50,10 +57,28 @@
+ #define MMAL_LAYER_TEXT N_("VideoCore layer where the video is displayed.")
+ #define MMAL_LAYER_LONGTEXT N_("VideoCore layer where the video is displayed. Subpictures are displayed directly above and a black background directly below.")
+
+-#define MMAL_BLANK_BACKGROUND_NAME "mmal-blank-background"
+-#define MMAL_BLANK_BACKGROUND_TEXT N_("Blank screen below video.")
+-#define MMAL_BLANK_BACKGROUND_LONGTEXT N_("Render blank screen below video. " \
+- "Increases VideoCore load.")
++#define MMAL_DISPLAY_NAME "mmal-display"
++#define MMAL_DISPLAY_TEXT N_("Output device for Rpi fullscreen.")
++#define MMAL_DISPLAY_LONGTEXT N_("Output device for Rpi fullscreen. " \
++"Valid values are HDMI-1,HDMI-2. By default if qt-fullscreen-screennumber " \
++"is specified (or set by Fullscreen Output Device in Preferences) " \
++"HDMI-<qt-fullscreen-screennumber+1> will be used, otherwise HDMI-1.")
++
++#define MMAL_VOUT_TRANSFORM_NAME "mmal-vout-transform"
++#define MMAL_VOUT_TRANSFORM_TEXT N_("Video transform for Rpi fullscreen.")
++#define MMAL_VOUT_TRANSFORM_LONGTEXT N_("Video transform for Rpi fullscreen."\
++" Transforms available: auto, 0, 90, 180, 270, hflip, vflip, transpose, antitranspose")
++
++#define MMAL_VOUT_WINDOW_NAME "mmal-vout-window"
++#define MMAL_VOUT_WINDOW_TEXT N_("Display window for Rpi fullscreen")
++#define MMAL_VOUT_WINDOW_LONGTEXT N_("Display window for Rpi fullscreen."\
++" fullscreen|<width>x<height>+<x>+<y>")
++
++#define MMAL_VOUT_TRANSPARENT_NAME "mmal-vout-transparent"
++#define MMAL_VOUT_TRANSPARENT_TEXT N_("Enable layers beneath the video layer.")
++#define MMAL_VOUT_TRANSPARENT_LONGTEXT N_("Enable layers beneath the video layer."\
++" By default these are disabled."\
++" Having the lower layers enabled can impact video performance")
+
+ #define MMAL_ADJUST_REFRESHRATE_NAME "mmal-adjust-refreshrate"
+ #define MMAL_ADJUST_REFRESHRATE_TEXT N_("Adjust HDMI refresh rate to the video.")
+@@ -68,332 +93,628 @@
+ #define PHASE_OFFSET_TARGET ((double)0.25)
+ #define PHASE_CHECK_INTERVAL 100
+
+-static int Open(vlc_object_t *);
+-static void Close(vlc_object_t *);
+-
+-vlc_module_begin()
+- set_shortname(N_("MMAL vout"))
+- set_description(N_("MMAL-based vout plugin for Raspberry Pi"))
+- set_capability("vout display", 90)
+- add_shortcut("mmal_vout")
+- add_integer(MMAL_LAYER_NAME, 1, MMAL_LAYER_TEXT, MMAL_LAYER_LONGTEXT, false)
+- add_bool(MMAL_BLANK_BACKGROUND_NAME, true, MMAL_BLANK_BACKGROUND_TEXT,
+- MMAL_BLANK_BACKGROUND_LONGTEXT, true);
+- add_bool(MMAL_ADJUST_REFRESHRATE_NAME, false, MMAL_ADJUST_REFRESHRATE_TEXT,
+- MMAL_ADJUST_REFRESHRATE_LONGTEXT, false)
+- add_bool(MMAL_NATIVE_INTERLACED, false, MMAL_NATIVE_INTERLACE_TEXT,
+- MMAL_NATIVE_INTERLACE_LONGTEXT, false)
+- set_callbacks(Open, Close)
+-vlc_module_end()
++#define SUBS_MAX 4
+
+-struct dmx_region_t {
+- struct dmx_region_t *next;
+- picture_t *picture;
+- VC_RECT_T bmp_rect;
+- VC_RECT_T src_rect;
+- VC_RECT_T dst_rect;
+- VC_DISPMANX_ALPHA_T alpha;
+- DISPMANX_ELEMENT_HANDLE_T element;
+- DISPMANX_RESOURCE_HANDLE_T resource;
+- int32_t pos_x;
+- int32_t pos_y;
+-};
++typedef struct vout_subpic_s {
++ MMAL_COMPONENT_T *component;
++ subpic_reg_stash_t sub;
++} vout_subpic_t;
+
+ struct vout_display_sys_t {
+- vlc_cond_t buffer_cond;
+- vlc_mutex_t buffer_mutex;
+ vlc_mutex_t manage_mutex;
+
+- plane_t planes[3]; /* Depending on video format up to 3 planes are used */
+- picture_t **pictures; /* Actual list of alloced pictures passed into picture_pool */
+- picture_pool_t *picture_pool;
+-
++ vcsm_init_type_t init_type;
+ MMAL_COMPONENT_T *component;
+ MMAL_PORT_T *input;
+ MMAL_POOL_T *pool; /* mmal buffer headers, used for pushing pictures to component*/
+- struct dmx_region_t *dmx_region;
+ int i_planes; /* Number of actually used planes, 1 for opaque, 3 for i420 */
+
+- uint32_t buffer_size; /* size of actual mmal buffers */
+ int buffers_in_transit; /* number of buffers currently pushed to mmal component */
+ unsigned num_buffers; /* number of buffers allocated at mmal port */
+
+- DISPMANX_DISPLAY_HANDLE_T dmx_handle;
+- DISPMANX_ELEMENT_HANDLE_T bkg_element;
+- DISPMANX_RESOURCE_HANDLE_T bkg_resource;
+- unsigned display_width;
+- unsigned display_height;
++ int display_id;
++ MMAL_RECT_T win_rect; // Window rect after transform(s)
++ MMAL_RECT_T display_rect; // Actual shape of display (x, y always 0)
++ MMAL_RECT_T req_win; // User requested window (w=0 => fullscreen)
++
++ MMAL_RECT_T spu_rect; // Output rectangle in cfg coords (for subpic placement)
++ MMAL_RECT_T dest_rect; // Output rectangle in display coords
++ MMAL_DISPLAYTRANSFORM_T dest_transform; // Dest window coord transform
++ MMAL_DISPLAYTRANSFORM_T display_transform; // "Native" display transform
++ MMAL_DISPLAYTRANSFORM_T video_transform; // Combined config+native transform
+
+- int i_frame_rate_base; /* cached framerate to detect changes for rate adjustment */
+- int i_frame_rate;
++ unsigned int i_frame_rate_base; /* cached framerate to detect changes for rate adjustment */
++ unsigned int i_frame_rate;
+
+ int next_phase_check; /* lowpass for phase check frequency */
+ int phase_offset; /* currently applied offset to presentation time in ns */
+ int layer; /* the dispman layer (z-index) used for video rendering */
++ bool transparent; // Do not disable layers beneath ours
+
+ bool need_configure_display; /* indicates a required display reconfigure to main thread */
+ bool adjust_refresh_rate;
+ bool native_interlaced;
+ bool b_top_field_first; /* cached interlaced settings to detect changes for native mode */
+ bool b_progressive;
+- bool opaque; /* indicated use of opaque picture format (zerocopy) */
+-};
++ bool force_config;
+
+-static const vlc_fourcc_t subpicture_chromas[] = {
+- VLC_CODEC_RGBA,
+- 0
+-};
++ vout_subpic_t subs[SUBS_MAX];
++ // Stash for subpics derived from the passed subpicture rather than
++ // included with the main pic
++ MMAL_BUFFER_HEADER_T * subpic_bufs[SUBS_MAX];
++
++ picture_pool_t * pic_pool;
++
++ struct vout_isp_conf_s {
++ MMAL_COMPONENT_T *component;
++ MMAL_PORT_T * input;
++ MMAL_PORT_T * output;
++ MMAL_QUEUE_T * out_q;
++ MMAL_POOL_T * in_pool;
++ MMAL_POOL_T * out_pool;
++ bool pending;
++ } isp;
+
+-/* Utility functions */
+-static inline uint32_t align(uint32_t x, uint32_t y);
+-static int configure_display(vout_display_t *vd, const vout_display_cfg_t *cfg,
+- const video_format_t *fmt);
++ MMAL_POOL_T * copy_pool;
++ MMAL_BUFFER_HEADER_T * copy_buf;
+
+-/* VLC vout display callbacks */
+-static picture_pool_t *vd_pool(vout_display_t *vd, unsigned count);
+-static void vd_prepare(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture);
+-static void vd_display(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture);
+-static int vd_control(vout_display_t *vd, int query, va_list args);
+-static void vd_manage(vout_display_t *vd);
+-
+-/* MMAL callbacks */
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer);
++ // Subpic blend if we have to do it here
++ vzc_pool_ctl_t * vzc;
++};
+
+-/* TV service */
+-static int query_resolution(vout_display_t *vd, unsigned *width, unsigned *height);
+-static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1,
+- uint32_t param2);
+-static void adjust_refresh_rate(vout_display_t *vd, const video_format_t *fmt);
+-static int set_latency_target(vout_display_t *vd, bool enable);
+
+-/* DispManX */
+-static void display_subpicture(vout_display_t *vd, subpicture_t *subpicture);
+-static void close_dmx(vout_display_t *vd);
+-static struct dmx_region_t *dmx_region_new(vout_display_t *vd,
+- DISPMANX_UPDATE_HANDLE_T update, subpicture_region_t *region);
+-static void dmx_region_update(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update, picture_t *picture);
+-static void dmx_region_delete(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update);
+-static void show_background(vout_display_t *vd, bool enable);
+-static void maintain_phase_sync(vout_display_t *vd);
++// ISP setup
+
+-static int Open(vlc_object_t *object)
++static inline bool want_isp(const vout_display_t * const vd)
+ {
+- vout_display_t *vd = (vout_display_t *)object;
+- vout_display_sys_t *sys;
+- uint32_t buffer_pitch, buffer_height;
+- vout_display_place_t place;
+- MMAL_DISPLAYREGION_T display_region;
+- MMAL_STATUS_T status;
+- int ret = VLC_SUCCESS;
+- unsigned i;
++ return (vd->fmt.i_chroma == VLC_CODEC_MMAL_ZC_SAND10);
++}
+
+- if (vout_display_IsWindowed(vd))
+- return VLC_EGENERIC;
++static inline bool want_copy(const vout_display_t * const vd)
++{
++ return (vd->fmt.i_chroma == VLC_CODEC_I420 || vd->fmt.i_chroma == VLC_CODEC_I420_10L);
++}
+
+- sys = calloc(1, sizeof(struct vout_display_sys_t));
+- if (!sys)
+- return VLC_ENOMEM;
+- vd->sys = sys;
++static inline vlc_fourcc_t req_chroma(const vout_display_t * const vd)
++{
++ return !hw_mmal_chroma_is_mmal(vd->fmt.i_chroma) && !want_copy(vd) ?
++ VLC_CODEC_I420 :
++ vd->fmt.i_chroma;
++}
+
+- sys->layer = var_InheritInteger(vd, MMAL_LAYER_NAME);
+- bcm_host_init();
++static MMAL_FOURCC_T vout_vlc_to_mmal_pic_fourcc(const unsigned int fcc)
++{ // Map a VLC chroma fourcc to an MMAL encoding; anything unrecognised maps to I420
++ switch (fcc){
++ case VLC_CODEC_MMAL_OPAQUE:
++ return MMAL_ENCODING_OPAQUE;
++ case VLC_CODEC_MMAL_ZC_SAND8:
++ return MMAL_ENCODING_YUVUV128;
++ case VLC_CODEC_MMAL_ZC_SAND10:
++ return MMAL_ENCODING_YUVUV64_10;
++ case VLC_CODEC_MMAL_ZC_SAND30:
++ return MMAL_ENCODING_YUV10_COL;
++ case VLC_CODEC_MMAL_ZC_I420: // Both I420 flavours share one MMAL encoding
++ case VLC_CODEC_I420:
++ return MMAL_ENCODING_I420;
++ default: // Unknown chroma: fall out to the I420 fallback below
++ break;
++ }
++ return MMAL_ENCODING_I420;
++}
+
+- sys->opaque = vd->fmt.i_chroma == VLC_CODEC_MMAL_OPAQUE;
++static void display_set_format(const vout_display_t * const vd, MMAL_ES_FORMAT_T *const es_fmt, const bool is_intermediate)
++{ // Fill es_fmt with a video format derived from vd->fmt; is_intermediate selects the I420 / visible-size variant
++ const unsigned int w = is_intermediate ? vd->fmt.i_visible_width : vd->fmt.i_width ;
++ const unsigned int h = is_intermediate ? vd->fmt.i_visible_height : vd->fmt.i_height;
++ MMAL_VIDEO_FORMAT_T * const v_fmt = &es_fmt->es->video;
+
+- status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
+- MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = is_intermediate ? MMAL_ENCODING_I420 : vout_vlc_to_mmal_pic_fourcc(vd->fmt.i_chroma); // intermediate format is always I420
++ es_fmt->encoding_variant = 0;
++
++ v_fmt->width = (w + 31) & ~31; // buffer width rounded up to a multiple of 32
++ v_fmt->height = (h + 15) & ~15; // buffer height rounded up to a multiple of 16
++ v_fmt->crop.x = 0;
++ v_fmt->crop.y = 0;
++ v_fmt->crop.width = w; // crop keeps the unrounded size
++ v_fmt->crop.height = h;
++ if (vd->fmt.i_sar_num == 0 || vd->fmt.i_sar_den == 0) { // SAR unset: fall back to 1:1
++ v_fmt->par.num = 1;
++ v_fmt->par.den = 1;
++ } else {
++ v_fmt->par.num = vd->fmt.i_sar_num;
++ v_fmt->par.den = vd->fmt.i_sar_den;
+ }
++ v_fmt->frame_rate.num = vd->fmt.i_frame_rate;
++ v_fmt->frame_rate.den = vd->fmt.i_frame_rate_base;
++ v_fmt->color_space = vlc_to_mmal_color_space(vd->fmt.space);
+
+- sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
+- status = mmal_port_enable(sys->component->control, control_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable control port %s (status=%"PRIx32" %s)",
+- sys->component->control->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++ msg_Dbg(vd, "WxH: %dx%d, Crop: %dx%d", v_fmt->width, v_fmt->height, v_fmt->crop.width, v_fmt->crop.height);
++}
++
++// Source rectangle for the display region: the crop described by src,
++// rescaled from src's full size to vd->fmt's full size.
++// The origin is forced to (0,0) when want_isp(vd) is true.
++static MMAL_RECT_T
++display_src_rect(const vout_display_t * const vd, const video_format_t * const src)
++{
++    const unsigned int dst_w = vd->fmt.i_width;
++    const unsigned int dst_h = vd->fmt.i_height;
++    unsigned int x = 0;
++    unsigned int y = 0;
++
++    if (!want_isp(vd)) {
++        x = src->i_x_offset * dst_w / src->i_width;
++        y = src->i_y_offset * dst_h / src->i_height;
++    }
++
++    const MMAL_RECT_T rect = {
++        .x = x,
++        .y = y,
++        .width = src->i_visible_width * dst_w / src->i_width,
++        .height = src->i_visible_height * dst_h / src->i_height
++    };
++    return rect;
++}
++
++static void isp_input_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{ // Callback given to mmal_port_enable() for the ISP input port: releases the buffer, tracing entry/exit when TRACE_ALL is set
++#if TRACE_ALL
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ pic_ctx_mmal_t * ctx = buf->user_data;
++ msg_Dbg(vd, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buf->cmd, ctx, buf,
++ buf->flags, (long long)buf->pts);
++#else
++ VLC_UNUSED(port); // port is only read by the trace code
++#endif
++
++ mmal_buffer_header_release(buf);
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++static void isp_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{ // Callback for the ISP control port: logs MMAL error events, then releases the buffer
++ vout_display_t *vd = (vout_display_t *)port->userdata;
++ MMAL_STATUS_T status;
++
++ if (buffer->cmd == MMAL_EVENT_ERROR) {
++ status = *(uint32_t *)buffer->data; // the event payload is read as the status code
++ msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
+ }
+
+- sys->input = sys->component->input[0];
+- sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++ mmal_buffer_header_release(buffer); // released whatever the event type
++}
+
+- if (sys->opaque) {
+- sys->input->format->encoding = MMAL_ENCODING_OPAQUE;
+- sys->i_planes = 1;
+- sys->buffer_size = sys->input->buffer_size_recommended;
+- } else {
+- sys->input->format->encoding = MMAL_ENCODING_I420;
+- vd->fmt.i_chroma = VLC_CODEC_I420;
+- buffer_pitch = align(vd->fmt.i_width, 32);
+- buffer_height = align(vd->fmt.i_height, 16);
+- sys->i_planes = 3;
+- sys->buffer_size = 3 * buffer_pitch * buffer_height / 2;
+- }
+-
+- sys->input->format->es->video.width = vd->fmt.i_width;
+- sys->input->format->es->video.height = vd->fmt.i_height;
+- sys->input->format->es->video.crop.x = 0;
+- sys->input->format->es->video.crop.y = 0;
+- sys->input->format->es->video.crop.width = vd->fmt.i_width;
+- sys->input->format->es->video.crop.height = vd->fmt.i_height;
+- sys->input->format->es->video.par.num = vd->source.i_sar_num;
+- sys->input->format->es->video.par.den = vd->source.i_sar_den;
++static void isp_output_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{ // Callback for the ISP output port: queues buffers that carry data on isp->out_q, releases everything else
++ if (buf->cmd == 0 && buf->length != 0) // a data buffer (not an event) with content
++ {
++ // The filter structure etc. should always exist if we have contents
++ // but might not on later flushes as we shut down
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ struct vout_isp_conf_s *const isp = &vd->sys->isp;
+
+- status = mmal_port_format_commit(sys->input);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s: cmd=%d; flags=%#x, pts=%lld", __func__, buf->cmd, buf->flags, (long long) buf->pts);
++#endif
++ mmal_queue_put(isp->out_q, buf); // vd_display() takes it from this queue
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s: out Q len=%d", __func__, mmal_queue_length(isp->out_q));
++#endif
+ }
+- sys->input->buffer_size = sys->input->buffer_size_recommended;
++ else
++ {
++ mmal_buffer_header_reset(buf); // event or empty buffer: reset it and hand it back
++ mmal_buffer_header_release(buf);
++ }
++}
+
+- vout_display_PlacePicture(&place, &vd->source, vd->cfg, false);
+- display_region.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
+- display_region.hdr.size = sizeof(MMAL_DISPLAYREGION_T);
+- display_region.fullscreen = MMAL_FALSE;
+- display_region.src_rect.x = vd->fmt.i_x_offset;
+- display_region.src_rect.y = vd->fmt.i_y_offset;
+- display_region.src_rect.width = vd->fmt.i_visible_width;
+- display_region.src_rect.height = vd->fmt.i_visible_height;
+- display_region.dest_rect.x = place.x;
+- display_region.dest_rect.y = place.y;
+- display_region.dest_rect.width = place.width;
+- display_region.dest_rect.height = place.height;
+- display_region.layer = sys->layer;
+- display_region.set = MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_SRC_RECT |
+- MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_LAYER;
+- status = mmal_port_parameter_set(sys->input, &display_region.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
+- ret = VLC_EGENERIC;
+- goto out;
++// Release every buffer still sitting on the ISP output queue.
++// May run during error recovery, so a queue that was never created is
++// tolerated.
++static void isp_empty_out_q(struct vout_isp_conf_s * const isp)
++{
++    if (isp->out_q != NULL) {
++        for (;;) {
++            MMAL_BUFFER_HEADER_T * const queued = mmal_queue_get(isp->out_q);
++            if (queued == NULL)
++                break;
++            mmal_buffer_header_release(queued);
++        }
++    }
++}
++
++// Stop the ISP: disable whichever of its input/output ports are currently
++// enabled, drop anything still waiting on the output queue and clear the
++// pending flag. isp_prepare() re-enables both ports.
++static void isp_flush(struct vout_isp_conf_s * const isp)
++{
++    // Only an enabled port needs disabling. Testing for "not enabled" here
++    // would leave a running input port untouched.
++    if (isp->input->is_enabled)
++        mmal_port_disable(isp->input);
++
++    if (isp->output->is_enabled)
++        mmal_port_disable(isp->output);
++
++    isp_empty_out_q(isp);
++    isp->pending = false;
++}
++
++static MMAL_STATUS_T isp_prepare(vout_display_t * const vd, struct vout_isp_conf_s * const isp)
++{ // Enable the ISP output port if needed, send it every buffer available in out_pool, then enable the input port if needed; returns the first MMAL error or MMAL_SUCCESS
++ MMAL_STATUS_T err;
++ MMAL_BUFFER_HEADER_T * buf;
++
++ if (!isp->output->is_enabled) {
++ if ((err = mmal_port_enable(isp->output, isp_output_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "ISP output port enable failed");
++ return err;
++ }
+ }
+
+- for (i = 0; i < sys->i_planes; ++i) {
+- sys->planes[i].i_lines = buffer_height;
+- sys->planes[i].i_pitch = buffer_pitch;
+- sys->planes[i].i_visible_lines = vd->fmt.i_visible_height;
+- sys->planes[i].i_visible_pitch = vd->fmt.i_visible_width;
++ while ((buf = mmal_queue_get(isp->out_pool->queue)) != NULL) { // drain the pool queue into the output port
++ if ((err = mmal_port_send_buffer(isp->output, buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "ISP output port stuff failed");
++ return err; // NOTE(review): buf is not released on this path - confirm the caller's cleanup covers it
++ }
++ }
+
+- if (i > 0) {
+- sys->planes[i].i_lines /= 2;
+- sys->planes[i].i_pitch /= 2;
+- sys->planes[i].i_visible_lines /= 2;
+- sys->planes[i].i_visible_pitch /= 2;
++ if (!isp->input->is_enabled) {
++ if ((err = mmal_port_enable(isp->input, isp_input_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "ISP input port enable failed");
++ return err;
+ }
+ }
++ return MMAL_SUCCESS;
++}
+
+- vlc_mutex_init(&sys->buffer_mutex);
+- vlc_cond_init(&sys->buffer_cond);
+- vlc_mutex_init(&sys->manage_mutex);
++// Tear down the ISP component and everything isp_setup() created for it:
++// ports are flushed, the control port disabled, the output queue and both
++// buffer pools destroyed, and the component released. Does nothing when no
++// component exists. All pointers are reset so the ISP can be set up again.
++static void isp_close(vout_display_t * const vd, vout_display_sys_t * const vd_sys)
++{
++ struct vout_isp_conf_s * const isp = &vd_sys->isp;
++ VLC_UNUSED(vd);
+
+- vd->pool = vd_pool;
+- vd->prepare = vd_prepare;
+- vd->display = vd_display;
+- vd->control = vd_control;
+- vd->manage = vd_manage;
++ if (isp->component == NULL)
++ return;
+
+- vc_tv_register_callback(tvservice_cb, vd);
++ isp_flush(isp);
+
+- if (query_resolution(vd, &sys->display_width, &sys->display_height) >= 0) {
+- vout_display_SendEventDisplaySize(vd, sys->display_width, sys->display_height);
+- } else {
+- sys->display_width = vd->cfg->display.width;
+- sys->display_height = vd->cfg->display.height;
++ if (isp->component->control->is_enabled)
++ mmal_port_disable(isp->component->control);
++
++ if (isp->out_q != NULL) {
++ // 1st junk anything lying around
++ isp_empty_out_q(isp);
++
++ mmal_queue_destroy(isp->out_q);
++ isp->out_q = NULL;
+ }
+
+- sys->dmx_handle = vc_dispmanx_display_open(0);
+- vd->info.subpicture_chromas = subpicture_chromas;
++ if (isp->out_pool != NULL) {
++ mmal_port_pool_destroy(isp->output, isp->out_pool);
++ isp->out_pool = NULL;
++ }
++
++ // in_pool is created with mmal_pool_create() in isp_setup() and must be
++ // freed here too. Make sure the input port is disabled first so that it
++ // is no longer holding any header from the pool.
++ if (isp->in_pool != NULL) {
++ if (isp->input->is_enabled) {
++ mmal_port_disable(isp->input);
++ }
++ mmal_pool_destroy(isp->in_pool);
++ isp->in_pool = NULL;
++ }
+
+- vout_display_DeleteWindow(vd, NULL);
++ isp->input = NULL;
++ isp->output = NULL;
+
+-out:
+- if (ret != VLC_SUCCESS)
+- Close(object);
++ mmal_component_release(isp->component);
++ isp->component = NULL;
+
+- return ret;
++ return;
+ }
+
+-static void Close(vlc_object_t *object)
++// Restuff into output rather than return to pool if we can
++static MMAL_BOOL_T isp_out_pool_cb(MMAL_POOL_T *pool, MMAL_BUFFER_HEADER_T *buffer, void *userdata) // release callback installed on out_pool by isp_setup(); userdata is the vout_isp_conf_s
+ {
+- vout_display_t *vd = (vout_display_t *)object;
+- vout_display_sys_t *sys = vd->sys;
+- char response[20]; /* answer is hvs_update_fields=%1d */
+- unsigned i;
++ struct vout_isp_conf_s * const isp = userdata;
++ VLC_UNUSED(pool);
++ if (isp->output->is_enabled) {
++ mmal_buffer_header_reset(buffer);
++ if (mmal_port_send_buffer(isp->output, buffer) == MMAL_SUCCESS)
++ return MMAL_FALSE; // buffer went straight back to the output port
++ }
++ return MMAL_TRUE; // port disabled or send failed: let the buffer go back to the pool
++}
+
+- vc_tv_unregister_callback_full(tvservice_cb, vd);
++// Create and configure the ISP resizer component: control port, input port
++// format + header pool, output queue, output port format + buffer pool,
++// then start it with isp_prepare().
++// Returns MMAL_SUCCESS, or an MMAL error after undoing everything through
++// isp_close(). Allocation failures are reported as MMAL_ENOMEM.
++static MMAL_STATUS_T isp_setup(vout_display_t * const vd, vout_display_sys_t * const vd_sys)
++{
++ struct vout_isp_conf_s * const isp = &vd_sys->isp;
++ MMAL_STATUS_T err;
+
+- if (sys->dmx_handle)
+- close_dmx(vd);
++ if ((err = mmal_component_create(MMAL_COMPONENT_ISP_RESIZER, &isp->component)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Cannot create ISP component");
++ return err;
++ }
++ isp->input = isp->component->input[0];
++ isp->output = isp->component->output[0];
+
+- if (sys->component && sys->component->control->is_enabled)
+- mmal_port_disable(sys->component->control);
++ isp->component->control->userdata = (void *)vd;
++ if ((err = mmal_port_enable(isp->component->control, isp_control_port_cb)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable ISP control port");
++ goto fail;
++ }
+
+- if (sys->input && sys->input->is_enabled)
+- mmal_port_disable(sys->input);
++ isp->input->userdata = (void *)vd;
++ display_set_format(vd, isp->input->format, false);
+
+- if (sys->component && sys->component->is_enabled)
+- mmal_component_disable(sys->component);
++ if ((err = port_parameter_set_bool(isp->input, MMAL_PARAMETER_ZERO_COPY, true)) != MMAL_SUCCESS)
++ goto fail;
+
+- if (sys->pool)
+- mmal_port_pool_destroy(sys->input, sys->pool);
++ if ((err = mmal_port_format_commit(isp->input)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set ISP input format");
++ goto fail;
++ }
+
+- if (sys->component)
+- mmal_component_release(sys->component);
++ isp->input->buffer_size = isp->input->buffer_size_recommended;
++ isp->input->buffer_num = 30;
+
+- if (sys->picture_pool)
+- picture_pool_Release(sys->picture_pool);
+- else
+- for (i = 0; i < sys->num_buffers; ++i)
+- if (sys->pictures[i]) {
+- mmal_buffer_header_release(sys->pictures[i]->p_sys->buffer);
+- picture_Release(sys->pictures[i]);
+- }
++ if ((isp->in_pool = mmal_pool_create(isp->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(vd, "Failed to create input pool");
++ // err still holds MMAL_SUCCESS from the commit above
++ err = MMAL_ENOMEM;
++ goto fail;
++ }
+
+- vlc_mutex_destroy(&sys->buffer_mutex);
+- vlc_cond_destroy(&sys->buffer_cond);
+- vlc_mutex_destroy(&sys->manage_mutex);
++ if ((isp->out_q = mmal_queue_create()) == NULL)
++ {
++ err = MMAL_ENOMEM;
++ goto fail;
++ }
+
+- if (sys->native_interlaced) {
+- if (vc_gencmd(response, sizeof(response), "hvs_update_fields 0") < 0 ||
+- response[18] != '0')
+- msg_Warn(vd, "Could not reset hvs field mode");
++ display_set_format(vd, isp->output->format, true);
++
++ if ((err = port_parameter_set_bool(isp->output, MMAL_PARAMETER_ZERO_COPY, true)) != MMAL_SUCCESS)
++ goto fail;
++
++ if ((err = mmal_port_format_commit(isp->output)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set ISP output format");
++ goto fail;
+ }
+
+- free(sys->pictures);
+- free(sys);
++ isp->output->buffer_size = isp->output->buffer_size_recommended;
++ isp->output->buffer_num = 2;
++ isp->output->userdata = (void *)vd;
++
++ if ((isp->out_pool = mmal_port_pool_create(isp->output, isp->output->buffer_num, isp->output->buffer_size)) == NULL)
++ {
++ msg_Err(vd, "Failed to make ISP port pool");
++ // err still holds MMAL_SUCCESS from the commit above
++ err = MMAL_ENOMEM;
++ goto fail;
++ }
++
++ mmal_pool_callback_set(isp->out_pool, isp_out_pool_cb, isp);
++
++ if ((err = isp_prepare(vd, isp)) != MMAL_SUCCESS)
++ goto fail;
++
++ return MMAL_SUCCESS;
+
+- bcm_host_deinit();
++fail:
++ isp_close(vd, vd_sys);
++ return err;
+ }
+
+-static inline uint32_t align(uint32_t x, uint32_t y) {
+- uint32_t mod = x % y;
+- if (mod == 0)
+- return x;
++static MMAL_STATUS_T isp_check(vout_display_t * const vd, vout_display_sys_t * const vd_sys)
++{ // Bring the ISP in line with want_isp(vd): set it up when wanted but absent, flush (and close once idle) when present but unwanted
++ struct vout_isp_conf_s *const isp = &vd_sys->isp;
++ const bool has_isp = (isp->component != NULL); // a component pointer means the ISP is set up
++ const bool wants_isp = want_isp(vd);
++
++ if (has_isp == wants_isp)
++ {
++ // All OK - do nothing
++ }
++ else if (has_isp)
++ {
++ // ISP active but we don't want it
++ isp_flush(isp);
++
++ // Check we have everything back and then kill it
++ if (mmal_queue_length(isp->out_pool->queue) == isp->output->buffer_num) // otherwise the ISP stays open and this path runs again on a later call
++ isp_close(vd, vd_sys);
++ }
+ else
+- return x + y - mod;
++ {
++ // ISP closed but we want it
++ return isp_setup(vd, vd_sys); // the only path that can return an error
++ }
++
++ return MMAL_SUCCESS;
++}
++
++/* TV service */
++static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1,
++ uint32_t param2);
++static void adjust_refresh_rate(vout_display_t *vd, const video_format_t *fmt);
++static int set_latency_target(vout_display_t *vd, bool enable);
++
++// Mmal
++static void maintain_phase_sync(vout_display_t *vd);
++
++
++
++static void vd_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf)
++{ // Callback given to mmal_port_enable() for sys->input in vd_display(): releases the buffer, tracing entry/exit when TRACE_ALL is set
++#if TRACE_ALL
++ vout_display_t * const vd = (vout_display_t *)port->userdata;
++ pic_ctx_mmal_t * ctx = buf->user_data;
++ msg_Dbg(vd, "<<< %s: cmd=%d, ctx=%p, buf=%p, flags=%#x, pts=%lld", __func__, buf->cmd, ctx, buf,
++ buf->flags, (long long)buf->pts);
++#else
++ VLC_UNUSED(port); // port is only read by the trace code
++#endif
++
++ mmal_buffer_header_release(buf);
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++// Ask the TV service for the state of display display_id and report its
++// size. Returns 0 with *width / *height filled in from the HDMI or SDTV
++// state; returns -1 (outputs untouched) when the query fails or neither
++// state is flagged.
++static int query_resolution(vout_display_t *vd, const int display_id, unsigned *width, unsigned *height)
++{
++    TV_DISPLAY_STATE_T display_state = {0};
++
++    if (vc_tv_get_display_state_id(display_id, &display_state) != 0) {
++        msg_Warn(vd, "Failed to query display resolution");
++        return -1;
++    }
++
++    msg_Dbg(vd, "State=%#x", display_state.state);
++
++    // Low byte of the state selects HDMI, the next byte SDTV
++    if ((display_state.state & 0xFF) != 0) {
++        msg_Dbg(vd, "HDMI: %dx%d", display_state.display.hdmi.width, display_state.display.hdmi.height);
++        *width = display_state.display.hdmi.width;
++        *height = display_state.display.hdmi.height;
++        return 0;
++    }
++
++    if ((display_state.state & 0xFF00) != 0) {
++        msg_Dbg(vd, "SDTV: %dx%d", display_state.display.sdtv.width, display_state.display.sdtv.height);
++        *width = display_state.display.sdtv.width;
++        *height = display_state.display.sdtv.height;
++        return 0;
++    }
++
++    msg_Warn(vd, "Invalid display state %"PRIx32, display_state.state);
++    return -1;
++}
++
++// Convert a VLC display placement into an MMAL rectangle, field by field.
++static inline MMAL_RECT_T
++place_to_mmal_rect(const vout_display_place_t place)
++{
++    const MMAL_RECT_T rect = {
++        .x = place.x,
++        .y = place.y,
++        .width = place.width,
++        .height = place.height
++    };
++    return rect;
++}
++
++// Work out where the picture described by fmt lands on the display.
++// An unknown SAR is treated as 1:1. When r has a non-zero size it replaces
++// the display size from cfg, and the result is offset by r's origin.
++static MMAL_RECT_T
++place_out(const vout_display_cfg_t * cfg,
++ const video_format_t * fmt,
++ const MMAL_RECT_T r)
++{
++    video_format_t fixed_fmt;
++    vout_display_cfg_t sized_cfg;
++    vout_display_place_t placed;
++    const bool sar_unknown = (fmt->i_sar_num == 0 || fmt->i_sar_den == 0);
++    const bool have_rect = !(r.width == 0 || r.height == 0);
++
++    // Work on private copies so the caller's fmt / cfg are never modified
++    if (sar_unknown) {
++        fixed_fmt = *fmt;
++        fixed_fmt.i_sar_num = 1;
++        fixed_fmt.i_sar_den = 1;
++        fmt = &fixed_fmt;
++    }
++
++    // A known rectangle overrides what VLC believes the display size to be
++    if (have_rect) {
++        sized_cfg = *cfg;
++        sized_cfg.display.width = r.width;
++        sized_cfg.display.height = r.height;
++        cfg = &sized_cfg;
++    }
++
++    vout_display_PlacePicture(&placed, fmt, cfg, false);
++
++    placed.x += r.x;
++    placed.y += r.y;
++
++    return place_to_mmal_rect(placed);
++}
++
++// Apply display transform t to rectangle s: transpose first, then the
++// horizontal and vertical flips, each flip taken relative to c.
++static MMAL_RECT_T
++rect_transform(MMAL_RECT_T s, const MMAL_RECT_T c, const MMAL_DISPLAYTRANSFORM_T t)
++{
++    MMAL_RECT_T out = is_transform_transpose(t) ? rect_transpose(s) : s;
++
++    if (is_transform_hflip(t))
++        out = rect_hflip(out, c);
++
++    if (is_transform_vflip(t))
++        out = rect_vflip(out, c);
++
++    return out;
++}
++
++// Compute sys->dest_rect: the picture placed inside sys->win_rect, then
++// passed through sys->dest_transform relative to sys->display_rect.
++static void
++place_dest_rect(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{
++    vout_display_sys_t * const sys = vd->sys;
++    const MMAL_RECT_T placed = place_out(cfg, fmt, sys->win_rect);
++
++    sys->dest_rect = rect_transform(placed, sys->display_rect, sys->dest_transform);
++}
++
++// Compute sys->spu_rect, the rectangle used for subpictures.
++// It starts as the placed picture size with the origin at (0,0); when the
++// picture has at least as many pixels as that rectangle, the visible
++// picture size is used instead. Swapped orientations are transposed.
++static void
++place_spu_rect(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{
++    vout_display_sys_t * const sys = vd->sys;
++    const MMAL_RECT_T no_override = {0};
++    MMAL_RECT_T spu = place_out(cfg, fmt, no_override);
++
++    spu.x = 0;
++    spu.y = 0;
++
++    // Copy place override logic for spu pos from video_output.c
++    // This info doesn't appear to reside anywhere natively
++    const unsigned int pic_area = fmt->i_width * fmt->i_height;
++    const unsigned int spu_area = (unsigned int)(spu.width * spu.height);
++
++    if (pic_area >= spu_area) {
++        spu.width = fmt->i_visible_width;
++        spu.height = fmt->i_visible_height;
++    }
++
++    if (ORIENT_IS_SWAP(fmt->orientation))
++        spu = rect_transpose(spu);
++
++    sys->spu_rect = spu;
++}
++
++static void
++place_rects(vout_display_t * const vd,
++ const vout_display_cfg_t * const cfg,
++ const video_format_t * fmt)
++{ // Recompute both placement rectangles kept in vd->sys
++ place_dest_rect(vd, cfg, fmt); // updates sys->dest_rect
++ place_spu_rect(vd, cfg, fmt); // updates sys->spu_rect
++}
++
++static int
++set_input_region(vout_display_t * const vd, const video_format_t * const fmt) // Send the display region built from vd->sys and fmt to sys->input; returns 0, or -EINVAL if MMAL rejects it
++{
++ const vout_display_sys_t * const sys = vd->sys;
++ MMAL_DISPLAYREGION_T display_region = {
++ .hdr = {
++ .id = MMAL_PARAMETER_DISPLAYREGION,
++ .size = sizeof(MMAL_DISPLAYREGION_T)
++ },
++ .display_num = sys->display_id,
++ .fullscreen = MMAL_FALSE,
++ .transform = sys->video_transform,
++ .dest_rect = sys->dest_rect,
++ .src_rect = display_src_rect(vd, fmt),
++ .noaspect = MMAL_TRUE,
++ .mode = MMAL_DISPLAY_MODE_FILL,
++ .layer = sys->layer,
++ .alpha = 0xff | (sys->transparent ? 0 : (1 << 29)), // bit 29 is set unless sys->transparent
++ .set = // one flag per field filled in above
++ MMAL_DISPLAY_SET_NUM |
++ MMAL_DISPLAY_SET_FULLSCREEN |
++ MMAL_DISPLAY_SET_TRANSFORM |
++ MMAL_DISPLAY_SET_DEST_RECT |
++ MMAL_DISPLAY_SET_SRC_RECT |
++ MMAL_DISPLAY_SET_NOASPECT |
++ MMAL_DISPLAY_SET_MODE |
++ MMAL_DISPLAY_SET_LAYER |
++ MMAL_DISPLAY_SET_ALPHA
++ };
++ MMAL_STATUS_T status = mmal_port_parameter_set(sys->input, &display_region.hdr);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
++ status, mmal_status_to_string(status));
++ return -EINVAL;
++ }
++ return 0;
+ }
+
+ static int configure_display(vout_display_t *vd, const vout_display_cfg_t *cfg,
+ const video_format_t *fmt)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- vout_display_place_t place;
+- MMAL_DISPLAYREGION_T display_region;
++ vout_display_sys_t * const sys = vd->sys;
+ MMAL_STATUS_T status;
+
+ if (!cfg && !fmt)
++ {
++ msg_Err(vd, "%s: Missing cfg & fmt", __func__);
+ return -EINVAL;
++ }
++
++ isp_check(vd, sys);
+
+ if (fmt) {
+ sys->input->format->es->video.par.num = fmt->i_sar_num;
+@@ -412,30 +733,14 @@ static int configure_display(vout_displa
+ if (!cfg)
+ cfg = vd->cfg;
+
+- vout_display_PlacePicture(&place, fmt, cfg, false);
++ sys->video_transform = combine_transform(
++ vlc_to_mmal_transform(fmt->orientation), sys->display_transform);
+
+- display_region.hdr.id = MMAL_PARAMETER_DISPLAYREGION;
+- display_region.hdr.size = sizeof(MMAL_DISPLAYREGION_T);
+- display_region.fullscreen = MMAL_FALSE;
+- display_region.src_rect.x = fmt->i_x_offset;
+- display_region.src_rect.y = fmt->i_y_offset;
+- display_region.src_rect.width = fmt->i_visible_width;
+- display_region.src_rect.height = fmt->i_visible_height;
+- display_region.dest_rect.x = place.x;
+- display_region.dest_rect.y = place.y;
+- display_region.dest_rect.width = place.width;
+- display_region.dest_rect.height = place.height;
+- display_region.layer = sys->layer;
+- display_region.set = MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_SRC_RECT |
+- MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_LAYER;
+- status = mmal_port_parameter_set(sys->input, &display_region.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set display region (status=%"PRIx32" %s)",
+- status, mmal_status_to_string(status));
++ place_rects(vd, cfg, fmt);
++
++ if (set_input_region(vd, fmt) != 0)
+ return -EINVAL;
+- }
+
+- show_background(vd, var_InheritBool(vd, MMAL_BLANK_BACKGROUND_NAME));
+ sys->adjust_refresh_rate = var_InheritBool(vd, MMAL_ADJUST_REFRESHRATE_NAME);
+ sys->native_interlaced = var_InheritBool(vd, MMAL_NATIVE_INTERLACED);
+ if (sys->adjust_refresh_rate) {
+@@ -446,204 +751,217 @@ static int configure_display(vout_displa
+ return 0;
+ }
+
++// Drop the display's picture pool, if there is one, and forget the pointer
++// so that vd_pool() builds a fresh pool on its next call.
++static void kill_pool(vout_display_sys_t * const sys)
++{
++    if (sys->pic_pool == NULL)
++        return;
++
++    picture_pool_Release(sys->pic_pool);
++    sys->pic_pool = NULL;
++}
++
++// Actual picture pool for MMAL opaques is just a set of trivial containers
+ static picture_pool_t *vd_pool(vout_display_t *vd, unsigned count)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- picture_resource_t picture_res;
+- picture_pool_configuration_t picture_pool_cfg;
+- video_format_t fmt = vd->fmt;
+- MMAL_STATUS_T status;
+- unsigned i;
++ vout_display_sys_t * const sys = vd->sys;
+
+- if (sys->picture_pool) {
+- if (sys->num_buffers < count)
+- msg_Warn(vd, "Picture pool with %u pictures requested, but we already have one with %u pictures",
+- count, sys->num_buffers);
++ msg_Dbg(vd, "%s: fmt:%dx%d,sar:%d/%d; source:%dx%d", __func__,
++ vd->fmt.i_width, vd->fmt.i_height, vd->fmt.i_sar_num, vd->fmt.i_sar_den, vd->source.i_width, vd->source.i_height);
+
+- goto out;
++ if (sys->pic_pool == NULL) { // created on first request; count is ignored once a pool exists
++ sys->pic_pool = picture_pool_NewFromFormat(&vd->fmt, count);
+ }
++ return sys->pic_pool; // may be NULL if pool creation failed
++}
+
+- if (sys->opaque) {
+- if (count <= NUM_ACTUAL_OPAQUE_BUFFERS)
+- count = NUM_ACTUAL_OPAQUE_BUFFERS;
++// True when the picture's width and height both equal those of vd->fmt.
++static inline bool
++check_shape(vout_display_t * const vd, const picture_t * const p_pic)
++{
++    const video_format_t * const pic_fmt = &p_pic->format;
++
++    return vd->fmt.i_width == pic_fmt->i_width &&
++           vd->fmt.i_height == pic_fmt->i_height;
++}
+
+- MMAL_PARAMETER_BOOLEAN_T zero_copy = {
+- { MMAL_PARAMETER_ZERO_COPY, sizeof(MMAL_PARAMETER_BOOLEAN_T) },
+- 1
+- };
++static void vd_display(vout_display_t *vd, picture_t *p_pic,
++ subpicture_t *subpicture)
++{ // Send one picture (pending copy buffer, pending ISP output, or a replicated buffer of p_pic) to sys->input, update the subpictures, then release p_pic
++ vout_display_sys_t * const sys = vd->sys;
++ MMAL_STATUS_T err;
+
+- status = mmal_port_parameter_set(sys->input, &zero_copy.hdr);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
+- }
++#if TRACE_ALL
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, "<<< %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d -> %dx%d@%d,%d", __func__,
++ str_fourcc(dbuf0, p_pic->format.i_chroma), p_pic->format.i_width, p_pic->format.i_height,
++ p_pic->format.i_x_offset, p_pic->format.i_y_offset,
++ p_pic->format.i_visible_width, p_pic->format.i_visible_height,
++ p_pic->format.i_sar_num, p_pic->format.i_sar_den,
++ sys->dest_rect.width, sys->dest_rect.height, sys->dest_rect.x, sys->dest_rect.y);
+ }
+-
+- if (count < sys->input->buffer_num_recommended)
+- count = sys->input->buffer_num_recommended;
+-
+-#ifndef NDEBUG
+- msg_Dbg(vd, "Creating picture pool with %u pictures", count);
+ #endif
+
+- sys->input->buffer_num = count;
+- status = mmal_port_enable(sys->input, input_port_cb);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable input port %s (status=%"PRIx32" %s)",
+- sys->input->name, status, mmal_status_to_string(status));
+- goto out;
++ // If we had subpics then we have attached them to the main pic in prepare
++ // so all we have to do here is delete the refs
++ if (subpicture != NULL) {
++ subpicture_Delete(subpicture);
+ }
+
+- status = mmal_component_enable(sys->component);
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to enable component %s (status=%"PRIx32" %s)",
+- sys->component->name, status, mmal_status_to_string(status));
+- goto out;
++ if (!check_shape(vd, p_pic)) // picture size must match vd->fmt
++ {
++ msg_Err(vd, "Pic/fmt shape mismatch");
++ goto fail;
++ }
++
++ if (!sys->input->is_enabled && // (re-)enable the input port if it is currently disabled
++ (err = mmal_port_enable(sys->input, vd_input_port_cb)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Input port enable failed");
++ goto fail;
++ }
++ // Stuff into input
++ // We assume the BH is already set up with values reflecting pic date etc.
++ if (sys->copy_buf != NULL) { // a copy buffer is pending: it is sent instead of p_pic's own buffer
++ MMAL_BUFFER_HEADER_T *const buf = sys->copy_buf;
++ sys->copy_buf = NULL;
++#if TRACE_ALL
++ msg_Dbg(vd, "--- %s: Copy stuff", __func__);
++#endif
++ if (mmal_port_send_buffer(sys->input, buf) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(buf);
++ msg_Err(vd, "Send copy buffer to render input failed");
++ goto fail;
++ }
+ }
+-
+- sys->num_buffers = count;
+- sys->pool = mmal_port_pool_create(sys->input, sys->num_buffers,
+- sys->input->buffer_size);
+- if (!sys->pool) {
+- msg_Err(vd, "Failed to create MMAL pool for %u buffers of size %"PRIu32,
+- count, sys->input->buffer_size);
+- goto out;
++ else if (sys->isp.pending) { // an ISP output frame is expected
++ MMAL_BUFFER_HEADER_T *const buf = mmal_queue_wait(sys->isp.out_q); // taken from the queue filled by isp_output_cb()
++ sys->isp.pending = false;
++#if TRACE_ALL
++ msg_Dbg(vd, "--- %s: ISP stuff", __func__);
++#endif
++ if (mmal_port_send_buffer(sys->input, buf) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(buf);
++ msg_Err(vd, "Send ISP buffer to render input failed");
++ goto fail;
++ }
+ }
+-
+- memset(&picture_res, 0, sizeof(picture_resource_t));
+- sys->pictures = calloc(sys->num_buffers, sizeof(picture_t *));
+- for (i = 0; i < sys->num_buffers; ++i) {
+- picture_res.p_sys = calloc(1, sizeof(picture_sys_t));
+- picture_res.p_sys->owner = (vlc_object_t *)vd;
+- picture_res.p_sys->buffer = mmal_queue_get(sys->pool->queue);
+-
+- sys->pictures[i] = picture_NewFromResource(&fmt, &picture_res);
+- if (!sys->pictures[i]) {
+- msg_Err(vd, "Failed to create picture");
+- free(picture_res.p_sys);
+- goto out;
++ else
++ {
++ MMAL_BUFFER_HEADER_T *const pic_buf = hw_mmal_pic_buf_replicated(p_pic, sys->pool);
++ if (pic_buf == NULL)
++ {
++ msg_Err(vd, "Replicated buffer get fail");
++ goto fail;
+ }
+
+- sys->pictures[i]->i_planes = sys->i_planes;
+- memcpy(sys->pictures[i]->p, sys->planes, sys->i_planes * sizeof(plane_t));
+- }
+
+- memset(&picture_pool_cfg, 0, sizeof(picture_pool_configuration_t));
+- picture_pool_cfg.picture_count = sys->num_buffers;
+- picture_pool_cfg.picture = sys->pictures;
+- picture_pool_cfg.lock = mmal_picture_lock;
++ // If dimensions have changed then fix that
++ if (hw_mmal_vlc_pic_to_mmal_fmt_update(sys->input->format, p_pic))
++ {
++ msg_Dbg(vd, "Reset port format");
++
++ // HVS can deal with on-line dimension changes
++ if (mmal_port_format_commit(sys->input) != MMAL_SUCCESS)
++ msg_Warn(vd, "Input format commit failed");
++ }
+
+- sys->picture_pool = picture_pool_NewExtended(&picture_pool_cfg);
+- if (!sys->picture_pool) {
+- msg_Err(vd, "Failed to create picture pool");
+- goto out;
++ if ((err = mmal_port_send_buffer(sys->input, pic_buf)) != MMAL_SUCCESS)
++ {
++ mmal_buffer_header_release(pic_buf);
++ msg_Err(vd, "Send buffer to input failed");
++ goto fail;
++ }
+ }
+
+-out:
+- return sys->picture_pool;
+-}
+-
+-static void vd_prepare(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- picture_sys_t *pic_sys = picture->p_sys;
+-
+- if (!sys->adjust_refresh_rate || pic_sys->displayed)
+- return;
+-
+- /* Apply the required phase_offset to the picture, so that vd_display()
+- * will be called at the corrected time from the core */
+- picture->date += sys->phase_offset;
+-}
+-
+-static void vd_display(vout_display_t *vd, picture_t *picture,
+- subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- picture_sys_t *pic_sys = picture->p_sys;
+- MMAL_BUFFER_HEADER_T *buffer = pic_sys->buffer;
+- MMAL_STATUS_T status;
+-
+- if (picture->format.i_frame_rate != sys->i_frame_rate ||
+- picture->format.i_frame_rate_base != sys->i_frame_rate_base ||
+- picture->b_progressive != sys->b_progressive ||
+- picture->b_top_field_first != sys->b_top_field_first) {
+- sys->b_top_field_first = picture->b_top_field_first;
+- sys->b_progressive = picture->b_progressive;
+- sys->i_frame_rate = picture->format.i_frame_rate;
+- sys->i_frame_rate_base = picture->format.i_frame_rate_base;
+- configure_display(vd, NULL, &picture->format);
+- }
+-
+- if (!pic_sys->displayed || !sys->opaque) {
+- buffer->cmd = 0;
+- buffer->length = sys->input->buffer_size;
+- buffer->user_data = picture;
+-
+- status = mmal_port_send_buffer(sys->input, buffer);
+- if (status == MMAL_SUCCESS)
+- atomic_fetch_add(&sys->buffers_in_transit, 1);
+-
+- if (status != MMAL_SUCCESS) {
+- msg_Err(vd, "Failed to send buffer to input port. Frame dropped");
+- picture_Release(picture);
++ {
++ unsigned int sub_no = 0;
++ MMAL_BUFFER_HEADER_T **psub_bufs2 = sys->subpic_bufs;
++ const bool is_mmal_pic = hw_mmal_pic_is_mmal(p_pic);
++
++ for (sub_no = 0; sub_no != SUBS_MAX; ++sub_no) {
++ int rv;
++ MMAL_BUFFER_HEADER_T * const sub_buf = !is_mmal_pic ? NULL :
++ hw_mmal_pic_sub_buf_get(p_pic, sub_no);
++
++ if ((rv = hw_mmal_subpic_update(VLC_OBJECT(vd),
++ sub_buf != NULL ? sub_buf : *psub_bufs2++, // fall back to the next entry of sys->subpic_bufs when the picture carries no sub buffer
++ &sys->subs[sub_no].sub,
++ &p_pic->format,
++ &sys->dest_rect,
++ sys->display_transform,
++ p_pic->date)) == 0)
++ break;
++ else if (rv < 0)
++ goto fail;
+ }
+-
+- pic_sys->displayed = true;
+- } else {
+- picture_Release(picture);
+ }
+
+- display_subpicture(vd, subpicture);
++fail: // reached on success too: everything below is common cleanup
++ for (unsigned int i = 0; i != SUBS_MAX && sys->subpic_bufs[i] != NULL; ++i) {
++ mmal_buffer_header_release(sys->subpic_bufs[i]);
++ sys->subpic_bufs[i] = NULL;
++ }
+
+- if (subpicture)
+- subpicture_Delete(subpicture);
++ picture_Release(p_pic);
+
+ if (sys->next_phase_check == 0 && sys->adjust_refresh_rate)
+ maintain_phase_sync(vd);
+ sys->next_phase_check = (sys->next_phase_check + 1) % PHASE_CHECK_INTERVAL;
+-
+- if (sys->opaque) {
+- vlc_mutex_lock(&sys->buffer_mutex);
+- while (atomic_load(&sys->buffers_in_transit) >= MAX_BUFFERS_IN_TRANSIT)
+- vlc_cond_wait(&sys->buffer_cond, &sys->buffer_mutex);
+- vlc_mutex_unlock(&sys->buffer_mutex);
+- }
+ }
+
+ static int vd_control(vout_display_t *vd, int query, va_list args)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- vout_display_cfg_t cfg;
+- const vout_display_cfg_t *tmp_cfg;
++ vout_display_sys_t * const sys = vd->sys;
+ int ret = VLC_EGENERIC;
++ VLC_UNUSED(args);
+
+ switch (query) {
+- case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
+- tmp_cfg = va_arg(args, const vout_display_cfg_t *);
+- if (tmp_cfg->display.width == sys->display_width &&
+- tmp_cfg->display.height == sys->display_height) {
+- cfg = *vd->cfg;
+- cfg.display.width = sys->display_width;
+- cfg.display.height = sys->display_height;
+- if (configure_display(vd, &cfg, NULL) >= 0)
+- ret = VLC_SUCCESS;
+- }
+- break;
+-
+ case VOUT_DISPLAY_CHANGE_SOURCE_ASPECT:
+ case VOUT_DISPLAY_CHANGE_SOURCE_CROP:
+- if (configure_display(vd, NULL, &vd->source) >= 0)
++ if (configure_display(vd, vd->cfg, &vd->source) >= 0)
+ ret = VLC_SUCCESS;
+ break;
+
+- case VOUT_DISPLAY_RESET_PICTURES:
+- vlc_assert_unreachable();
+ case VOUT_DISPLAY_CHANGE_ZOOM:
+- msg_Warn(vd, "Unsupported control query %d", query);
++ case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
++ case VOUT_DISPLAY_CHANGE_DISPLAY_FILLED:
++ {
++ const vout_display_cfg_t * const cfg = va_arg(args, const vout_display_cfg_t *);
++
++ if (configure_display(vd, cfg, &vd->source) >= 0)
++ ret = VLC_SUCCESS;
++ break;
++ }
++
++ case VOUT_DISPLAY_RESET_PICTURES:
++ msg_Warn(vd, "Reset Pictures");
++ kill_pool(sys);
++ vd->fmt = vd->source; // Take (nearly) whatever source wants to give us
++ vd->fmt.i_chroma = req_chroma(vd); // Adjust chroma to something we can actually deal with
++ ret = VLC_SUCCESS;
++ break;
++
++ case VOUT_DISPLAY_CHANGE_MMAL_HIDE:
++ {
++ MMAL_STATUS_T err;
++ unsigned int i;
++
++ msg_Dbg(vd, "Hide display");
++
++ for (i = 0; i != SUBS_MAX; ++i)
++ hw_mmal_subpic_flush(VLC_OBJECT(vd), &sys->subs[i].sub);
++
++ if (sys->input->is_enabled &&
++ (err = mmal_port_disable(sys->input)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Unable to disable port: err=%d", err);
++ break;
++ }
++ sys->force_config = true;
++ ret = VLC_SUCCESS;
+ break;
++ }
+
+ default:
+ msg_Warn(vd, "Unknown control query %d", query);
+@@ -653,79 +971,207 @@ static int vd_control(vout_display_t *vd
+ return ret;
+ }
+
++static void set_display_windows(vout_display_t *const vd, vout_display_sys_t *const sys) // Recompute sys->display_rect and sys->win_rect
++{
++ unsigned int width, height;
++ if (query_resolution(vd, sys->display_id, &width, &height) < 0) { // negative return: fall back to the size in vd->cfg
++ width = vd->cfg->display.width;
++ height = vd->cfg->display.height;
++ }
++ sys->display_rect = (MMAL_RECT_T){0, 0, width, height};
++
++ sys->win_rect = (sys->req_win.width != 0) ? // a non-zero req_win width means an explicit window was requested
++ sys->req_win :
++ is_transform_transpose(sys->display_transform) ?
++ rect_transpose(sys->display_rect) : sys->display_rect; // otherwise the whole display, transposed when the transform is a transposing one
++}
++
+ static void vd_manage(vout_display_t *vd)
+ {
+- vout_display_sys_t *sys = vd->sys;
+- unsigned width, height;
++ vout_display_sys_t *const sys = vd->sys;
+
+ vlc_mutex_lock(&sys->manage_mutex);
+
+ if (sys->need_configure_display) {
+- close_dmx(vd);
+- sys->dmx_handle = vc_dispmanx_display_open(0);
+-
+- if (query_resolution(vd, &width, &height) >= 0) {
+- sys->display_width = width;
+- sys->display_height = height;
+- vout_display_SendEventDisplaySize(vd, width, height);
+- }
+-
+ sys->need_configure_display = false;
++ set_display_windows(vd, sys); // refresh display/window rects while manage_mutex is held, after the flag is cleared
+ }
+
+ vlc_mutex_unlock(&sys->manage_mutex);
+ }
+
+-static void control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++static int attach_subpics(vout_display_t * const vd, vout_display_sys_t * const sys,
++ subpicture_t * const subpicture) // Import every region of the subpicture list into sys->subpic_bufs[]; returns VLC_SUCCESS or VLC_ENOMEM
+ {
+- vout_display_t *vd = (vout_display_t *)port->userdata;
+- MMAL_STATUS_T status;
++ unsigned int n = 0; // next free slot in sys->subpic_bufs[]
+
+- if (buffer->cmd == MMAL_EVENT_ERROR) {
+- status = *(uint32_t *)buffer->data;
+- msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
++ if (sys->vzc == NULL) { // the VZC pool is created lazily on first use
++ if ((sys->vzc = hw_mmal_vzc_pool_new()) == NULL)
++ {
++ msg_Err(vd, "Failed to allocate VZC");
++ return VLC_ENOMEM;
++ }
+ }
+
+- mmal_buffer_header_release(buffer);
++ // Attempt to import the subpics
++ for (subpicture_t * spic = subpicture; spic != NULL; spic = spic->p_next)
++ {
++ for (subpicture_region_t *sreg = spic->p_region; sreg != NULL; sreg = sreg->p_next) {
++ picture_t *const src = sreg->p_picture;
++
++#if TRACE_ALL
++ char dbuf0[5];
++ msg_Dbg(vd, " [%p:%p] Pos=%d,%d max=%dx%d, src=%dx%d/%dx%d o:%d, spu=%d,%d:%dx%d, vd->fmt=%dx%d/%dx%d, vd->source=%dx%d/%dx%d, cfg=%dx%d, zoom=%d/%d, Alpha=%d, Fmt=%s", src, src->p[0].p_pixels,
++ sreg->i_x, sreg->i_y,
++ sreg->i_max_width, sreg->i_max_height,
++ src->format.i_visible_width, src->format.i_visible_height,
++ src->format.i_width, src->format.i_height,
++ src->format.orientation,
++ sys->spu_rect.x, sys->spu_rect.y, sys->spu_rect.width, sys->spu_rect.height,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->source.i_visible_width, vd->source.i_visible_height,
++ vd->source.i_width, vd->source.i_height,
++ vd->cfg->display.width, vd->cfg->display.height,
++ vd->cfg->zoom.num, vd->cfg->zoom.den,
++ sreg->i_alpha,
++ str_fourcc(dbuf0, src->format.i_chroma));
++#endif
++
++ // At this point I think the subtitles are being placed in the
++ // coord space of the placed rectangle in the cfg display space
++ if ((sys->subpic_bufs[n] = hw_mmal_vzc_buf_from_pic(sys->vzc,
++ src,
++ (MMAL_RECT_T){.width = sys->spu_rect.width, .height=sys->spu_rect.height},
++ sreg->i_x, sreg->i_y,
++ sreg->i_alpha,
++ n == 0)) == NULL) // last argument is true only for the first imported region
++ {
++ msg_Err(vd, "Failed to allocate vzc buffer for subpic");
++ return VLC_ENOMEM; // buffers imported so far stay in sys->subpic_bufs[]; they are not released here
++ }
++
++ if (++n == SUBS_MAX) // all slots used: any remaining regions are dropped without an error
++ return VLC_SUCCESS;
++ }
++ }
++ return VLC_SUCCESS;
+ }
+
+-static void input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++
++static void vd_prepare(vout_display_t *vd, picture_t *p_pic, // Per-picture setup: reconfigure if needed, import subpics, stage copy/ISP buffers
++#if VLC_VER_3
++ subpicture_t *subpicture
++#else
++ subpicture_t *subpicture, vlc_tick_t date
++#endif
++ )
+ {
+- vout_display_t *vd = (vout_display_t *)port->userdata;
++ MMAL_STATUS_T err;
++ vout_display_sys_t * const sys = vd->sys;
++
++ vd_manage(vd);
++
++ if (!check_shape(vd, p_pic)) // NOTE(review): presumably validates picture geometry against the configured format - confirm in check_shape()
++ return;
++
++ if (sys->force_config || // reconfigure when forced, or when frame rate / interlace properties of the picture changed
++ p_pic->format.i_frame_rate != sys->i_frame_rate ||
++ p_pic->format.i_frame_rate_base != sys->i_frame_rate_base ||
++ p_pic->b_progressive != sys->b_progressive ||
++ p_pic->b_top_field_first != sys->b_top_field_first)
++ {
++ sys->force_config = false;
++ sys->b_top_field_first = p_pic->b_top_field_first;
++ sys->b_progressive = p_pic->b_progressive;
++ sys->i_frame_rate = p_pic->format.i_frame_rate;
++ sys->i_frame_rate_base = p_pic->format.i_frame_rate_base;
++ configure_display(vd, NULL, &vd->source); // result is not checked
++ }
++
++ // Subpics can either turn up attached to the main pic or in the
++ // subpic list here - if they turn up here then process into temp
++ // buffers
++ if (subpicture != NULL) {
++ attach_subpics(vd, sys, subpicture); // return value ignored; a partial import is left in sys->subpic_bufs[]
++ }
++
++ // *****
++ if (want_copy(vd)) {
++ if (sys->copy_buf != NULL) { // a leftover copy buffer from an earlier call: log it and release before reuse
++ msg_Err(vd, "Copy buf not NULL");
++ mmal_buffer_header_release(sys->copy_buf);
++ sys->copy_buf = NULL;
++ }
++
++ MMAL_BUFFER_HEADER_T * const buf = mmal_queue_wait(sys->copy_pool->queue); // waits for a free buffer from the copy pool
++ // Copy 2d
++ hw_mmal_copy_pic_to_buf(buf->data, &buf->length, sys->input->format, p_pic);
++ buf->flags = MMAL_BUFFER_HEADER_FLAG_FRAME_END;
++
++ sys->copy_buf = buf; // kept in sys for later use outside this function
++ }
++
++ if (isp_check(vd, sys) != MMAL_SUCCESS) {
++ return;
++ }
++
++ if (want_isp(vd))
++ {
++ struct vout_isp_conf_s * const isp = &sys->isp;
++ MMAL_BUFFER_HEADER_T * buf;
++
++ // This should be empty - make it so if it isn't
++ isp_empty_out_q(isp);
++ isp->pending = false;
++
++ // Stuff output
++ if (isp_prepare(vd, isp) != MMAL_SUCCESS)
++ return;
++
++ if ((buf = hw_mmal_pic_buf_replicated(p_pic, isp->in_pool)) == NULL)
++ {
++ msg_Err(vd, "Pic has no attached buffer");
++ return;
++ }
++
++ if ((err = mmal_port_send_buffer(isp->input, buf)) != MMAL_SUCCESS)
++ {
++ msg_Err(vd, "Send buffer to input failed");
++ mmal_buffer_header_release(buf); // the send failed, so the buffer is released here
++ return;
++ }
++
++ isp->pending = true; // a buffer is now with the ISP input; reset to false at the top of this branch on the next call
++ }
++
++#if 0
++ VLC_UNUSED(date);
+ vout_display_sys_t *sys = vd->sys;
+- picture_t *picture = (picture_t *)buffer->user_data;
++ picture_sys_t *pic_sys = picture->p_sys;
+
+- if (picture)
+- picture_Release(picture);
++ if (!sys->adjust_refresh_rate || pic_sys->displayed)
++ return;
+
+- vlc_mutex_lock(&sys->buffer_mutex);
+- atomic_fetch_sub(&sys->buffers_in_transit, 1);
+- vlc_cond_signal(&sys->buffer_cond);
+- vlc_mutex_unlock(&sys->buffer_mutex);
++ /* Apply the required phase_offset to the picture, so that vd_display()
++ * will be called at the corrected time from the core */
++ picture->date += sys->phase_offset;
++#endif
+ }
+
+-static int query_resolution(vout_display_t *vd, unsigned *width, unsigned *height)
++
++static void vd_control_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) // Control-port callback: log MMAL error events, then release the buffer
+ {
+- TV_DISPLAY_STATE_T display_state;
+- int ret = 0;
++ vout_display_t *vd = (vout_display_t *)port->userdata; // userdata is set to the vout_display_t before the port is enabled
++ MMAL_STATUS_T status;
+
+- if (vc_tv_get_display_state(&display_state) == 0) {
+- if (display_state.state & 0xFF) {
+- *width = display_state.display.hdmi.width;
+- *height = display_state.display.hdmi.height;
+- } else if (display_state.state & 0xFF00) {
+- *width = display_state.display.sdtv.width;
+- *height = display_state.display.sdtv.height;
+- } else {
+- msg_Warn(vd, "Invalid display state %"PRIx32, display_state.state);
+- ret = -1;
+- }
+- } else {
+- msg_Warn(vd, "Failed to query display resolution");
+- ret = -1;
++ if (buffer->cmd == MMAL_EVENT_ERROR) { // for error events the payload is read as a 32-bit MMAL status code
++ status = *(uint32_t *)buffer->data;
++ msg_Err(vd, "MMAL error %"PRIx32" \"%s\"", status, mmal_status_to_string(status));
+ }
+
+- return ret;
++ mmal_buffer_header_release(buffer); // released for every event, error or not
+ }
+
+ static void tvservice_cb(void *callback_data, uint32_t reason, uint32_t param1, uint32_t param2)
+@@ -780,9 +1226,9 @@ static void adjust_refresh_rate(vout_dis
+ double best_score, score;
+ int i;
+
+- vc_tv_get_display_state(&display_state);
++ vc_tv_get_display_state_id(sys->display_id, &display_state);
+ if(display_state.display.hdmi.mode != HDMI_MODE_OFF) {
+- num_modes = vc_tv_hdmi_get_supported_modes_new(display_state.display.hdmi.group,
++ num_modes = vc_tv_hdmi_get_supported_modes_new_id(sys->display_id, display_state.display.hdmi.group,
+ supported_modes, VC_TV_MAX_MODE_IDS, NULL, NULL);
+
+ for (i = 0; i < num_modes; ++i) {
+@@ -810,7 +1256,7 @@ static void adjust_refresh_rate(vout_dis
+ if((best_id >= 0) && (display_state.display.hdmi.mode != supported_modes[best_id].code)) {
+ msg_Info(vd, "Setting HDMI refresh rate to %"PRIu32,
+ supported_modes[best_id].frame_rate);
+- vc_tv_hdmi_power_on_explicit_new(HDMI_MODE_HDMI,
++ vc_tv_hdmi_power_on_explicit_new_id(sys->display_id, HDMI_MODE_HDMI,
+ supported_modes[best_id].group,
+ supported_modes[best_id].code);
+ }
+@@ -828,148 +1274,12 @@ static void adjust_refresh_rate(vout_dis
+ }
+ }
+
+-static void display_subpicture(vout_display_t *vd, subpicture_t *subpicture)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- struct dmx_region_t **dmx_region = &sys->dmx_region;
+- struct dmx_region_t *unused_dmx_region;
+- DISPMANX_UPDATE_HANDLE_T update = 0;
+- picture_t *picture;
+- video_format_t *fmt;
+- struct dmx_region_t *dmx_region_next;
+-
+- if(subpicture) {
+- subpicture_region_t *region = subpicture->p_region;
+- while(region) {
+- picture = region->p_picture;
+- fmt = &region->fmt;
+-
+- if(!*dmx_region) {
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- *dmx_region = dmx_region_new(vd, update, region);
+- } else if(((*dmx_region)->bmp_rect.width != (int32_t)fmt->i_visible_width) ||
+- ((*dmx_region)->bmp_rect.height != (int32_t)fmt->i_visible_height) ||
+- ((*dmx_region)->pos_x != region->i_x) ||
+- ((*dmx_region)->pos_y != region->i_y) ||
+- ((*dmx_region)->alpha.opacity != (uint32_t)region->i_alpha)) {
+- dmx_region_next = (*dmx_region)->next;
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_delete(*dmx_region, update);
+- *dmx_region = dmx_region_new(vd, update, region);
+- (*dmx_region)->next = dmx_region_next;
+- } else if((*dmx_region)->picture != picture) {
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_update(*dmx_region, update, picture);
+- }
+-
+- dmx_region = &(*dmx_region)->next;
+- region = region->p_next;
+- }
+- }
+-
+- /* Remove remaining regions */
+- unused_dmx_region = *dmx_region;
+- while(unused_dmx_region) {
+- dmx_region_next = unused_dmx_region->next;
+- if(!update)
+- update = vc_dispmanx_update_start(10);
+- dmx_region_delete(unused_dmx_region, update);
+- unused_dmx_region = dmx_region_next;
+- }
+- *dmx_region = NULL;
+-
+- if(update)
+- vc_dispmanx_update_submit_sync(update);
+-}
+-
+-static void close_dmx(vout_display_t *vd)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- DISPMANX_UPDATE_HANDLE_T update = vc_dispmanx_update_start(10);
+- struct dmx_region_t *dmx_region = sys->dmx_region;
+- struct dmx_region_t *dmx_region_next;
+-
+- while(dmx_region) {
+- dmx_region_next = dmx_region->next;
+- dmx_region_delete(dmx_region, update);
+- dmx_region = dmx_region_next;
+- }
+-
+- vc_dispmanx_update_submit_sync(update);
+- sys->dmx_region = NULL;
+-
+- show_background(vd, false);
+-
+- vc_dispmanx_display_close(sys->dmx_handle);
+- sys->dmx_handle = DISPMANX_NO_HANDLE;
+-}
+-
+-static struct dmx_region_t *dmx_region_new(vout_display_t *vd,
+- DISPMANX_UPDATE_HANDLE_T update, subpicture_region_t *region)
+-{
+- vout_display_sys_t *sys = vd->sys;
+- video_format_t *fmt = &region->fmt;
+- struct dmx_region_t *dmx_region = malloc(sizeof(struct dmx_region_t));
+- uint32_t image_handle;
+-
+- dmx_region->pos_x = region->i_x;
+- dmx_region->pos_y = region->i_y;
+-
+- vc_dispmanx_rect_set(&dmx_region->bmp_rect, 0, 0, fmt->i_visible_width,
+- fmt->i_visible_height);
+- vc_dispmanx_rect_set(&dmx_region->src_rect, 0, 0, fmt->i_visible_width << 16,
+- fmt->i_visible_height << 16);
+- vc_dispmanx_rect_set(&dmx_region->dst_rect, region->i_x, region->i_y,
+- fmt->i_visible_width, fmt->i_visible_height);
+-
+- dmx_region->resource = vc_dispmanx_resource_create(VC_IMAGE_RGBA32,
+- dmx_region->bmp_rect.width | (region->p_picture->p[0].i_pitch << 16),
+- dmx_region->bmp_rect.height | (dmx_region->bmp_rect.height << 16),
+- &image_handle);
+- vc_dispmanx_resource_write_data(dmx_region->resource, VC_IMAGE_RGBA32,
+- region->p_picture->p[0].i_pitch,
+- region->p_picture->p[0].p_pixels, &dmx_region->bmp_rect);
+-
+- dmx_region->alpha.flags = DISPMANX_FLAGS_ALPHA_FROM_SOURCE | DISPMANX_FLAGS_ALPHA_MIX;
+- dmx_region->alpha.opacity = region->i_alpha;
+- dmx_region->alpha.mask = DISPMANX_NO_HANDLE;
+- dmx_region->element = vc_dispmanx_element_add(update, sys->dmx_handle,
+- sys->layer + 1, &dmx_region->dst_rect, dmx_region->resource,
+- &dmx_region->src_rect, DISPMANX_PROTECTION_NONE,
+- &dmx_region->alpha, NULL, VC_IMAGE_ROT0);
+-
+- dmx_region->next = NULL;
+- dmx_region->picture = region->p_picture;
+-
+- return dmx_region;
+-}
+-
+-static void dmx_region_update(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update, picture_t *picture)
+-{
+- vc_dispmanx_resource_write_data(dmx_region->resource, VC_IMAGE_RGBA32,
+- picture->p[0].i_pitch, picture->p[0].p_pixels, &dmx_region->bmp_rect);
+- vc_dispmanx_element_change_source(update, dmx_region->element, dmx_region->resource);
+- dmx_region->picture = picture;
+-}
+-
+-static void dmx_region_delete(struct dmx_region_t *dmx_region,
+- DISPMANX_UPDATE_HANDLE_T update)
+-{
+- vc_dispmanx_element_remove(update, dmx_region->element);
+- vc_dispmanx_resource_delete(dmx_region->resource);
+- free(dmx_region);
+-}
+-
+ static void maintain_phase_sync(vout_display_t *vd)
+ {
+ MMAL_PARAMETER_VIDEO_RENDER_STATS_T render_stats = {
+ .hdr = { MMAL_PARAMETER_VIDEO_RENDER_STATS, sizeof(render_stats) },
+ };
+- int32_t frame_duration = 1000000 /
++ int32_t frame_duration = CLOCK_FREQ /
+ ((double)vd->sys->i_frame_rate /
+ vd->sys->i_frame_rate_base);
+ vout_display_sys_t *sys = vd->sys;
+@@ -1012,32 +1322,436 @@ static void maintain_phase_sync(vout_dis
+ }
+ }
+
+-static void show_background(vout_display_t *vd, bool enable)
++static void CloseMmalVout(vlc_object_t *object) // Tear down everything OpenMmalVout() set up; also used as its failure path
+ {
+- vout_display_sys_t *sys = vd->sys;
+- uint32_t image_ptr, color = 0xFF000000;
+- VC_RECT_T dst_rect, src_rect;
+- DISPMANX_UPDATE_HANDLE_T update;
+-
+- if (enable && !sys->bkg_element) {
+- sys->bkg_resource = vc_dispmanx_resource_create(VC_IMAGE_RGBA32, 1, 1,
+- &image_ptr);
+- vc_dispmanx_rect_set(&dst_rect, 0, 0, 1, 1);
+- vc_dispmanx_resource_write_data(sys->bkg_resource, VC_IMAGE_RGBA32,
+- sizeof(color), &color, &dst_rect);
+- vc_dispmanx_rect_set(&src_rect, 0, 0, 1 << 16, 1 << 16);
+- vc_dispmanx_rect_set(&dst_rect, 0, 0, 0, 0);
+- update = vc_dispmanx_update_start(0);
+- sys->bkg_element = vc_dispmanx_element_add(update, sys->dmx_handle,
+- sys->layer - 1, &dst_rect, sys->bkg_resource, &src_rect,
+- DISPMANX_PROTECTION_NONE, NULL, NULL, VC_IMAGE_ROT0);
+- vc_dispmanx_update_submit_sync(update);
+- } else if (!enable && sys->bkg_element) {
+- update = vc_dispmanx_update_start(0);
+- vc_dispmanx_element_remove(update, sys->bkg_element);
+- vc_dispmanx_resource_delete(sys->bkg_resource);
+- vc_dispmanx_update_submit_sync(update);
+- sys->bkg_element = DISPMANX_NO_HANDLE;
+- sys->bkg_resource = DISPMANX_NO_HANDLE;
++ vout_display_t * const vd = (vout_display_t *)object;
++ vout_display_sys_t * const sys = vd->sys;
++ char response[20]; /* answer is hvs_update_fields=%1d */
++
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++
++ kill_pool(sys);
++
++ vc_tv_unregister_callback_full(tvservice_cb, vd);
++
++ // Shouldn't be anything here - but just in case
++ for (unsigned int i = 0; i != SUBS_MAX; ++i)
++ if (sys->subpic_bufs[i] != NULL)
++ mmal_buffer_header_release(sys->subpic_bufs[i]);
++
++ for (unsigned int i = 0; i != SUBS_MAX; ++i) { // per-subpicture renderer: close, disable control port and component, release
++ vout_subpic_t * const sub = sys->subs + i;
++ if (sub->component != NULL) {
++ hw_mmal_subpic_close(VLC_OBJECT(vd), &sub->sub);
++ if (sub->component->control->is_enabled)
++ mmal_port_disable(sub->component->control);
++ if (sub->component->is_enabled)
++ mmal_component_disable(sub->component);
++ mmal_component_release(sub->component);
++ sub->component = NULL;
++ }
+ }
++
++ if (sys->input && sys->input->is_enabled) // main renderer: ports are disabled before the component itself
++ mmal_port_disable(sys->input);
++
++ if (sys->component && sys->component->control->is_enabled)
++ mmal_port_disable(sys->component->control);
++
++ if (sys->copy_buf != NULL)
++ mmal_buffer_header_release(sys->copy_buf);
++
++ if (sys->input != NULL && sys->copy_pool != NULL) // copy_pool is a port pool, so it is destroyed against the input port
++ mmal_port_pool_destroy(sys->input, sys->copy_pool);
++
++ if (sys->component && sys->component->is_enabled)
++ mmal_component_disable(sys->component);
++
++ if (sys->pool)
++ mmal_pool_destroy(sys->pool);
++
++ if (sys->component)
++ mmal_component_release(sys->component);
++
++ isp_close(vd, sys);
++
++ hw_mmal_vzc_pool_release(sys->vzc);
++
++ vlc_mutex_destroy(&sys->manage_mutex);
++
++ if (sys->native_interlaced) { // NOTE(review): native_interlaced is set outside this view - confirm where the field mode is enabled
++ if (vc_gencmd(response, sizeof(response), "hvs_update_fields 0") < 0 ||
++ response[18] != '0') // index 18 is the digit that follows "hvs_update_fields=" in the reply
++ msg_Warn(vd, "Could not reset hvs field mode");
++ }
++
++ cma_vcsm_exit(sys->init_type); // stray second ';' (empty statement) removed
++
++ free(sys); // vd->sys is left pointing at freed memory; only vd itself is used below
++
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s", __func__);
++#endif
++}
++
++
++static const struct {
++ const char * name;
++ int num;
++} display_name_to_num[] = { // display option string -> DISPMANX display id, searched by find_display_num()
++ {"auto", -1}, // negative: no explicit display chosen
++ {"hdmi-1", DISPMANX_ID_HDMI0},
++ {"hdmi-2", DISPMANX_ID_HDMI1},
++ {NULL, -2} // sentinel; -2 is what a failed lookup returns
++};
++
++static const struct {
++ const char * name;
++ int transform_num;
++} transform_name_to_num[] = { // transform option string -> MMAL_DISPLAY_* value, searched by find_transform_num()
++ {"auto", -1}, // negative: no explicit transform chosen
++ {"0", MMAL_DISPLAY_ROT0},
++ {"hflip", MMAL_DISPLAY_MIRROR_ROT0},
++ {"vflip", MMAL_DISPLAY_MIRROR_ROT180},
++ {"180", MMAL_DISPLAY_ROT180},
++ {"transpose", MMAL_DISPLAY_MIRROR_ROT90},
++ {"270", MMAL_DISPLAY_ROT270},
++ {"90", MMAL_DISPLAY_ROT90},
++ {"antitranspose", MMAL_DISPLAY_MIRROR_ROT270},
++ {NULL, -2} // sentinel; -2 is what a failed lookup returns
++};
++
++static int find_display_num(const char * const name) // Case-insensitive lookup in display_name_to_num; -1 = "auto"/NULL, -2 = unknown name
++{
++ unsigned int i;
++ for (i = 0; name != NULL && display_name_to_num[i].name != NULL && strcasecmp(display_name_to_num[i].name, name) != 0; ++i)
++ /* Loop */;
++ return name == NULL ? -1 : display_name_to_num[i].num; // NULL (unset/empty option) behaves like "auto" instead of crashing in strcasecmp()
++}
++
++static int find_transform_num(const char * const name) // Case-insensitive lookup in transform_name_to_num; -1 = "auto"/NULL, -2 = unknown name
++{
++ unsigned int i;
++ for (i = 0; name != NULL && transform_name_to_num[i].name != NULL && strcasecmp(transform_name_to_num[i].name, name) != 0; ++i)
++ /* Loop */;
++ return name == NULL ? -1 : transform_name_to_num[i].transform_num; // NULL (unset/empty option) behaves like "auto" instead of crashing in strcasecmp()
++}
++
++#if HAVE_X11_XLIB_H
++#include <X11/Xlib.h>
++#include <X11/extensions/Xrandr.h>
++static MMAL_DISPLAYTRANSFORM_T get_xrandr_rotation(vout_display_t * const vd) // Map the current X RandR rotation to an MMAL display transform
++{
++ Display * const x = XOpenDisplay(NULL); // NULL selects the default display name
++ Rotation cur_rot = 0;
++ MMAL_DISPLAYTRANSFORM_T trans;
++
++ if (x == NULL) // no X connection: report an unrotated display
++ return MMAL_DISPLAY_ROT0;
++
++ XRRRotations(x, 0, &cur_rot); // screen 0 only; the current rotation is written to cur_rot, the returned set is ignored
++ XCloseDisplay(x);
++
++ // Convert to MMAL
++ // xrandr seems to rotate the other way to mmal
++
++ switch (cur_rot)
++ {
++ case 0:
++ case RR_Rotate_0:
++ trans = MMAL_DISPLAY_ROT0;
++ break;
++ case RR_Rotate_90:
++ trans = MMAL_DISPLAY_ROT270;
++ break;
++ case RR_Rotate_180:
++ trans = MMAL_DISPLAY_ROT180;
++ break;
++ case RR_Rotate_270:
++ trans = MMAL_DISPLAY_ROT90;
++ break;
++ case RR_Reflect_X:
++ trans = MMAL_DISPLAY_MIRROR_ROT0;
++ break;
++ case RR_Reflect_Y:
++ trans = MMAL_DISPLAY_MIRROR_ROT180;
++ break;
++ default: // any value not listed above, e.g. a rotation combined with a reflection bit
++ msg_Info(vd, "Unexpected X rotation value: %#x", cur_rot);
++ trans = MMAL_DISPLAY_ROT0;
++ break;
++ }
++
++ return trans;
++}
++#else
++static MMAL_DISPLAYTRANSFORM_T get_xrandr_rotation(vout_display_t * const vd) // Build without Xlib headers: always unrotated
++{
++ VLC_UNUSED(vd);
++ return MMAL_DISPLAY_ROT0;
++}
++#endif
++
++static MMAL_RECT_T str_to_rect(const char * s) // Parse "W[xH[+X[+Y]]]" into a rect; all-zero rect on NULL or malformed input
++{
++ MMAL_RECT_T rect = {0};
++ if (s == NULL) return rect; // unset/empty option string: nothing requested, and avoids dereferencing NULL below
++ rect.width = strtoul(s, (char**)&s, 0); // each strtoul() advances s past the digits it consumed
++ if (*s == '\0')
++ return rect;
++ if (*s++ != 'x') // non-numeric text such as "fullscreen" ends up here and yields the zero rect
++ goto fail;
++ rect.height = strtoul(s, (char**)&s, 0);
++ if (*s == '\0')
++ return rect;
++ if (*s++ != '+')
++ goto fail;
++ rect.x = strtoul(s, (char**)&s, 0);
++ if (*s == '\0')
++ return rect;
++ if (*s++ != '+')
++ goto fail;
++ rect.y = strtoul(s, (char**)&s, 0);
++ if (*s != '\0') // trailing garbage after Y is an error
++ goto fail;
++ return rect;
++fail:
++ return (MMAL_RECT_T){0,0,0,0};
++}
++
++static int OpenMmalVout(vlc_object_t *object)
++{
++ vout_display_t *vd = (vout_display_t *)object;
++ vout_display_sys_t *sys;
++ MMAL_STATUS_T status;
++ int ret = VLC_EGENERIC;
++ // At the moment all copy is via I420
++ const bool needs_copy = !hw_mmal_chroma_is_mmal(vd->fmt.i_chroma);
++ const MMAL_FOURCC_T enc_in = needs_copy ? MMAL_ENCODING_I420 :
++ vout_vlc_to_mmal_pic_fourcc(vd->fmt.i_chroma);
++
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s: o:%d", __func__, (int)vd->fmt.orientation);
++#endif
++
++ get_xrandr_rotation(vd);
++
++ sys = calloc(1, sizeof(struct vout_display_sys_t));
++ if (!sys)
++ return VLC_ENOMEM;
++ vd->sys = sys;
++
++ vlc_mutex_init(&sys->manage_mutex);
++
++ if ((sys->init_type = cma_vcsm_init()) == VCSM_INIT_NONE)
++ {
++ msg_Err(vd, "VCSM init fail");
++ goto fail;
++ }
++
++ vc_tv_register_callback(tvservice_cb, vd);
++
++ sys->layer = var_InheritInteger(vd, MMAL_LAYER_NAME);
++ sys->transparent = var_InheritBool(vd, MMAL_VOUT_TRANSPARENT_NAME);
++
++ {
++ const char *display_name = var_InheritString(vd, MMAL_DISPLAY_NAME);
++ int qt_num = var_InheritInteger(vd, "qt-fullscreen-screennumber" );
++ int display_id = find_display_num(display_name);
++// sys->display_id = display_id < 0 ? vc_tv_get_default_display_id() : display_id;
++ sys->display_id = display_id >= 0 ? display_id :
++ qt_num == 1 ? DISPMANX_ID_HDMI1 : DISPMANX_ID_HDMI;
++ if (display_id < -1)
++ msg_Warn(vd, "Unknown display device: '%s'", display_name);
++ else
++ msg_Dbg(vd, "Display device: %s, qt=%d id=%d display=%d", display_name,
++ qt_num, display_id, sys->display_id);
++ }
++
++ {
++ const char *window_str = var_InheritString(vd, MMAL_VOUT_WINDOW_NAME);
++ sys->req_win = str_to_rect(window_str);
++ if (sys->req_win.width != 0)
++ msg_Dbg(vd, "Window: %dx%d @ %d,%d",
++ sys->req_win.width, sys->req_win.height,
++ sys->req_win.x, sys->req_win.y);
++ }
++
++ {
++ const char *transform_name = var_InheritString(vd, MMAL_VOUT_TRANSFORM_NAME);
++ int transform_num = find_transform_num(transform_name);
++ sys->display_transform = transform_num < 0 ?
++ get_xrandr_rotation(vd) :
++ (MMAL_DISPLAYTRANSFORM_T)transform_num;
++
++ if (transform_num < -1)
++ msg_Warn(vd, "Unknown vout transform: '%s'", transform_name);
++ else
++ msg_Dbg(vd, "Display transform: %s, mmal_display_transform=%d",
++ transform_name, (int)sys->display_transform);
++
++ sys->video_transform = combine_transform(
++ vlc_to_mmal_transform(vd->fmt.orientation), sys->display_transform);
++ sys->dest_transform = transform_inverse(sys->display_transform);
++ }
++
++ status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to create MMAL component %s (status=%"PRIx32" %s)",
++ MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ sys->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++ status = mmal_port_enable(sys->component->control, vd_control_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable control port %s (status=%"PRIx32" %s)",
++ sys->component->control->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ sys->input = sys->component->input[0];
++ sys->input->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++
++ sys->input->format->encoding = enc_in;
++ sys->input->format->encoding_variant = 0;
++ sys->i_planes = 1;
++
++ display_set_format(vd, sys->input->format, want_isp(vd));
++
++ status = port_parameter_set_bool(sys->input, MMAL_PARAMETER_ZERO_COPY, true);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to set zero copy on port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ status = mmal_port_format_commit(sys->input);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to commit format for input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ sys->input->buffer_size = sys->input->buffer_size_recommended;
++
++ if (!needs_copy) {
++ sys->input->buffer_num = 30;
++ }
++ else {
++ sys->input->buffer_num = 2;
++ if ((sys->copy_pool = mmal_port_pool_create(sys->input, 2, sys->input->buffer_size)) == NULL)
++ {
++ msg_Err(vd, "Cannot create copy pool");
++ goto fail;
++ }
++ }
++
++ set_display_windows(vd, sys);
++
++ configure_display(vd, vd->cfg, &vd->source);
++
++ status = mmal_port_enable(sys->input, vd_input_port_cb);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable input port %s (status=%"PRIx32" %s)",
++ sys->input->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ status = mmal_component_enable(sys->component);
++ if (status != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable component %s (status=%"PRIx32" %s)",
++ sys->component->name, status, mmal_status_to_string(status));
++ goto fail;
++ }
++
++ if ((sys->pool = mmal_pool_create(sys->input->buffer_num, 0)) == NULL)
++ {
++ msg_Err(vd, "Failed to create input pool");
++ goto fail;
++ }
++
++ for (unsigned int i = 0; i != SUBS_MAX; ++i) {
++ vout_subpic_t * const sub = sys->subs + i;
++ if ((status = mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &sub->component)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(vd, "Failed to create subpic component %d", i);
++ goto fail;
++ }
++ sub->component->control->userdata = (struct MMAL_PORT_USERDATA_T *)vd;
++ if ((status = mmal_port_enable(sub->component->control, vd_control_port_cb)) != MMAL_SUCCESS) {
++ msg_Err(vd, "Failed to enable control port %s on sub %d (status=%"PRIx32" %s)",
++ sys->component->control->name, i, status, mmal_status_to_string(status));
++ goto fail;
++ }
++ if ((status = hw_mmal_subpic_open(VLC_OBJECT(vd), &sub->sub, sub->component->input[0],
++ sys->display_id, sys->layer + i + 1)) != MMAL_SUCCESS) {
++ msg_Dbg(vd, "Failed to open subpic %d", i);
++ goto fail;
++ }
++ if ((status = mmal_component_enable(sub->component)) != MMAL_SUCCESS)
++ {
++ msg_Dbg(vd, "Failed to enable subpic component %d", i);
++ goto fail;
++ }
++ }
++
++ // If we can't deal with it directly ask for I420
++ vd->fmt.i_chroma = req_chroma(vd);
++
++ vd->info = (vout_display_info_t){
++ .is_slow = false,
++ .has_double_click = false,
++ .needs_hide_mouse = false,
++ .has_pictures_invalid = true,
++ .subpicture_chromas = hw_mmal_vzc_subpicture_chromas
++ };
++
++ vd->pool = vd_pool;
++ vd->prepare = vd_prepare;
++ vd->display = vd_display;
++ vd->control = vd_control;
++
++
++ msg_Dbg(vd, ">>> %s: ok", __func__);
++ return VLC_SUCCESS;
++
++fail:
++ CloseMmalVout(object);
++
++ msg_Dbg(vd, ">>> %s: rv=%d", __func__, ret);
++
++ return ret == VLC_SUCCESS ? VLC_EGENERIC : ret;
+ }
++
++vlc_module_begin()
++
++ add_submodule()
++
++ set_shortname(N_("MMAL vout"))
++ set_description(N_("MMAL-based vout plugin for Raspberry Pi"))
++ set_capability("vout display", 16) // 1 point better than ASCII art
++ add_shortcut("mmal_vout")
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VOUT )
++
++ add_integer(MMAL_LAYER_NAME, 1, MMAL_LAYER_TEXT, MMAL_LAYER_LONGTEXT, false) // read into sys->layer by OpenMmalVout()
++ add_bool(MMAL_ADJUST_REFRESHRATE_NAME, false, MMAL_ADJUST_REFRESHRATE_TEXT,
++ MMAL_ADJUST_REFRESHRATE_LONGTEXT, false)
++ add_bool(MMAL_NATIVE_INTERLACED, false, MMAL_NATIVE_INTERLACE_TEXT,
++ MMAL_NATIVE_INTERLACE_LONGTEXT, false)
++ add_string(MMAL_DISPLAY_NAME, "auto", MMAL_DISPLAY_TEXT, // values are matched against display_name_to_num[]
++ MMAL_DISPLAY_LONGTEXT, false)
++ add_string(MMAL_VOUT_TRANSFORM_NAME, "auto", MMAL_VOUT_TRANSFORM_TEXT, // values are matched against transform_name_to_num[]
++ MMAL_VOUT_TRANSFORM_LONGTEXT, false)
++ add_string(MMAL_VOUT_WINDOW_NAME, "fullscreen", MMAL_VOUT_WINDOW_TEXT, // the non-numeric default parses to an all-zero rect in str_to_rect(), i.e. no explicit window
++ MMAL_VOUT_WINDOW_LONGTEXT, false)
++ add_bool(MMAL_VOUT_TRANSPARENT_NAME, false, MMAL_VOUT_TRANSPARENT_TEXT,
++ MMAL_VOUT_TRANSPARENT_LONGTEXT, false)
++ set_callbacks(OpenMmalVout, CloseMmalVout) // open/close entry points for this module
++
++vlc_module_end()
++
++
+--- /dev/null
++++ b/modules/hw/mmal/xsplitter.c
+@@ -0,0 +1,584 @@
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <stdatomic.h>
++
++#include <vlc_common.h>
++#include <vlc_plugin.h>
++#include <vlc_threads.h>
++#include <vlc_vout_display.h>
++#include <vlc_modules.h>
++
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++
++#include "mmal_picture.h"
++
++#define TRACE_ALL 0
++
++typedef struct display_desc_s
++{
++ vout_display_t * vout; // child display object; released through unload_display_module()
++ unsigned int max_pels; // visible-pixel limit used by cpy_fmt_limit_size()
++} display_desc_t;
++
++typedef struct mmal_x11_sys_s
++{
++ bool use_mmal; // NOTE(review): presumably selects the MMAL child over the X one - confirm at its use site
++ display_desc_t * cur_desc; // NOTE(review): presumably points at mmal_desc or x_desc - confirm at its use site
++ display_desc_t mmal_desc; // MMAL child display; its vout is unloaded in CloseMmalX11()
++ display_desc_t x_desc; // X child display; its vout is unloaded in CloseMmalX11()
++ uint32_t changed;
++ vlc_fourcc_t subpicture_chromas[16];
++} mmal_x11_sys_t;
++
++#define MAX_GL_PELS (1920*1080)
++#define MAX_MMAL_PELS (4096*4096) // Should never be hit
++
++#if 0
++// Gen prog for the following table
++// Not done inline in case we end up pulling in FP libs we don't want
++#include <math.h>
++#include <stdio.h>
++
++int main(int argc, char *argv[])
++{
++ unsigned int i;
++ for (i = 0; i != 64; ++i)
++ {
++ printf(" [%2u]=%5u,", i, (unsigned int)(0.5 + (1/sqrt((i + 5)/4.0) * 65536.0)));
++ if (i % 4 == 3)
++ printf("\n");
++ }
++}
++#endif
++
++static const uint16_t sqrt_tab[64] = { // sqrt_tab[i] = round(65536 / sqrt((i + 5) / 4.0)): 16.16 fixed-point scale factors
++ [ 0]=58617, [ 1]=53510, [ 2]=49541, [ 3]=46341,
++ [ 4]=43691, [ 5]=41449, [ 6]=39520, [ 7]=37837,
++ [ 8]=36353, [ 9]=35030, [10]=33843, [11]=32768,
++ [12]=31790, [13]=30894, [14]=30070, [15]=29309,
++ [16]=28602, [17]=27945, [18]=27330, [19]=26755,
++ [20]=26214, [21]=25705, [22]=25225, [23]=24770,
++ [24]=24339, [25]=23930, [26]=23541, [27]=23170,
++ [28]=22817, [29]=22479, [30]=22155, [31]=21845,
++ [32]=21548, [33]=21263, [34]=20988, [35]=20724,
++ [36]=20470, [37]=20225, [38]=19988, [39]=19760,
++ [40]=19539, [41]=19326, [42]=19119, [43]=18919,
++ [44]=18725, [45]=18536, [46]=18354, [47]=18176,
++ [48]=18004, [49]=17837, [50]=17674, [51]=17515,
++ [52]=17361, [53]=17211, [54]=17064, [55]=16921,
++ [56]=16782, [57]=16646, [58]=16514, [59]=16384,
++ [60]=16257, [61]=16134, [62]=16013, [63]=15895
++};
++#define SQRT_MAX (sizeof(sqrt_tab)/sizeof(sqrt_tab[0]) - 1) // index of the last table entry
++
++static bool cpy_fmt_limit_size(const display_desc_t * const dd,
++ video_format_t * const dst,
++ const video_format_t * const src) // Copy *src to *dst, shrinking the size when the visible area exceeds dd->max_pels; returns true iff it was shrunk
++{
++ const unsigned int src_pel = src->i_visible_width * src->i_visible_height;
++
++ *dst = *src; // dst starts as an exact copy; only the four size fields are overwritten below
++
++ if (src_pel <= dd->max_pels)
++ return false;
++
++ // scaling factor sqrt(max_pel/cur_pel)
++ // sqrt done by lookup & 16 bit fixed-point maths - not exactly accurate but
++ // easily good enough & avoids floating point (which may be slow)
++ // src_pel > max_pel so n >= 0
++ // Rounding should be such that exact sqrts work and everything else rounds
++ // down
++ unsigned int n = ((src_pel * 4 - 1) / dd->max_pels) - 4; // table index: the src_pel/max_pels ratio in quarter steps
++ unsigned int scale = sqrt_tab[n >= SQRT_MAX ? SQRT_MAX : n]; // clamp to the last entry for very large ratios
++
++ // Rescale width - rounding up to 16
++ unsigned int width = ((src->i_visible_width * scale + (16 << 16) - 1) >> 16) & ~15;
++ // Rescale height based on new width
++ unsigned int height = (src->i_visible_height * width + src->i_visible_width/2) / src->i_visible_width; // rounded to nearest
++
++// fprintf(stderr, "%dx%d -> %dx%d\n", src->i_visible_width, src->i_visible_height, width, height);
++
++ dst->i_width = width; // full and visible sizes are both set to the scaled values
++ dst->i_visible_width = width;
++ dst->i_height = height;
++ dst->i_visible_height = height;
++ return true;
++}
++
++static void unload_display_module(vout_display_t * const x_vout)
++{
++ if (x_vout != NULL) {
++ if (x_vout->module != NULL) {
++ module_unneed(x_vout, x_vout->module);
++ }
++ vlc_object_release(x_vout);
++ }
++}
++
++static void CloseMmalX11(vlc_object_t *object)
++{
++ vout_display_t * const vd = (vout_display_t *)object;
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++
++ msg_Dbg(vd, "<<< %s", __func__);
++
++ if (sys == NULL)
++ return;
++
++ unload_display_module(sys->x_desc.vout);
++
++ unload_display_module(sys->mmal_desc.vout);
++
++ free(sys);
++
++ msg_Dbg(vd, ">>> %s", __func__);
++}
++
++static void mmal_x11_event(vout_display_t * x_vd, int cmd, va_list args)
++{
++ vout_display_t * const vd = x_vd->owner.sys;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s (cmd=%d)", __func__, cmd);
++#endif
++
++ // Do not fall into the display assert if Invalid not supported
++ if (cmd == VOUT_DISPLAY_EVENT_PICTURES_INVALID &&
++ !vd->info.has_pictures_invalid)
++ return;
++
++ vd->owner.event(vd, cmd, args);
++}
++
++static vout_window_t * mmal_x11_window_new(vout_display_t * x_vd, unsigned type)
++{
++ vout_display_t * const vd = x_vd->owner.sys;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s (type=%d)", __func__, type);
++#endif
++ return vd->owner.window_new(vd, type);
++}
++
++static void mmal_x11_window_del(vout_display_t * x_vd, vout_window_t * win)
++{
++ vout_display_t * const vd = x_vd->owner.sys;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++ vd->owner.window_del(vd, win);
++}
++
++
++static int load_display_module(vout_display_t * const vd,
++ display_desc_t * const dd,
++ const char * const cap,
++ const char * const module_name)
++{
++ vout_display_t * const x_vout = vlc_object_create(vd, sizeof(*x_vout));
++
++ dd->vout = NULL;
++ if (!x_vout)
++ return -1;
++
++ x_vout->owner.sys = vd;
++ x_vout->owner.event = mmal_x11_event;
++ x_vout->owner.window_new = mmal_x11_window_new;
++ x_vout->owner.window_del = mmal_x11_window_del;
++
++ x_vout->cfg = vd->cfg;
++ x_vout->info = vd->info;
++ cpy_fmt_limit_size(dd, &x_vout->source, &vd->source);
++ cpy_fmt_limit_size(dd, &x_vout->fmt, &vd->fmt);
++
++ if ((x_vout->module = module_need(x_vout, cap, module_name, true)) == NULL)
++ {
++ msg_Err(vd, "Failed to open Xsplitter:%s module", module_name);
++ goto fail;
++ }
++
++ msg_Dbg(vd, "R/G/B: %08x/%08x/%08x", x_vout->fmt.i_rmask, x_vout->fmt.i_gmask, x_vout->fmt.i_bmask);
++
++ dd->vout = x_vout;
++ return 0;
++
++fail:
++ vlc_object_release(x_vout);
++ return -1;
++}
++
++
++/* Return a pointer over the current picture_pool_t* (mandatory).
++ *
++ * For performance reasons, it is best to provide at least count
++ * pictures but it is not mandatory.
++ * You can return NULL when you cannot/do not want to allocate
++ * pictures.
++ * The vout display module keeps the ownership of the pool and can
++ * destroy it only when closing or on invalid pictures control.
++ */
++static picture_pool_t * mmal_x11_pool(vout_display_t * vd, unsigned count)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout; // Requests are forwarded to the currently selected display
++#if TRACE_ALL
++ char buf0[5], buf1[5]; // One buffer per str_fourcc() call, as at the other call sites, so the 2nd call cannot clobber the 1st (assumes str_fourcc() returns its buf argument)
++ msg_Dbg(vd, "<<< %s (count=%d) %s:%dx%d->%s:%dx%d", __func__, count,
++ str_fourcc(buf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ str_fourcc(buf1, x_vd->fmt.i_chroma),
++ x_vd->fmt.i_width, x_vd->fmt.i_height);
++#endif
++ picture_pool_t * pool = x_vd->pool(x_vd, count);
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s: %p", __func__, pool);
++#endif
++ return pool;
++}
++
++/* Prepare a picture and an optional subpicture for display (optional).
++ *
++ * It is called before the next pf_display call to provide as much
++ * time as possible to prepare the given picture and the subpicture
++ * for display.
++ * You are guaranteed that pf_display will always be called, using
++ * the exact same picture_t and subpicture_t.
++ * You cannot change the pixel content of the picture_t or of the
++ * subpicture_t.
++ */
++static void mmal_x11_prepare(vout_display_t * vd, picture_t * pic, subpicture_t * sub)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++ if (x_vd->prepare)
++ x_vd->prepare(x_vd, pic, sub);
++}
++
++/* Display a picture and an optional subpicture (mandatory).
++ *
++ * The picture and the optional subpicture must be displayed as soon as
++ * possible.
++ * You cannot change the pixel content of the picture_t or of the
++ * subpicture_t.
++ *
++ * This function gives away the ownership of the picture and of the
++ * subpicture, so you must release them as soon as possible.
++ */
++static void mmal_x11_display(vout_display_t * vd, picture_t * pic, subpicture_t * sub)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++
++#if TRACE_ALL
++ const bool is_mmal_pic = hw_mmal_pic_is_mmal(pic);
++ msg_Dbg(vd, "<<< %s: fmt: %dx%d/%dx%d, pic:%dx%d, pts=%lld, mmal=%d/%d", __func__, vd->fmt.i_width, vd->fmt.i_height, x_vd->fmt.i_width, x_vd->fmt.i_height, pic->format.i_width, pic->format.i_height, (long long)pic->date,
++ is_mmal_pic, sys->use_mmal);
++#endif
++
++ if (x_vd->fmt.i_chroma != pic->format.i_chroma ||
++ x_vd->fmt.i_width != pic->format.i_width ||
++ x_vd->fmt.i_height != pic->format.i_height)
++ {
++ msg_Dbg(vd, "%s: Picture dropped", __func__);
++ picture_Release(pic);
++ if (sub != NULL)
++ subpicture_Delete(sub);
++ return;
++ }
++
++ x_vd->display(x_vd, pic, sub);
++}
++
++
++static int vout_display_Control(const display_desc_t * const dd, int query, ...)
++{
++ va_list args;
++ int result;
++
++ va_start(args, query);
++ result = dd->vout->control(dd->vout, query, args);
++ va_end(args);
++
++ return result;
++}
++
++static bool want_mmal_vout(vout_display_t * const vd, const mmal_x11_sys_t * const sys)
++{
++ return sys->mmal_desc.vout != NULL &&
++ (sys->x_desc.vout == NULL || var_InheritBool(vd, "fullscreen"));
++}
++
++static inline int
++up_rv(const int a, const int b)
++{
++ return a != 0 ? a : b;
++}
++
++static int
++reset_pictures(vout_display_t * const vd, const display_desc_t * const desc)
++{
++ int rv = 0;
++ VLC_UNUSED(vd);
++ if (desc->vout)
++ {
++ // If the display doesn't have has_pictures_invalid then it doesn't
++ // expect RESET_PICTURES
++ if (desc->vout->info.has_pictures_invalid)
++ vout_display_Control(desc, VOUT_DISPLAY_RESET_PICTURES);
++ }
++ return rv;
++}
++
++static int
++replay_controls(vout_display_t * const vd, const display_desc_t * const desc, const int32_t changed)
++{
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_DISPLAY_FILLED)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_DISPLAY_FILLED, vd->cfg);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_ZOOM)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_ZOOM, vd->cfg);
++ if ((changed & ((1 << VOUT_DISPLAY_CHANGE_SOURCE_CROP) |
++ (1 << VOUT_DISPLAY_CHANGE_SOURCE_ASPECT))) != 0)
++ cpy_fmt_limit_size(desc, &desc->vout->source, &vd->source);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_SOURCE_ASPECT)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_SOURCE_ASPECT);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_SOURCE_CROP)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_SOURCE_CROP);
++ if ((changed & (1 << VOUT_DISPLAY_CHANGE_VIEWPOINT)) != 0)
++ vout_display_Control(desc, VOUT_DISPLAY_CHANGE_VIEWPOINT, vd->cfg);
++ return 0;
++}
++
++/* Control on the module (mandatory) */
++static int mmal_x11_control(vout_display_t * vd, int ctl, va_list va)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ display_desc_t *x_desc = sys->cur_desc;
++ int rv;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s[%d] (ctl=%d)", __func__, sys->use_mmal, ctl);
++#endif
++ // Remember what we've told this vd - unwanted ctls ignored on replay
++ if (ctl >= 0 && ctl <= 31)
++ sys->changed |= (1 << ctl);
++
++ switch (ctl) {
++ case VOUT_DISPLAY_CHANGE_DISPLAY_SIZE:
++ {
++ const vout_display_cfg_t * const cfg = va_arg(va, const vout_display_cfg_t *);
++ const bool want_mmal = want_mmal_vout(vd, sys);
++ const bool swap_vout = (sys->use_mmal != want_mmal);
++ display_desc_t * const new_desc = want_mmal ? &sys->mmal_desc : &sys->x_desc;
++
++ msg_Dbg(vd, "Change size: %d, %d: mmal_vout=%p, want_mmal=%d, fs=%d",
++ cfg->display.width, cfg->display.height, sys->mmal_desc.vout, want_mmal,
++ var_InheritBool(vd, "fullscreen"));
++
++ // Repeat any control calls that we sent to the previous vd
++ if (swap_vout && sys->changed != 0) {
++ const uint32_t changed = sys->changed;
++ sys->changed = 0;
++ replay_controls(vd, new_desc, changed);
++ }
++
++ if (swap_vout) {
++ if (sys->use_mmal) {
++ vout_display_Control(x_desc, VOUT_DISPLAY_CHANGE_MMAL_HIDE);
++ }
++ vout_display_SendEventPicturesInvalid(vd);
++ }
++
++ rv = vout_display_Control(new_desc, ctl, cfg);
++ if (rv == VLC_SUCCESS) {
++ vd->fmt = new_desc->vout->fmt;
++ sys->cur_desc = new_desc;
++ sys->use_mmal = want_mmal;
++ }
++
++
++ break;
++ }
++
++ case VOUT_DISPLAY_RESET_PICTURES:
++ {
++ char dbuf0[5], dbuf1[5], dbuf2[5];
++ msg_Dbg(vd, "<<< %s: Pic reset: fmt: %s,%dx%d<-%s,%dx%d, source: %s,%dx%d/%dx%d", __func__,
++ str_fourcc(dbuf0, vd->fmt.i_chroma), vd->fmt.i_width, vd->fmt.i_height,
++ str_fourcc(dbuf1, x_desc->vout->fmt.i_chroma), x_desc->vout->fmt.i_width, x_desc->vout->fmt.i_height,
++ str_fourcc(dbuf2, vd->source.i_chroma), vd->source.i_width, vd->source.i_height, x_desc->vout->source.i_width,
++ x_desc->vout->source.i_height);
++ }
++ rv = reset_pictures(vd, &sys->x_desc);
++ rv = up_rv(rv, reset_pictures(vd, &sys->mmal_desc));
++
++ vd->fmt = x_desc->vout->fmt;
++ break;
++
++ case VOUT_DISPLAY_CHANGE_SOURCE_ASPECT:
++ case VOUT_DISPLAY_CHANGE_SOURCE_CROP:
++ cpy_fmt_limit_size(x_desc, &x_desc->vout->source, &vd->source);
++
++ /* FALLTHRU */
++ default:
++ rv = x_desc->vout->control(x_desc->vout, ctl, va);
++// vd->fmt = x_vd->fmt;
++ break;
++ }
++#if TRACE_ALL
++ msg_Dbg(vd, ">>> %s (rv=%d)", __func__, rv);
++#endif
++ return rv;
++}
++
++#define DO_MANAGE 0
++
++#if DO_MANAGE
++/* Manage pending event (optional) */
++static void mmal_x11_manage(vout_display_t * vd)
++{
++ mmal_x11_sys_t * const sys = (mmal_x11_sys_t *)vd->sys;
++ vout_display_t * const x_vd = sys->cur_desc->vout;
++#if TRACE_ALL
++ msg_Dbg(vd, "<<< %s", __func__);
++#endif
++ x_vd->manage(x_vd);
++}
++#endif
++
++static int OpenMmalX11(vlc_object_t *object)
++{
++ vout_display_t * const vd = (vout_display_t *)object;
++ mmal_x11_sys_t * const sys = calloc(1, sizeof(*sys));
++ int ret = VLC_SUCCESS;
++
++ if (sys == NULL) {
++ return VLC_EGENERIC;
++ }
++ vd->sys = (vout_display_sys_t *)sys;
++
++ vd->info = (vout_display_info_t){
++ .is_slow = false,
++ .has_double_click = false,
++ .needs_hide_mouse = false,
++ .has_pictures_invalid = true,
++ .subpicture_chromas = NULL
++ };
++
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, ">>> %s: %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d", __func__,
++ str_fourcc(dbuf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->fmt.i_x_offset, vd->fmt.i_y_offset,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_sar_num, vd->fmt.i_sar_den);
++ }
++
++ sys->x_desc.max_pels = MAX_GL_PELS;
++ sys->mmal_desc.max_pels = MAX_MMAL_PELS;
++
++ if (load_display_module(vd, &sys->x_desc, "vout display", "opengles2") == 0)
++ {
++ msg_Dbg(vd, "Opengles2 output found");
++ }
++ else
++ {
++ sys->x_desc.max_pels = MAX_MMAL_PELS;
++ if (load_display_module(vd, &sys->x_desc, "vout display", "xcb_x11") == 0)
++ msg_Dbg(vd, "X11 XCB output found");
++ }
++
++ if ((load_display_module(vd, &sys->mmal_desc, "vout display", "mmal_vout")) == 0)
++ msg_Dbg(vd, "MMAL output found");
++
++ if (sys->mmal_desc.vout == NULL && sys->x_desc.vout == NULL) {
++ char dbuf0[5], dbuf1[5];
++ msg_Info(vd, "No valid output found for vout (%s/%s)", str_fourcc(dbuf0, vd->fmt.i_chroma), str_fourcc(dbuf1, vd->source.i_chroma));
++ goto fail;
++ }
++
++ vd->pool = mmal_x11_pool;
++ vd->prepare = mmal_x11_prepare;
++ vd->display = mmal_x11_display;
++ vd->control = mmal_x11_control;
++#if DO_MANAGE
++ vd->manage = mmal_x11_manage;
++#endif
++
++ if (want_mmal_vout(vd, sys)) {
++ sys->cur_desc = &sys->mmal_desc;
++ sys->use_mmal = true;
++ }
++ else {
++ sys->cur_desc = &sys->x_desc;
++ sys->use_mmal = false;
++ }
++
++ if (sys->mmal_desc.vout == NULL || sys->x_desc.vout == NULL) {
++ vd->info = sys->cur_desc->vout->info;
++ vd->info.has_pictures_invalid = true; // Should make this unwanted
++ }
++ else {
++ // We have both - construct a combination
++ vd->info = (vout_display_info_t){
++ .is_slow = false,
++ .has_double_click = sys->mmal_desc.vout->info.has_double_click || sys->x_desc.vout->info.has_double_click,
++ .needs_hide_mouse = sys->mmal_desc.vout->info.needs_hide_mouse || sys->x_desc.vout->info.needs_hide_mouse,
++ .has_pictures_invalid = true,
++ };
++ // Construct intersection of subpicture chromas
++ // sys calloced so no need to add the terminating zero
++ if (sys->mmal_desc.vout->info.subpicture_chromas != NULL && sys->x_desc.vout->info.subpicture_chromas != NULL) {
++ unsigned int n = 0;
++ // N^2 - fix if we ever care
++ for (const vlc_fourcc_t * p1 = sys->mmal_desc.vout->info.subpicture_chromas; *p1 != 0 && n != 15; ++p1) {
++ for (const vlc_fourcc_t * p2 = sys->x_desc.vout->info.subpicture_chromas; *p2 != 0; ++p2) {
++ if (*p1 == *p2) {
++ sys->subpicture_chromas[n++] = *p1;
++ break;
++ }
++ }
++ }
++ if (n != 0)
++ vd->info.subpicture_chromas = sys->subpicture_chromas;
++ }
++ }
++ vd->fmt = sys->cur_desc->vout->fmt;
++
++#if TRACE_ALL
++ {
++ char dbuf0[5];
++ msg_Dbg(vd, ">>> %s: (%s) %s,%dx%d [(%d,%d) %d/%d] sar:%d/%d", __func__,
++ module_get_name(sys->cur_desc->vout->module, false),
++ str_fourcc(dbuf0, vd->fmt.i_chroma),
++ vd->fmt.i_width, vd->fmt.i_height,
++ vd->fmt.i_x_offset, vd->fmt.i_y_offset,
++ vd->fmt.i_visible_width, vd->fmt.i_visible_height,
++ vd->fmt.i_sar_num, vd->fmt.i_sar_den);
++ }
++#endif
++ return VLC_SUCCESS;
++
++fail:
++ CloseMmalX11(VLC_OBJECT(vd));
++ return ret == VLC_SUCCESS ? VLC_EGENERIC : ret;
++}
++
++
++
++
++vlc_module_begin()
++ set_shortname(N_("MMAL x11 splitter"))
++ set_description(N_("MMAL x11 splitter for Raspberry Pi"))
++ set_capability("vout display", 300) // Between GLES & GL
++ add_shortcut("mmal_x11")
++ set_category( CAT_VIDEO )
++ set_subcategory( SUBCAT_VIDEO_VOUT )
++ set_callbacks(OpenMmalX11, CloseMmalX11)
++vlc_module_end()
++
+--- a/modules/video_output/opengl/egl.c
++++ b/modules/video_output/opengl/egl.c
+@@ -43,6 +43,8 @@
+ # include "../android/utils.h"
+ #endif
+
++#define REQUIRE_DMA_BUF_IMPORT 1
++
+ typedef struct vlc_gl_sys_t
+ {
+ EGLDisplay display;
+@@ -355,6 +357,14 @@ static int Open (vlc_object_t *obj, cons
+ goto error;
+ }
+
++#if REQUIRE_DMA_BUF_IMPORT
++ if (!CheckToken(ext, "EGL_EXT_image_dma_buf_import"))
++ {
++ msg_Dbg(obj, "No dma_buf_import - fall back to X");
++ goto error;
++ }
++#endif
++
+ const EGLint conf_attr[] = {
+ EGL_RED_SIZE, 5,
+ EGL_GREEN_SIZE, 5,
+--- a/src/input/decoder.c
++++ b/src/input/decoder.c
+@@ -1995,6 +1995,7 @@ void input_DecoderDelete( decoder_t *p_d
+ vlc_mutex_lock( &p_owner->lock );
+ p_owner->b_waiting = false;
+ vlc_cond_signal( &p_owner->wait_request );
++ vlc_mutex_unlock( &p_owner->lock );
+
+ /* If the video output is paused or slow, or if the picture pool size was
+ * under-estimated (e.g. greedy video filter, buggy decoder...), the
+@@ -2005,7 +2006,6 @@ void input_DecoderDelete( decoder_t *p_d
+ * worker threads (if any) and the decoder thread to terminate. */
+ if( p_owner->p_vout != NULL )
+ vout_Cancel( p_owner->p_vout, true );
+- vlc_mutex_unlock( &p_owner->lock );
+
+ vlc_join( p_owner->thread, NULL );
+
+--- a/src/misc/fourcc.c
++++ b/src/misc/fourcc.c
+@@ -755,8 +755,13 @@ static const struct
+ { { VLC_CODEC_VDPAU_VIDEO_420, VLC_CODEC_VDPAU_VIDEO_422,
+ VLC_CODEC_VDPAU_VIDEO_444, VLC_CODEC_VDPAU_OUTPUT },
+ FAKE_FMT() },
+- { { VLC_CODEC_ANDROID_OPAQUE, VLC_CODEC_MMAL_OPAQUE,
+- VLC_CODEC_D3D9_OPAQUE, VLC_CODEC_D3D11_OPAQUE },
++ { { VLC_CODEC_ANDROID_OPAQUE }, FAKE_FMT() },
++ { { VLC_CODEC_MMAL_OPAQUE, VLC_CODEC_MMAL_ZC_SAND30 },
++ FAKE_FMT() },
++ { { VLC_CODEC_MMAL_ZC_I420, VLC_CODEC_MMAL_ZC_SAND8,
++ VLC_CODEC_MMAL_ZC_SAND10, VLC_CODEC_MMAL_ZC_RGB32 },
++ FAKE_FMT() },
++ { { VLC_CODEC_D3D9_OPAQUE, VLC_CODEC_D3D11_OPAQUE },
+ FAKE_FMT() },
+ { { VLC_CODEC_D3D11_OPAQUE_10B, VLC_CODEC_D3D9_OPAQUE_10B },
+ FAKE_FMT() },
+--- a/src/misc/picture.c
++++ b/src/misc/picture.c
+@@ -365,10 +365,30 @@ void picture_CopyProperties( picture_t *
+ p_dst->b_top_field_first = p_src->b_top_field_first;
+ }
+
++static inline bool is_zc_chroma(const vlc_fourcc_t i_chroma)
++{
++ return i_chroma == VLC_CODEC_MMAL_OPAQUE ||
++ i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_RGB32 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND8;
++}
++
+ void picture_CopyPixels( picture_t *p_dst, const picture_t *p_src )
+ {
+- for( int i = 0; i < p_src->i_planes ; i++ )
+- plane_CopyPixels( p_dst->p+i, p_src->p+i );
++ if( is_zc_chroma(p_src->format.i_chroma) )
++ {
++ assert(p_dst->i_planes == 0);
++ p_dst->i_planes = p_src->i_planes;
++ for( int i = 0; i < p_src->i_planes; i++ )
++ p_dst->p[i] = p_src->p[i];
++ }
++ else
++ {
++ for( int i = 0; i < p_src->i_planes; i++ )
++ plane_CopyPixels( p_dst->p+i, p_src->p+i );
++ }
+
+ assert( p_dst->context == NULL );
+
+--- a/src/video_output/video_output.c
++++ b/src/video_output/video_output.c
+@@ -964,6 +964,17 @@ static picture_t *ConvertRGB32AndBlend(v
+ return NULL;
+ }
+
++
++static inline bool is_zc_chroma(const vlc_fourcc_t i_chroma)
++{
++ return i_chroma == VLC_CODEC_MMAL_OPAQUE ||
++ i_chroma == VLC_CODEC_MMAL_ZC_I420 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_RGB32 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND10 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND30 ||
++ i_chroma == VLC_CODEC_MMAL_ZC_SAND8;
++}
++
+ static int ThreadDisplayRenderPicture(vout_thread_t *vout, bool is_forced)
+ {
+ vout_thread_sys_t *sys = vout->p;
+@@ -1098,7 +1109,7 @@ static int ThreadDisplayRenderPicture(vo
+ }
+
+ assert(vout_IsDisplayFiltered(vd) == !sys->display.use_dr);
+- if (sys->display.use_dr && !is_direct) {
++ if (sys->display.use_dr && !is_direct && !is_zc_chroma(todisplay->format.i_chroma)) {
+ picture_t *direct = NULL;
+ if (likely(vout->p->display_pool != NULL))
+ direct = picture_pool_Get(vout->p->display_pool);
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch
new file mode 100644
index 0000000..d8fc7fb
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0005-mmal_exit_fix.patch
@@ -0,0 +1,19 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/bin/vlc.c
++++ b/bin/vlc.c
+@@ -106,7 +106,10 @@ static void vlc_kill (void *data)
+ static void exit_timeout (int signum)
+ {
+ (void) signum;
+- signal (SIGINT, SIG_DFL);
++// This doesn't seem to be strong enough to reliably kill us if we fail to exit
++// in a timely fashion - so upgrade to _exit().
++// signal (SIGINT, SIG_DFL);
++ _exit(0);
+ }
+
+ /*****************************************************************************
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch
new file mode 100644
index 0000000..99fd03e
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0006-mmal_chain.patch
@@ -0,0 +1,19 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/modules/video_chroma/chain.c
++++ b/modules/video_chroma/chain.c
+@@ -280,8 +280,9 @@ static int BuildTransformChain( filter_t
+ return VLC_SUCCESS;
+
+ /* Lets try resize+chroma first, then transform */
+- msg_Dbg( p_filter, "Trying to build chroma+resize" );
+- EsFormatMergeSize( &fmt_mid, &p_filter->fmt_out, &p_filter->fmt_in );
++ msg_Dbg( p_filter, "Trying to build chroma+resize, then transform" );
++ es_format_Copy( &fmt_mid, &p_filter->fmt_out );
++ video_format_TransformTo(&fmt_mid.video, p_filter->fmt_in.video.orientation);
+ i_ret = CreateChain( p_filter, &fmt_mid );
+ es_format_Clean( &fmt_mid );
+ if( i_ret == VLC_SUCCESS )
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch
new file mode 100644
index 0000000..64a2426
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0007-armv6.patch
@@ -0,0 +1,53 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+--- a/modules/hw/mmal/blend_rgba_neon.S
++++ b/modules/hw/mmal/blend_rgba_neon.S
+@@ -1,10 +1,10 @@
+- .syntax unified
+- .arm
+-// .thumb
+- .text
++#include "../../arm_neon/asm.S"
+ .align 16
+ .arch armv7-a
+- .fpu neon-vfpv4
++ .syntax unified
++#if HAVE_AS_FPU_DIRECTIVE
++ .fpu neon-vfpv4
++#endif
+
+ @ blend_rgbx_rgba_neon
+
+--- a/modules/hw/mmal/codec.c
++++ b/modules/hw/mmal/codec.c
+@@ -29,6 +29,7 @@
+ #include <stdatomic.h>
+
+ #include <vlc_common.h>
++#include <vlc_cpu.h>
+ #include <vlc_plugin.h>
+ #include <vlc_codec.h>
+ #include <vlc_filter.h>
+@@ -2311,6 +2312,9 @@ static int OpenBlendMmal(vlc_object_t *o
+ filter_t * const p_filter = (filter_t *)object;
+ const vlc_fourcc_t vfcc_dst = p_filter->fmt_out.video.i_chroma;
+
++ if (!vlc_CPU_ARM_NEON())
++ return VLC_EGENERIC;
++
+ if (!hw_mmal_chroma_is_mmal(vfcc_dst) ||
+ !hw_mmal_vzc_subpic_fmt_valid(&p_filter->fmt_in.video))
+ {
+@@ -2421,6 +2425,9 @@ static int OpenBlendNeon(vlc_object_t *o
+ MMAL_FOURCC_T mfcc_dst = vlc_to_mmal_video_fourcc(&p_filter->fmt_out.video);
+ blend_neon_fn * blend_fn = (blend_neon_fn *)0;
+
++ if (!vlc_CPU_ARM_NEON())
++ return VLC_EGENERIC;
++
+ // Non-alpha RGB only for dest
+ if (vfcc_dst != VLC_CODEC_RGB32)
+ return VLC_EGENERIC;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch
new file mode 100644
index 0000000..3dbd08d
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0008-configure-Disable-incompatible-function-pointer-type.patch
@@ -0,0 +1,26 @@
+From 048e4fdd08ac588feb27b03e3ec1824e24f77d62 Mon Sep 17 00:00:00 2001
+From: Khem Raj <raj.khem@gmail.com>
+Date: Sun, 5 Mar 2023 14:13:25 -0800
+Subject: [PATCH 3/3] configure: Disable incompatible-function-pointer-types
+ warning
+
+Upstream-Status: Pending
+Signed-off-by: Khem Raj <raj.khem@gmail.com>
+---
+ configure.ac | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -105,6 +105,11 @@ AC_SUBST([AM_CFLAGS], [-fcommon])
+ dnl Prevent clang from accepting unknown flags with a mere warning
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CFLAGS])
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CXXFLAGS])
++dnl Stop clang from erroring on function pointer prototype mismatches; vlc seems to rely on them,
++dnl especially in modules/video_filter/deinterlace/algo_yadif.c, in how it switches the 'filter' variable
++dnl between the different functions yadif_filter_line_c_16bit() and yadif_filter_line_c()
++AX_APPEND_COMPILE_FLAGS([-Wno-error=incompatible-function-pointer-types -Wno-error=incompatible-function-pointer-types], [CFLAGS])
++AX_APPEND_COMPILE_FLAGS([-Wno-error=incompatible-function-pointer-types -Wno-error=incompatible-function-pointer-types], [CXXFLAGS])
+
+ dnl
+ dnl Check the operating system
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch
new file mode 100644
index 0000000..c526535
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/0009-demux-dash-include-cstdint-needed-for-uint64_t.patch
@@ -0,0 +1,30 @@
+From 6fca76ebd76bf8fce9b111e31bda64015cdc770f Mon Sep 17 00:00:00 2001
+From: Johannes Kauffmann <johanneskauffmann@hotmail.com>
+Date: Mon, 11 Jul 2022 19:35:57 +0000
+Subject: [PATCH] demux: dash: include cstdint, needed for uint64_t
+
+Fixes #27077.
+
+Upstream-Status: Backport
+
+https://github.com/videolan/vlc/commit/6fca76ebd76bf8fce9b111e31bda64015cdc770f
+
+---
+ modules/demux/dash/mpd/TemplatedUri.hpp | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/modules/demux/dash/mpd/TemplatedUri.hpp b/modules/demux/dash/mpd/TemplatedUri.hpp
+index 1eeb70cbb6..7f7264a9c8 100644
+--- a/modules/demux/dash/mpd/TemplatedUri.hpp
++++ b/modules/demux/dash/mpd/TemplatedUri.hpp
+@@ -21,6 +21,7 @@
+ #ifndef TEMPLATEDURI_HPP
+ #define TEMPLATEDURI_HPP
+
++#include <cstdint>
+ #include <string>
+
+ namespace dash
+--
+2.34.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch
new file mode 100644
index 0000000..e8990fc
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2001-fix-luaL-checkint.patch
@@ -0,0 +1,236 @@
+* luaL_checkint and luaL_optint were deprecated in lua 5.3
+* replacement functions are luaL_checkinteger and luaL_optinteger
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Signed-off-by: Tim Orling <TicoTimo@gmail.com>
+
+--- a/modules/lua/demux.c
++++ b/modules/lua/demux.c
+@@ -52,7 +52,7 @@ struct vlclua_playlist
+ static int vlclua_demux_peek( lua_State *L )
+ {
+ stream_t *s = (stream_t *)vlclua_get_this(L);
+- int n = luaL_checkint( L, 1 );
++ int n = luaL_checkinteger( L, 1 );
+ const uint8_t *p_peek;
+
+ ssize_t val = vlc_stream_Peek(s->p_source, &p_peek, n);
+@@ -66,7 +66,7 @@ static int vlclua_demux_peek( lua_State
+ static int vlclua_demux_read( lua_State *L )
+ {
+ stream_t *s = (stream_t *)vlclua_get_this(L);
+- int n = luaL_checkint( L, 1 );
++ int n = luaL_checkinteger( L, 1 );
+ char *buf = malloc(n);
+
+ if (buf != NULL)
+--- a/modules/lua/libs/net.c
++++ b/modules/lua/libs/net.c
+@@ -179,7 +179,7 @@ static int vlclua_net_listen_tcp( lua_St
+ {
+ vlc_object_t *p_this = vlclua_get_this( L );
+ const char *psz_host = luaL_checkstring( L, 1 );
+- int i_port = luaL_checkint( L, 2 );
++ int i_port = luaL_checkinteger( L, 2 );
+ int *pi_fd = net_ListenTCP( p_this, psz_host, i_port );
+ if( pi_fd == NULL )
+ return luaL_error( L, "Cannot listen on %s:%d", psz_host, i_port );
+@@ -251,7 +251,7 @@ static int vlclua_net_connect_tcp( lua_S
+ {
+ vlc_object_t *p_this = vlclua_get_this( L );
+ const char *psz_host = luaL_checkstring( L, 1 );
+- int i_port = luaL_checkint( L, 2 );
++ int i_port = luaL_checkinteger( L, 2 );
+ int i_fd = net_ConnectTCP( p_this, psz_host, i_port );
+ lua_pushinteger( L, vlclua_fd_map_safe( L, i_fd ) );
+ return 1;
+@@ -259,14 +259,14 @@ static int vlclua_net_connect_tcp( lua_S
+
+ static int vlclua_net_close( lua_State *L )
+ {
+- int i_fd = luaL_checkint( L, 1 );
++ int i_fd = luaL_checkinteger( L, 1 );
+ vlclua_fd_unmap_safe( L, i_fd );
+ return 0;
+ }
+
+ static int vlclua_net_send( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len;
+ const char *psz_buffer = luaL_checklstring( L, 2, &i_len );
+
+@@ -278,7 +278,7 @@ static int vlclua_net_send( lua_State *L
+
+ static int vlclua_net_recv( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len = (size_t)luaL_optinteger( L, 2, 1 );
+ char psz_buffer[i_len];
+
+@@ -312,7 +312,7 @@ static int vlclua_net_poll( lua_State *L
+ lua_pushnil( L );
+ for( int i = 0; lua_next( L, 1 ); i++ )
+ {
+- luafds[i] = luaL_checkint( L, -2 );
++ luafds[i] = luaL_checkinteger( L, -2 );
+ p_fds[i].fd = vlclua_fd_get( L, luafds[i] );
+ p_fds[i].events = luaL_checkinteger( L, -1 );
+ p_fds[i].events &= POLLIN | POLLOUT | POLLPRI;
+@@ -360,7 +360,7 @@ static int vlclua_fd_open( lua_State *L
+ #ifndef _WIN32
+ static int vlclua_fd_write( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len;
+ const char *psz_buffer = luaL_checklstring( L, 2, &i_len );
+
+@@ -371,7 +371,7 @@ static int vlclua_fd_write( lua_State *L
+
+ static int vlclua_fd_read( lua_State *L )
+ {
+- int fd = vlclua_fd_get( L, luaL_checkint( L, 1 ) );
++ int fd = vlclua_fd_get( L, luaL_checkinteger( L, 1 ) );
+ size_t i_len = (size_t)luaL_optinteger( L, 2, 1 );
+ char psz_buffer[i_len];
+
+--- a/modules/lua/libs/osd.c
++++ b/modules/lua/libs/osd.c
+@@ -154,7 +154,7 @@ static int vlc_osd_slider_type_from_stri
+
+ static int vlclua_osd_slider( lua_State *L )
+ {
+- int i_position = luaL_checkint( L, 1 );
++ int i_position = luaL_checkinteger( L, 1 );
+ const char *psz_type = luaL_checkstring( L, 2 );
+ int i_type = vlc_osd_slider_type_from_string( psz_type );
+ int i_chan = (int)luaL_optinteger( L, 3, VOUT_SPU_CHANNEL_OSD );
+@@ -198,7 +198,7 @@ static int vlclua_spu_channel_register(
+
+ static int vlclua_spu_channel_clear( lua_State *L )
+ {
+- int i_chan = luaL_checkint( L, 1 );
++ int i_chan = luaL_checkinteger( L, 1 );
+ input_thread_t *p_input = vlclua_get_input_internal( L );
+ if( !p_input )
+ return luaL_error( L, "Unable to find input." );
+--- a/modules/lua/libs/playlist.c
++++ b/modules/lua/libs/playlist.c
+@@ -69,7 +69,7 @@ static int vlclua_playlist_next( lua_Sta
+
+ static int vlclua_playlist_skip( lua_State * L )
+ {
+- int i_skip = luaL_checkint( L, 1 );
++ int i_skip = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ playlist_Skip( p_playlist, i_skip );
+ return 0;
+@@ -127,7 +127,7 @@ static int vlclua_playlist_random( lua_S
+
+ static int vlclua_playlist_gotoitem( lua_State * L )
+ {
+- int i_id = luaL_checkint( L, 1 );
++ int i_id = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ PL_LOCK;
+ playlist_ViewPlay( p_playlist, NULL,
+@@ -138,7 +138,7 @@ static int vlclua_playlist_gotoitem( lua
+
+ static int vlclua_playlist_delete( lua_State * L )
+ {
+- int i_id = luaL_checkint( L, 1 );
++ int i_id = luaL_checkinteger( L, 1 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+
+ PL_LOCK;
+@@ -152,8 +152,8 @@ static int vlclua_playlist_delete( lua_S
+
+ static int vlclua_playlist_move( lua_State * L )
+ {
+- int i_item = luaL_checkint( L, 1 );
+- int i_target = luaL_checkint( L, 2 );
++ int i_item = luaL_checkinteger( L, 1 );
++ int i_target = luaL_checkinteger( L, 2 );
+ playlist_t *p_playlist = vlclua_get_playlist_internal( L );
+ PL_LOCK;
+ playlist_item_t *p_item = playlist_ItemGetById( p_playlist, i_item );
+--- a/modules/lua/libs/stream.c
++++ b/modules/lua/libs/stream.c
+@@ -123,7 +123,7 @@ static int vlclua_stream_read( lua_State
+ {
+ int i_read;
+ stream_t **pp_stream = (stream_t **)luaL_checkudata( L, 1, "stream" );
+- int n = luaL_checkint( L, 2 );
++ int n = luaL_checkinteger( L, 2 );
+ uint8_t *p_read = malloc( n );
+ if( !p_read ) return vlclua_error( L );
+
+--- a/modules/lua/libs/volume.c
++++ b/modules/lua/libs/volume.c
+@@ -48,7 +48,7 @@
+ static int vlclua_volume_set( lua_State *L )
+ {
+ playlist_t *p_this = vlclua_get_playlist_internal( L );
+- int i_volume = luaL_checkint( L, 1 );
++ int i_volume = luaL_checkinteger( L, 1 );
+ if( i_volume < 0 )
+ i_volume = 0;
+ int i_ret = playlist_VolumeSet( p_this, i_volume/(float)AOUT_VOLUME_DEFAULT );
+--- a/modules/lua/libs/dialog.c
++++ b/modules/lua/libs/dialog.c
+@@ -382,7 +382,7 @@ static int lua_GetDialogUpdate( lua_Stat
+ /* Read entry in the Lua registry */
+ lua_pushlightuserdata( L, (void*) &key_update );
+ lua_gettable( L, LUA_REGISTRYINDEX );
+- return luaL_checkint( L, -1 );
++ return luaL_checkinteger( L, -1 );
+ }
+
+ /** Manually update a dialog
+@@ -573,22 +573,22 @@ static int vlclua_create_widget_inner( l
+
+ /* Set common arguments: col, row, hspan, vspan, width, height */
+ if( lua_isnumber( L, arg ) )
+- p_widget->i_column = luaL_checkint( L, arg );
++ p_widget->i_column = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_row = luaL_checkint( L, arg );
++ p_widget->i_row = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_horiz_span = luaL_checkint( L, arg );
++ p_widget->i_horiz_span = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_vert_span = luaL_checkint( L, arg );
++ p_widget->i_vert_span = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_width = luaL_checkint( L, arg );
++ p_widget->i_width = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+ if( lua_isnumber( L, ++arg ) )
+- p_widget->i_height = luaL_checkint( L, arg );
++ p_widget->i_height = luaL_checkinteger( L, arg );
+ else goto end_of_args;
+
+ end_of_args:
+--- a/modules/lua/libs/io.c
++++ b/modules/lua/libs/io.c
+@@ -139,7 +139,7 @@ static int vlclua_io_file_seek( lua_Stat
+ const char* psz_mode = luaL_optstring( L, 2, NULL );
+ if ( psz_mode != NULL )
+ {
+- long i_offset = luaL_optlong( L, 3, 0 );
++ long i_offset = (long)luaL_optinteger( L, 3, 0 );
+ int i_mode;
+ if ( !strcmp( psz_mode, "set" ) )
+ i_mode = SEEK_SET;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch
new file mode 100644
index 0000000..bfabf21
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/2002-use-vorbisidec.patch
@@ -0,0 +1,33 @@
+From d0a7ba506fd302ad195f79f287b5a5a154ac02a3 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Sun, 4 Dec 2022 16:09:51 -0600
+Subject: [PATCH] tremor provides libvorbisidec, use it instead of libvorbisdec
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+THIS PATCH HAS BEEN REIMPLEMENTED IN ORDER TO APPLY PROPERLY.
+
+Signed-off-by: Tim Orling <TicoTimo@gmail.com>
+---
+ modules/codec/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
+index 3dadf1119..8b6189e92 100644
+--- a/modules/codec/Makefile.am
++++ b/modules/codec/Makefile.am
+@@ -324,7 +324,7 @@ codec_LTLIBRARIES += $(LTLIBdaala)
+ libtremor_plugin_la_SOURCES = codec/vorbis.c
+ libtremor_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DMODULE_NAME_IS_tremor
+ libtremor_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
+-libtremor_plugin_la_LIBADD = -lvorbisdec -logg
++libtremor_plugin_la_LIBADD = -lvorbisidec -logg
+ EXTRA_LTLIBRARIES += libtremor_plugin.la
+ codec_LTLIBRARIES += $(LTLIBtremor)
+
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch
new file mode 100644
index 0000000..d676be3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3001-configure.ac-setup-for-OE-usage.patch
@@ -0,0 +1,124 @@
+From ddc2ea76058466b45a1acf37bed0d794cd3112a3 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 19:04:42 -0600
+Subject: [PATCH] configure.ac: setup for OE usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Need to use userland graphics libraries package files as it's best
+to not assume /opt/vc is where all libs and headers are installed per
+distro. Also, needed to include $BCMHOST_MMAL_LIBS variable as
+AC_CHECK_LIB(bcm_host) fails to find `vc_tv_unregister_callback_full`.
+Adding $BCMHOST_MMAL_LIBS uses all libs inside
+bcm_host.pc, mmal.pc, vcsm.pc, openmaxil.pc files when checking
+for `vc_tv_unregister_callback_full` function.
+
+Changes the linked OpenGL library from GL to GLESv2
+
+Ensure correct package config file is used for:
+* opencv
+* freerdp
+
+Adds Workaround for modules/codec/omxil/omxil_core.h
+ multiple definition of `pf_enable_graphic_buffers'
+ multiple definition of `pf_get_graphic_buffer_usage'
+ multiple definition of `pf_get_hal_format'
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ configure.ac | 34 ++++++++++++++++++++++------------
+ 1 file changed, 22 insertions(+), 12 deletions(-)
+
+diff --git a/configure.ac b/configure.ac
+index a72dca0b6..5b8585a26 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -95,6 +95,13 @@ AS_IF([test -n "${with_binary_version}"],[
+ [Binary specific version])
+ ])
+
++# Workaround for modules/codec/omxil/omxil_core.h
++# multiple definition of `pf_enable_graphic_buffers'
++# multiple definition of `pf_get_graphic_buffer_usage'
++# multiple definition of `pf_get_hal_format'
++AC_SUBST([AM_CXXFLAGS], [-fcommon])
++AC_SUBST([AM_CFLAGS], [-fcommon])
++
+ dnl Prevent clang from accepting unknown flags with a mere warning
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CFLAGS])
+ AX_APPEND_COMPILE_FLAGS([-Werror=unknown-warning-option -Werror=invalid-command-line-argument], [CXXFLAGS])
+@@ -1900,7 +1907,7 @@ PKG_ENABLE_MODULES_VLC([BLURAY], [libbluray], [libbluray >= 0.6.2], (libbluray f
+ dnl
+ dnl OpenCV wrapper and example filters
+ dnl
+-PKG_ENABLE_MODULES_VLC([OPENCV], [opencv_example opencv_wrapper], [opencv > 2.0], (OpenCV (computer vision) filter), [auto])
++PKG_ENABLE_MODULES_VLC([OPENCV], [opencv_example opencv_wrapper], [opencv4 >= 2.0], (OpenCV (computer vision) filter), [auto])
+
+
+ dnl
+@@ -2077,7 +2084,7 @@ PKG_ENABLE_MODULES_VLC([VNC], [vnc], [libvncclient >= 0.9.9], (VNC/rfb client su
+
+ dnl RDP/Remote Desktop access module
+ dnl
+-PKG_ENABLE_MODULES_VLC([FREERDP], [rdp], [freerdp >= 1.0.1], (RDP/Remote Desktop client support) )
++PKG_ENABLE_MODULES_VLC([FREERDP], [rdp], [freerdp2 >= 1.0.1], (RDP/Remote Desktop client support) )
+
+ dnl
+ dnl Real RTSP plugin
+@@ -3089,14 +3096,14 @@ PKG_CHECK_MODULES([GL], [gl], [
+ #ifdef _WIN32
+ # include <GL/glew.h>
+ #endif
+-#include <GL/gl.h>
++#include <GLES2/gl2.h>
+ ]], [
+ [int t0 = GL_TEXTURE0;]])
+ ], [
+ GL_CFLAGS=""
+ have_gl="yes"
+ AS_IF([test "${SYS}" != "mingw32"], [
+- GL_LIBS="-lGL"
++ GL_LIBS="-lGLESv2"
+ ], [
+ GL_LIBS="-lopengl32"
+ ])
+@@ -3483,15 +3490,14 @@ AC_ARG_ENABLE(mmal_avcodec,
+ [Use MMAL enabled avcodec libs (default disable)]))
+ if test "${enable_mmal}" != "no"; then
+ VLC_SAVE_FLAGS
+- LDFLAGS="${LDFLAGS} -L/opt/vc/lib -lvchostif"
+- CPPFLAGS="${CPPFLAGS} -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux"
+- AC_CHECK_HEADERS(interface/mmal/mmal.h,
+- [ AC_CHECK_LIB(bcm_host, vc_tv_unregister_callback_full, [
++ PKG_CHECK_MODULES(BCMHOST_MMAL, [bcm_host mmal vcsm openmaxil egl], [
++ HAVE_MMAL=yes
++ AC_CHECK_HEADERS(interface/mmal/mmal.h,
++ [ AC_CHECK_LIB(bcm_host $BCMHOST_MMAL_LIBS, vc_tv_unregister_callback_full, [
+ have_mmal="yes"
+- VLC_ADD_PLUGIN([mmal])
+- VLC_ADD_LDFLAGS([mmal],[ -L/opt/vc/lib ])
+- VLC_ADD_CFLAGS([mmal],[ -isystem /opt/vc/include -isystem /opt/vc/include/interface/vcos/pthreads -isystem /opt/vc/include/interface/vmcs_host/linux ])
+- VLC_ADD_LIBS([mmal],[ -lbcm_host -lmmal -lmmal_core -lmmal_components -lmmal_util -lvchostif -lvchiq_arm -lvcsm ]) ], [
++ VLC_ADD_PLUGIN([bcm_host mmal vcsm openmaxil egl])
++ VLC_ADD_CFLAGS([bcm_host mmal vcsm openmaxil egl],[$BCMHOST_MMAL_CFLAGS])
++ VLC_ADD_LIBS([bcm_host mmal vcsm openmaxil egl],[$BCMHOST_MMAL_LIBS -lmmal_components]) ], [
+ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find bcm library...]) ],
+ [ AC_MSG_WARN([Cannot find bcm library...]) ])
+@@ -3500,6 +3506,10 @@ if test "${enable_mmal}" != "no"; then
+ ] , [ AS_IF([test "${enable_mmal}" = "yes"],
+ [ AC_MSG_ERROR([Cannot find development headers for mmal...]) ],
+ [ AC_MSG_WARN([Cannot find development headers for mmal...]) ]) ])
++ ],:[
++ AC_MSG_WARN([${BCMHOST_PKG_ERRORS}: userland graphics not available.])
++ HAVE_MMAL=NO
++ ])
+ VLC_RESTORE_FLAGS
+ fi
+ AM_CONDITIONAL([HAVE_MMAL], [test "${have_mmal}" = "yes"])
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch
new file mode 100644
index 0000000..ab72b4f
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch
@@ -0,0 +1,61 @@
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 07 Jan 2022 07:01:47 PM CST
+Subject: [PATCH] Fix EGL macro undeclared and EGLImageKHR
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+raspiberry pi support.
+
+* Fixes compiler issues related to EGL macro constant/enum value type not being defined
+* Updates EGLImage to EGLImageKHR
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+diff --git a/modules/hw/mmal/converter_mmal.c b/modules/hw/mmal/converter_mmal.c
+index f31cb81d8..426af668b 100644
+--- a/modules/hw/mmal/converter_mmal.c
++++ b/modules/hw/mmal/converter_mmal.c
+@@ -28,6 +28,34 @@
+
+ #define TRACE_ALL 0
+
++// Pass Yocto related build errors
++#define EGL_LINUX_DMA_BUF_EXT 0x3270
++#define EGL_LINUX_DRM_FOURCC_EXT 0x3271
++#define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272
++#define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273
++#define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274
++#define EGL_DMA_BUF_PLANE1_FD_EXT 0x3275
++#define EGL_DMA_BUF_PLANE1_OFFSET_EXT 0x3276
++#define EGL_DMA_BUF_PLANE1_PITCH_EXT 0x3277
++#define EGL_DMA_BUF_PLANE2_FD_EXT 0x3278
++#define EGL_DMA_BUF_PLANE2_OFFSET_EXT 0x3279
++#define EGL_DMA_BUF_PLANE2_PITCH_EXT 0x327A
++#define EGL_YUV_COLOR_SPACE_HINT_EXT 0x327B
++#define EGL_SAMPLE_RANGE_HINT_EXT 0x327C
++#define EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT 0x327D
++#define EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT 0x327E
++#define EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT 0x3443
++#define EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT 0x3444
++#define EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT 0x3445
++#define EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT 0x3446
++#define EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT 0x3447
++#define EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT 0x3448
++#define EGL_DMA_BUF_PLANE3_FD_EXT 0x3440
++#define EGL_DMA_BUF_PLANE3_OFFSET_EXT 0x3441
++#define EGL_DMA_BUF_PLANE3_PITCH_EXT 0x3442
++#define EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT 0x3449
++#define EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT 0x344A
++
+ typedef struct mmal_gl_converter_s
+ {
+ EGLint drm_fourcc;
+@@ -199,7 +227,7 @@ static tex_context_t * get_tex_context(const opengl_tex_converter_t * const tc,
+
+ *a = EGL_NONE;
+
+- const EGLImage image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
++ const EGLImageKHR image = tc->gl->egl.createImageKHR(tc->gl, EGL_LINUX_DMA_BUF_EXT, NULL, attribs);
+ if (!image) {
+ msg_Err(tc, "Failed to import fd %d: Err=%#x", fd, tc->vt->GetError());
+ goto fail;
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..a2dba50
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,43 @@
+From 85f6603aca1d174848b42e696a4cff8af57613d6 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 23:38:36 -0600
+Subject: [PATCH] codec: omxil_core replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original VLC and applies patches to enable
+Raspberry Pi support.
+
+Configures omxil_core.c for OE usage as libbcm_host.so
+and libopenmaxil.so are located in a different location.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/codec/omxil/omxil_core.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/modules/codec/omxil/omxil_core.c b/modules/codec/omxil/omxil_core.c
+index 5098f517a..5922d9034 100644
+--- a/modules/codec/omxil/omxil_core.c
++++ b/modules/codec/omxil/omxil_core.c
+@@ -56,7 +56,7 @@ static const char *ppsz_dll_list[] =
+ #if defined(USE_IOMX)
+ "libiomx.so", /* Not used when using IOMX, the lib should already be loaded */
+ #elif defined(RPI_OMX)
+- "/opt/vc/lib/libopenmaxil.so", /* Broadcom IL core */
++ "/usr/lib/libopenmaxil.so", /* Broadcom IL core */
+ #elif 1
+ "libOMX_Core.so", /* TI OMAP IL core */
+ "libOmxCore.so", /* Qualcomm IL core */
+@@ -70,7 +70,7 @@ static const char *ppsz_dll_list[] =
+ #ifdef RPI_OMX
+ static const char *ppsz_extra_dll_list[] =
+ {
+- "/opt/vc/lib/libbcm_host.so", /* Broadcom host library */
++ "/usr/lib/libbcm_host.so", /* Broadcom host library */
+ 0
+ };
+ #endif
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch
new file mode 100644
index 0000000..8016ab3
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3004-use-GLESv2-headers-over-GL-headers.patch
@@ -0,0 +1,60 @@
+From 377a67af6c3f7c38f6f7ba24f042ba1a6cfd3f24 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 00:21:43 -0600
+Subject: [PATCH] use GLESv2 headers over GL headers
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+We utilize GLESv2 during compilation. This patch ensures
+we use the headers for it.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/opengl/converter.h | 12 +++---------
+ modules/visualization/glspectrum.c | 4 +++-
+ 2 files changed, 6 insertions(+), 10 deletions(-)
+
+diff --git a/modules/video_output/opengl/converter.h b/modules/video_output/opengl/converter.h
+index 7000e1f38..a3fe32671 100644
+--- a/modules/video_output/opengl/converter.h
++++ b/modules/video_output/opengl/converter.h
+@@ -41,15 +41,9 @@
+ # include <OpenGLES/ES2/glext.h>
+ # endif
+ #else /* !defined (__APPLE__) */
+-# if defined (USE_OPENGL_ES2)
+-# include <GLES2/gl2.h>
+-# include <GLES2/gl2ext.h>
+-# else
+-# ifdef _WIN32
+-# include <GL/glew.h>
+-# endif
+-# include <GL/gl.h>
+-# endif
++#define USE_OPENGL_ES2
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #define VLCGL_PICTURE_MAX 128
+diff --git a/modules/visualization/glspectrum.c b/modules/visualization/glspectrum.c
+index 06f8d1bdf..470080b1a 100644
+--- a/modules/visualization/glspectrum.c
++++ b/modules/visualization/glspectrum.c
+@@ -37,7 +37,9 @@
+ #ifdef __APPLE__
+ # include <OpenGL/gl.h>
+ #else
+-# include <GL/gl.h>
++#define USE_OPENGL_ES2
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #include <math.h>
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch
new file mode 100644
index 0000000..7cf210b
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3005-modules-remove-glspectrum-usage.patch
@@ -0,0 +1,149 @@
+From 5f1bb5889d838719e381350b25c00ef3a75d0e02 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 01:07:55 -0600
+Subject: [PATCH] modules: remove glspectrum usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+The glspectrum module requires OpenGL
+while we only want to utilize GLESv2.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/Makefile.in | 24 ------------------------
+ modules/visualization/Makefile.am | 10 ----------
+ 2 files changed, 34 deletions(-)
+
+diff --git a/modules/Makefile.in b/modules/Makefile.in
+index bde45db53..c9c4342ad 100644
+--- a/modules/Makefile.in
++++ b/modules/Makefile.in
+@@ -481,7 +481,6 @@ TESTS = hpack_test$(EXEEXT) hpackenc_test$(EXEEXT) \
+ @HAVE_WIN32_FALSE@am__append_247 = $(X_LIBS) $(X_PRE_LIBS) -lX11
+ @HAVE_DARWIN_FALSE@@HAVE_WIN32_FALSE@am__append_248 = $(X_LIBS) $(X_PRE_LIBS) -lX11
+ @HAVE_EVAS_TRUE@am__append_249 = libevas_plugin.la
+-@HAVE_GL_TRUE@am__append_250 = libglspectrum_plugin.la
+ @ENABLE_SOUT_TRUE@@HAVE_GCRYPT_TRUE@am__append_251 = libaccess_output_livehttp_plugin.la
+ @ENABLE_SOUT_TRUE@am__append_252 = libaccess_output_shout_plugin.la \
+ @ENABLE_SOUT_TRUE@ libaccess_output_srt_plugin.la \
+@@ -2028,13 +2027,7 @@ libgles2_plugin_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+ $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+ $(libgles2_plugin_la_CFLAGS) $(CFLAGS) \
+ $(libgles2_plugin_la_LDFLAGS) $(LDFLAGS) -o $@
+-libglspectrum_plugin_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
+ $(am__DEPENDENCIES_1)
+-am_libglspectrum_plugin_la_OBJECTS = visualization/glspectrum.lo \
+- visualization/visual/fft.lo visualization/visual/window.lo
+-libglspectrum_plugin_la_OBJECTS = \
+- $(am_libglspectrum_plugin_la_OBJECTS)
+-@HAVE_GL_TRUE@am_libglspectrum_plugin_la_rpath = -rpath $(visudir)
+ libglwin32_plugin_la_DEPENDENCIES = libchroma_copy.la \
+ $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_5)
+ am__objects_23 = \
+@@ -6507,7 +6500,6 @@ am__depfiles_remade = \
+ video_splitter/$(DEPDIR)/clone.Plo \
+ video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo \
+ video_splitter/$(DEPDIR)/wall.Plo \
+- visualization/$(DEPDIR)/glspectrum.Plo \
+ visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo \
+ visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo \
+ visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo \
+@@ -6731,7 +6723,6 @@ SOURCES = $(liba52_plugin_la_SOURCES) $(libaa_plugin_la_SOURCES) \
+ $(libglconv_vaapi_x11_plugin_la_SOURCES) \
+ $(libglconv_vdpau_plugin_la_SOURCES) \
+ $(libgles2_plugin_la_SOURCES) \
+- $(libglspectrum_plugin_la_SOURCES) \
+ $(libglwin32_plugin_la_SOURCES) $(libglx_plugin_la_SOURCES) \
+ $(libgme_plugin_la_SOURCES) $(libgnutls_plugin_la_SOURCES) \
+ $(libgoom_plugin_la_SOURCES) $(libgradfun_plugin_la_SOURCES) \
+@@ -7130,7 +7121,6 @@ DIST_SOURCES = $(liba52_plugin_la_SOURCES) $(libaa_plugin_la_SOURCES) \
+ $(libglconv_vaapi_x11_plugin_la_SOURCES) \
+ $(libglconv_vdpau_plugin_la_SOURCES) \
+ $(libgles2_plugin_la_SOURCES) \
+- $(libglspectrum_plugin_la_SOURCES) \
+ $(libglwin32_plugin_la_SOURCES) $(libglx_plugin_la_SOURCES) \
+ $(libgme_plugin_la_SOURCES) $(libgnutls_plugin_la_SOURCES) \
+ $(libgoom_plugin_la_SOURCES) $(libgradfun_plugin_la_SOURCES) \
+@@ -12696,13 +12686,6 @@ libevent_thread_la_LDFLAGS = -static
+ visudir = $(pluginsdir)/visualization
+ visu_LTLIBRARIES = $(am__append_250) $(LTLIBgoom) $(LTLIBprojectm) \
+ libvisual_plugin.la $(LTLIBvsxu)
+-libglspectrum_plugin_la_SOURCES = \
+- visualization/glspectrum.c \
+- visualization/visual/fft.c visualization/visual/fft.h \
+- visualization/visual/window.c visualization/visual/window.h \
+- visualization/visual/window_presets.h
+-
+-libglspectrum_plugin_la_LIBADD = $(GL_LIBS) $(LIBM)
+ libgoom_plugin_la_SOURCES = visualization/goom.c
+ libgoom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(GOOM_CFLAGS)
+ libgoom_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(visudir)'
+@@ -15715,8 +15698,6 @@ visualization/$(am__dirstamp):
+ visualization/$(DEPDIR)/$(am__dirstamp):
+ @$(MKDIR_P) visualization/$(DEPDIR)
+ @: > visualization/$(DEPDIR)/$(am__dirstamp)
+-visualization/glspectrum.lo: visualization/$(am__dirstamp) \
+- visualization/$(DEPDIR)/$(am__dirstamp)
+ visualization/visual/$(am__dirstamp):
+ @$(MKDIR_P) visualization/visual
+ @: > visualization/visual/$(am__dirstamp)
+@@ -15728,8 +15709,6 @@ visualization/visual/fft.lo: visualization/visual/$(am__dirstamp) \
+ visualization/visual/window.lo: visualization/visual/$(am__dirstamp) \
+ visualization/visual/$(DEPDIR)/$(am__dirstamp)
+
+-libglspectrum_plugin.la: $(libglspectrum_plugin_la_OBJECTS) $(libglspectrum_plugin_la_DEPENDENCIES) $(EXTRA_libglspectrum_plugin_la_DEPENDENCIES)
+- $(AM_V_CCLD)$(LINK) $(am_libglspectrum_plugin_la_rpath) $(libglspectrum_plugin_la_OBJECTS) $(libglspectrum_plugin_la_LIBADD) $(LIBS)
+ video_output/opengl/libglwin32_plugin_la-vout_helper.lo: \
+ video_output/opengl/$(am__dirstamp) \
+ video_output/opengl/$(DEPDIR)/$(am__dirstamp)
+@@ -21420,7 +21399,6 @@ distclean-compile:
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/clone.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@video_splitter/$(DEPDIR)/wall.Plo@am__quote@ # am--include-marker
+-@AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/glspectrum.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo@am__quote@ # am--include-marker
+ @AMDEP_TRUE@@am__include@ @am__quote@visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo@am__quote@ # am--include-marker
+@@ -30324,7 +30302,6 @@ distclean: distclean-recursive
+ -rm -f video_splitter/$(DEPDIR)/clone.Plo
+ -rm -f video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo
+ -rm -f video_splitter/$(DEPDIR)/wall.Plo
+- -rm -f visualization/$(DEPDIR)/glspectrum.Plo
+ -rm -f visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo
+ -rm -f visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo
+ -rm -f visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo
+@@ -31722,7 +31699,6 @@ maintainer-clean: maintainer-clean-recursive
+ -rm -f video_splitter/$(DEPDIR)/clone.Plo
+ -rm -f video_splitter/$(DEPDIR)/libpanoramix_plugin_la-panoramix.Plo
+ -rm -f video_splitter/$(DEPDIR)/wall.Plo
+- -rm -f visualization/$(DEPDIR)/glspectrum.Plo
+ -rm -f visualization/$(DEPDIR)/libgoom_plugin_la-goom.Plo
+ -rm -f visualization/$(DEPDIR)/libprojectm_plugin_la-projectm.Plo
+ -rm -f visualization/$(DEPDIR)/libvsxu_plugin_la-vsxu.Plo
+diff --git a/modules/visualization/Makefile.am b/modules/visualization/Makefile.am
+index 10619e030..aafc97f87 100644
+--- a/modules/visualization/Makefile.am
++++ b/modules/visualization/Makefile.am
+@@ -1,16 +1,6 @@
+ visudir = $(pluginsdir)/visualization
+ visu_LTLIBRARIES =
+
+-libglspectrum_plugin_la_SOURCES = \
+- visualization/glspectrum.c \
+- visualization/visual/fft.c visualization/visual/fft.h \
+- visualization/visual/window.c visualization/visual/window.h \
+- visualization/visual/window_presets.h
+-libglspectrum_plugin_la_LIBADD = $(GL_LIBS) $(LIBM)
+-if HAVE_GL
+-visu_LTLIBRARIES += libglspectrum_plugin.la
+-endif
+-
+ libgoom_plugin_la_SOURCES = visualization/goom.c
+ libgoom_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) $(GOOM_CFLAGS)
+ libgoom_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(visudir)'
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch
new file mode 100644
index 0000000..e680c88
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3006-codec-omxil_core.h-fix-multiple-definition-of.patch
@@ -0,0 +1,43 @@
+From fd4d233757cc46cd89f68b45ec4b059940dd84ae Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 19:58:11 -0600
+Subject: [PATCH] codec: omxil_core.h fix multiple definition of
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+Issue occurs during compilation as
+* pf_enable_graphic_buffers
+* pf_get_graphic_buffer_usage
+* pf_get_hal_format
+
+Appear to be defined multiple times as the omxil_core.h
+is included in multiple files.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/codec/omxil/omxil_core.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/modules/codec/omxil/omxil_core.h b/modules/codec/omxil/omxil_core.h
+index ac3db510b..f6e42f5ed 100644
+--- a/modules/codec/omxil/omxil_core.h
++++ b/modules/codec/omxil/omxil_core.h
+@@ -34,9 +34,9 @@ extern OMX_ERRORTYPE (*pf_component_enum)(OMX_STRING, OMX_U32, OMX_U32);
+ extern OMX_ERRORTYPE (*pf_get_roles_of_component)(OMX_STRING, OMX_U32 *, OMX_U8 **);
+
+ /* Extra IOMX android functions. Can be NULL if we don't link with libiomx */
+-OMX_ERRORTYPE (*pf_enable_graphic_buffers)(OMX_HANDLETYPE, OMX_U32, OMX_BOOL);
+-OMX_ERRORTYPE (*pf_get_graphic_buffer_usage)(OMX_HANDLETYPE, OMX_U32, OMX_U32*);
+-OMX_ERRORTYPE (*pf_get_hal_format) (const char *, int *);
++extern OMX_ERRORTYPE (*pf_enable_graphic_buffers)(OMX_HANDLETYPE, OMX_U32, OMX_BOOL);
++extern OMX_ERRORTYPE (*pf_get_graphic_buffer_usage)(OMX_HANDLETYPE, OMX_U32, OMX_U32*);
++extern OMX_ERRORTYPE (*pf_get_hal_format) (const char *, int *);
+
+ int InitOmxCore(vlc_object_t *p_this);
+ void DeinitOmxCore(void);
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch
new file mode 100644
index 0000000..a0487fa
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3007-remove-xorg-related-link-libs.patch
@@ -0,0 +1,36 @@
+From 34e4f4dad923095989ccb0ab8efb883c592bdbfd Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 20:04:27 -0600
+Subject: [PATCH] remove xorg related link libs
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+If x11 isn't defined in DISTRO_FEATURES
+required xorg related libs are not included
+in recipe-sysroot resulting in compilation
+failure.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/hw/mmal/Makefile.am | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/modules/hw/mmal/Makefile.am b/modules/hw/mmal/Makefile.am
+index 4abe68e2e..86dad2c2d 100644
+--- a/modules/hw/mmal/Makefile.am
++++ b/modules/hw/mmal/Makefile.am
+@@ -8,7 +8,7 @@ libmmal_vout_plugin_la_SOURCES = vout.c mmal_cma.c mmal_picture.c subpic.c\
+ mmal_cma.h mmal_picture.h subpic.h transform_ops.h\
+ mmal_piccpy_neon.S
+ libmmal_vout_plugin_la_CFLAGS = $(AM_CFLAGS)
+-libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm -lX11 -lXrandr
++libmmal_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -lm
+ libmmal_vout_plugin_la_LIBADD = $(LIBS_mmal)
+ mmal_LTLIBRARIES = libmmal_vout_plugin.la
+
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch
new file mode 100644
index 0000000..8806c80
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3008-vo-Makefile.am-exclude-libgl_plugin.patch
@@ -0,0 +1,97 @@
+From 28917a258a4173af0abda0eef7faef5cbf95f123 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 21:28:48 -0600
+Subject: [PATCH] vo: Makefile.am exclude libgl_plugin
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches
+to enable Raspberry Pi support.
+
+In the situation where opengl isn't included in
+DISTRO_FEATURES, we need to exclude the opengl
+vout plugin from being built.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/Makefile.am | 64 --------------------------------
+ 1 file changed, 64 deletions(-)
+
+diff --git a/modules/video_output/Makefile.am b/modules/video_output/Makefile.am
+index 78c06cfc4..14a330e68 100644
+--- a/modules/video_output/Makefile.am
++++ b/modules/video_output/Makefile.am
+@@ -57,70 +57,6 @@ if HAVE_TVOS
+ vout_LTLIBRARIES += libvout_ios_plugin.la libglconv_cvpx_plugin.la
+ endif
+
+-### OpenGL ###
+-libgles2_plugin_la_SOURCES = $(OPENGL_COMMONSOURCES) video_output/opengl/display.c
+-libgles2_plugin_la_CFLAGS = $(AM_CFLAGS) $(GLES2_CFLAGS) -DUSE_OPENGL_ES2 $(OPENGL_COMMONCLFAGS)
+-libgles2_plugin_la_LIBADD = $(GLES2_LIBS) $(LIBM) $(OPENGL_COMMONLIBS)
+-libgles2_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(voutdir)'
+-
+-EXTRA_LTLIBRARIES += libgles2_plugin.la
+-vout_LTLIBRARIES += $(LTLIBgles2)
+-
+-libgl_plugin_la_SOURCES = $(OPENGL_COMMONSOURCES) video_output/opengl/display.c
+-libgl_plugin_la_CFLAGS = $(AM_CFLAGS) $(GL_CFLAGS) $(OPENGL_COMMONCLFAGS)
+-libgl_plugin_la_LIBADD = $(LIBM) $(OPENGL_COMMONLIBS)
+-if HAVE_WIN32
+-libgl_plugin_la_CFLAGS += -DHAVE_GL_CORE_SYMBOLS
+-libgl_plugin_la_LIBADD += $(GL_LIBS)
+-endif
+-
+-libglconv_vaapi_wl_plugin_la_SOURCES = video_output/opengl/converter_vaapi.c \
+- video_output/opengl/converter.h \
+- hw/vaapi/vlc_vaapi.c hw/vaapi/vlc_vaapi.h
+-libglconv_vaapi_wl_plugin_la_CFLAGS = $(AM_CFLAGS) $(GL_CFLAGS) -DHAVE_VA_WL $(LIBVA_WL_CFLAGS)
+-libglconv_vaapi_wl_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_WL_LIBS)
+-
+-libglconv_vaapi_x11_plugin_la_SOURCES = $(libglconv_vaapi_wl_plugin_la_SOURCES)
+-libglconv_vaapi_x11_plugin_la_CFLAGS = $(AM_CFLAGS) -DHAVE_VA_X11
+-libglconv_vaapi_x11_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_X11_LIBS) $(X_LIBS) $(X_PRE_LIBS) -lX11
+-
+-libglconv_vaapi_drm_plugin_la_SOURCES = $(libglconv_vaapi_wl_plugin_la_SOURCES)
+-libglconv_vaapi_drm_plugin_la_CFLAGS = $(AM_CFLAGS) -DHAVE_VA_DRM
+-libglconv_vaapi_drm_plugin_la_LIBADD = $(LIBVA_LIBS) $(LIBVA_EGL_LIBS) \
+- $(LIBVA_DRM_LIBS)
+-
+-libglconv_vdpau_plugin_la_SOURCES = video_output/opengl/converter_vdpau.c \
+- video_output/opengl/converter.h hw/vdpau/vlc_vdpau.h
+-libglconv_vdpau_plugin_la_CFLAGS = $(AM_CFLAGS) $(VDPAU_CFLAGS)
+-libglconv_vdpau_plugin_la_LIBADD = $(LIBDL) libvlc_vdpau.la $(X_LIBS) $(X_PRE_LIBS) -lX11
+-
+-if HAVE_GL
+-vout_LTLIBRARIES += libgl_plugin.la
+-if HAVE_EGL
+-if HAVE_VAAPI
+-if HAVE_WAYLAND_EGL
+-if HAVE_VAAPI_WL
+-vout_LTLIBRARIES += libglconv_vaapi_wl_plugin.la
+-endif
+-endif
+-if HAVE_XCB
+-if HAVE_VAAPI_X11
+-vout_LTLIBRARIES += libglconv_vaapi_x11_plugin.la
+-endif
+-endif
+-if HAVE_VAAPI_DRM
+-vout_LTLIBRARIES += libglconv_vaapi_drm_plugin.la
+-endif
+-endif
+-endif # HAVE_EGL
+-
+-if HAVE_VDPAU
+-vout_LTLIBRARIES += libglconv_vdpau_plugin.la
+-endif
+-endif # HAVE_GL
+-
+ ### XCB ###
+ libvlc_xcb_events_la_SOURCES = \
+ video_output/xcb/events.c video_output/xcb/events.h
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch
new file mode 100644
index 0000000..0f28199
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch
@@ -0,0 +1,59 @@
+From 35276c4b02b9114436108e74727d192f1e21f239 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Fri, 9 Dec 2022 23:31:33 -0600
+Subject: [PATCH] vo: converter_vaapi Fix EGL macro undeclared
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo forks original vlc and applies patches to enable
+Raspberry Pi support.
+
+Fixes compiler issues related to EGL macro constant/enum value type
+not being defined
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ modules/video_output/opengl/converter_vaapi.c | 27 +++++++++++++++++++
+ 1 file changed, 27 insertions(+)
+
+diff --git a/modules/video_output/opengl/converter_vaapi.c b/modules/video_output/opengl/converter_vaapi.c
+index cd842f711..59245fe4c 100644
+--- a/modules/video_output/opengl/converter_vaapi.c
++++ b/modules/video_output/opengl/converter_vaapi.c
+@@ -55,6 +55,33 @@
+
+ #define DRM_FORMAT_MOD_INVALID fourcc_mod_code(NONE, DRM_FORMAT_RESERVED)
+
++#define EGL_LINUX_DMA_BUF_EXT 0x3270
++#define EGL_LINUX_DRM_FOURCC_EXT 0x3271
++#define EGL_DMA_BUF_PLANE0_FD_EXT 0x3272
++#define EGL_DMA_BUF_PLANE0_OFFSET_EXT 0x3273
++#define EGL_DMA_BUF_PLANE0_PITCH_EXT 0x3274
++#define EGL_DMA_BUF_PLANE1_FD_EXT 0x3275
++#define EGL_DMA_BUF_PLANE1_OFFSET_EXT 0x3276
++#define EGL_DMA_BUF_PLANE1_PITCH_EXT 0x3277
++#define EGL_DMA_BUF_PLANE2_FD_EXT 0x3278
++#define EGL_DMA_BUF_PLANE2_OFFSET_EXT 0x3279
++#define EGL_DMA_BUF_PLANE2_PITCH_EXT 0x327A
++#define EGL_YUV_COLOR_SPACE_HINT_EXT 0x327B
++#define EGL_SAMPLE_RANGE_HINT_EXT 0x327C
++#define EGL_YUV_CHROMA_HORIZONTAL_SITING_HINT_EXT 0x327D
++#define EGL_YUV_CHROMA_VERTICAL_SITING_HINT_EXT 0x327E
++#define EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT 0x3443
++#define EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT 0x3444
++#define EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT 0x3445
++#define EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT 0x3446
++#define EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT 0x3447
++#define EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT 0x3448
++#define EGL_DMA_BUF_PLANE3_FD_EXT 0x3440
++#define EGL_DMA_BUF_PLANE3_OFFSET_EXT 0x3441
++#define EGL_DMA_BUF_PLANE3_PITCH_EXT 0x3442
++#define EGL_DMA_BUF_PLANE3_MODIFIER_LO_EXT 0x3449
++#define EGL_DMA_BUF_PLANE3_MODIFIER_HI_EXT 0x344A
++
+ struct priv
+ {
+ struct vlc_vaapi_instance *vainst;
+--
+2.38.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch
new file mode 100644
index 0000000..acfb39a
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/files/3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch
@@ -0,0 +1,59 @@
+From 4caba7560aec54f6d944accd1a8d216e8d9b1d92 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Tue, 14 Nov 2023 20:17:11 -0500
+Subject: [PATCH] po: Fix typos in oc.po for gettext compatibility
+
+Upstream-Status: Inappropriate
+
+Was merged upstream, but the upstream patch couldn't be applied.
+
+https://code.videolan.org/videolan/vlc/-/commit/9d67e20c2edd25251b46d1780a7973b44ac5e5ba
+
+gettext-0.22 became stricter and started to validate format strings. Fix
+the typos.
+
+Bug: https://bugs.gentoo.org/909015
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ po/oc.po | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/po/oc.po b/po/oc.po
+index 86f2ed8a1..ce68c581f 100644
+--- a/po/oc.po
++++ b/po/oc.po
+@@ -5298,18 +5298,18 @@ msgstr "Comanda+"
+ #: src/misc/update.c:482
+ #, c-format
+ msgid "%.1f GiB"
+-msgstr "%.lf Gio"
++msgstr "%.1f Gio"
+
+ #: src/misc/update.c:484
+ #, c-format
+ msgid "%.1f MiB"
+-msgstr "%.lf Mio"
++msgstr "%.1f Mio"
+
+ #: src/misc/update.c:486 modules/gui/macosx/VLCPlaylistInfo.m:138
+ #: modules/gui/macosx/VLCPlaylistInfo.m:140
+ #, c-format
+ msgid "%.1f KiB"
+-msgstr "%.lf Kio"
++msgstr "%.1f Kio"
+
+ #: src/misc/update.c:488
+ #, c-format
+@@ -33071,7 +33071,7 @@ msgstr "Lista del gestionari de mèdias"
+
+ #, fuzzy
+ #~ msgid "%.1f kB"
+-#~ msgstr "%.lf Gio"
++#~ msgstr "%.1f Gio"
+
+ #, fuzzy
+ #~ msgid "Speed"
+--
+2.34.1
+
diff --git a/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb
new file mode 100644
index 0000000..2007201
--- /dev/null
+++ b/dynamic-layers/multimedia-layer/recipes-multimedia/rpidistro-vlc/rpidistro-vlc_3.0.17.bb
@@ -0,0 +1,165 @@
+DESCRIPTION = "Video player and streamer - davinci edition"
+HOMEPAGE = "http://www.videolan.org"
+SECTION = "multimedia"
+
+LICENSE = "GPL-2.0-only"
+LIC_FILES_CHKSUM = "file://COPYING;md5=b234ee4d69f5fce4486a80fdaf4a4263"
+
+SRC_URI = "\
+ git://git@github.com/RPi-Distro/vlc;protocol=https;branch=buster-rpt \
+ file://0001-configure-fix-linking-on-RISC-V-ISA.patch \
+ file://0002-Revert-configure-Require-libmodplug-0.8.9.patch \
+ file://0003-CVE-2022-41325.patch \
+ file://0004-mmal_20.patch \
+ file://0005-mmal_exit_fix.patch \
+ file://0006-mmal_chain.patch \
+ file://0007-armv6.patch \
+ file://0008-configure-Disable-incompatible-function-pointer-type.patch \
+ file://0009-demux-dash-include-cstdint-needed-for-uint64_t.patch \
+ file://2001-fix-luaL-checkint.patch \
+ file://2002-use-vorbisidec.patch \
+ file://3001-configure.ac-setup-for-OE-usage.patch \
+ file://3002-fix-EGL-macro-undeclared-and-EGLImageKHR.patch \
+ file://3003-codec-omxil_core-replace-opt-vc-path-with-usr-lib.patch \
+ file://3004-use-GLESv2-headers-over-GL-headers.patch \
+ file://3005-modules-remove-glspectrum-usage.patch \
+ file://3006-codec-omxil_core.h-fix-multiple-definition-of.patch \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', '', 'file://3007-remove-xorg-related-link-libs.patch', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', '', 'file://3008-vo-Makefile.am-exclude-libgl_plugin.patch', d)} \
+ file://3009-vo-converter_vaapi-Fix-EGL-macro-undeclared.patch \
+ file://3010-po-Fix-typos-in-oc.po-for-gettext-compatibility.patch \
+ "
+
+SRCREV = "b276eb0d7bc3213363e97dbb681ef7c927be6c73"
+
+S = "${WORKDIR}/git"
+
+PROVIDES = "vlc"
+RPROVIDES:${PN} = "${PROVIDES}"
+DEPENDS = "coreutils-native fribidi libtool libgcrypt libgcrypt-native \
+ dbus libxml2 gnutls tremor faad2 ffmpeg flac alsa-lib libidn \
+ jpeg xz libmodplug mpeg2dec libmtp libopus orc libsamplerate0 \
+ avahi libusb1 schroedinger taglib tiff"
+
+inherit autotools gettext pkgconfig mime-xdg
+
+export BUILDCC = "${BUILD_CC} -std=c11"
+EXTRA_OECONF = "\
+ --enable-run-as-root \
+ --enable-xvideo \
+ --disable-lua \
+ --disable-screen \
+ --disable-caca \
+ --enable-vlm \
+ --enable-tremor \
+ --disable-aa \
+ --disable-faad \
+ --enable-dbus \
+ --without-contrib \
+ --without-kde-solid \
+ --enable-realrtsp \
+ --disable-libtar \
+ --enable-avcodec \
+ --disable-css \
+ "
+
+PACKAGECONFIG ?= "\
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'x11', '', d)} \
+ ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'gles2', '', d)} \
+ ${@bb.utils.contains_any('DISTRO_FEATURES', 'x11', 'notify', '', d)} \
+ live555 dv1394 fontconfig fluidsynth freetype png udev \
+ x264 alsa harfbuzz jack neon fribidi dvbpsi a52 v4l2 \
+ "
+
+PACKAGECONFIG[mmal] = "--enable-omxil --enable-omxil-vout --enable-rpi-omxil --enable-mmal --enable-mmal-avcodec,,userland"
+PACKAGECONFIG[x264] = "--enable-x264,--disable-x264,x264"
+PACKAGECONFIG[mad] = "--enable-mad,--disable-mad,libmad"
+PACKAGECONFIG[a52] = "--enable-a52,--disable-a52,liba52"
+PACKAGECONFIG[jack] = "--enable-jack,--disable-jack,jack"
+PACKAGECONFIG[live555] = "--enable-live555 LIVE555_PREFIX=${STAGING_DIR_HOST}${prefix},--disable-live555,live555"
+PACKAGECONFIG[libass] = "--enable-libass,--disable-libass,libass"
+PACKAGECONFIG[postproc] = "--enable-postproc,--disable-postproc,libpostproc"
+PACKAGECONFIG[libva] = "--enable-libva,--disable-libva,libva"
+#PACKAGECONFIG[opencv] = "--enable-opencv,--disable-opencv,opencv"
+PACKAGECONFIG[speex] = "--enable-speex,--disable-speex,speex"
+PACKAGECONFIG[gstreamer] = "--enable-gst-decode,--disable-gst-decode,gstreamer1.0 gstreamer1.0-plugins-base gstreamer1.0-plugins-bad"
+PACKAGECONFIG[vpx] = "--enable-vpx,--disable-vpx, libvpx"
+#PACKAGECONFIG[freerdp] = "--enable-freerdp,--disable-freerdp, freerdp"
+PACKAGECONFIG[dvbpsi] = "--enable-dvbpsi,--disable-dvbpsi, libdvbpsi"
+#PACKAGECONFIG[samba] = "--enable-smbclient,--disable-smbclient, samba"
+PACKAGECONFIG[upnp] = "--enable-upnp,--disable-upnp,libupnp"
+PACKAGECONFIG[dvdnav] = "--enable-dvdnav,--disable-dvdnav,libdvdnav libdvdcss"
+PACKAGECONFIG[sftp] = "--enable-sftp,--disable-sftp,libssh2"
+PACKAGECONFIG[vorbis] = "--enable-vorbis,--disable-vorbis,libvorbis libogg"
+PACKAGECONFIG[ogg] = "--enable-ogg,--disable-ogg,libvorbis libogg"
+PACKAGECONFIG[dc1394] = "--enable-dc1394,--disable-dc1394,libdc1394"
+PACKAGECONFIG[dv1394] = "--enable-dv1394,--disable-dv1394,libraw1394 libavc1394"
+PACKAGECONFIG[svg] = "--enable-svg,--disable-svg,librsvg"
+PACKAGECONFIG[svgdec] = "--enable-svgdec,--disable-svgdec,librsvg cairo"
+PACKAGECONFIG[notify] = "--enable-notify,--disable-notify, libnotify gtk+3"
+PACKAGECONFIG[fontconfig] = "--enable-fontconfig,--disable-fontconfig, fontconfig"
+PACKAGECONFIG[freetype] = "--enable-freetype,--disable-freetype, freetype"
+#PACKAGECONFIG[dvdread] = "--enable-dvdread,--disable-dvdread, libdvdread libdvdcss"
+PACKAGECONFIG[vnc] = "--enable-vnc,--disable-vnc, libvncserver"
+PACKAGECONFIG[x11] = "--with-x --enable-xcb,--without-x --disable-xcb, xcb-util-keysyms libxpm libxinerama"
+PACKAGECONFIG[png] = "--enable-png,--disable-png,libpng"
+#PACKAGECONFIG[vdpau] = "--enable-vdpau,--disable-vdpau,libvdpau"
+#PACKAGECONFIG[wayland] = "--enable-wayland,--disable-wayland,wayland wayland-native"
+PACKAGECONFIG[gles2] = "--enable-gles2,--disable-gles2,virtual/libgles2"
+#PACKAGECONFIG[dca] = "--enable-dca,--disable-dca,libdca"
+PACKAGECONFIG[fribidi] = "--enable-fribidi,,fribidi"
+PACKAGECONFIG[gnutls] = "--enable-gnutls,,gnutls"
+PACKAGECONFIG[fluidsynth] = "--enable-fluidsynth,,fluidsynth"
+PACKAGECONFIG[harfbuzz] = "--enable-harfbuzz,--disable-harfbuzz,harfbuzz"
+PACKAGECONFIG[udev] = "--enable-udev,--disable-udev,udev"
+PACKAGECONFIG[neon] = "--enable-neon,--disable-neon,"
+PACKAGECONFIG[opus] = "--enable-opus,--disable-opus,libopus libogg"
+PACKAGECONFIG[ncurses] = "--enable-ncurses,--disable-ncurses,ncurses"
+PACKAGECONFIG[alsa] = "--enable-alsa,--disable-alsa,alsa-lib"
+PACKAGECONFIG[pulseaudio] = "--enable-pulse,--disable-pulse,pulseaudio"
+PACKAGECONFIG[sdl-image] = "--enable-sdl-image,,libsdl-image"
+PACKAGECONFIG[v4l2] = "--enable-v4l2,,v4l-utils"
+
+TARGET_CFLAGS:append = " -I${STAGING_INCDIR}/drm"
+TARGET_LDFLAGS:append = " ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', '-lGLESv2', '', d)}"
+
+# Ensures the --enable-mmal-avcodec flag is available for usage
+do_configure:prepend() {
+ olddir=`pwd`
+ cd ${S}
+ ./bootstrap
+ cd $olddir
+}
+
+# This recipe packages vlc as a library as well, so qt4 dependencies
+# can be avoided when only the library is installed.
+PACKAGES =+ "libvlc"
+
+LEAD_SONAME_libvlc = "libvlc.so.5"
+FILES:libvlc = "${libdir}/lib*.so.*"
+
+FILES:${PN} += "\
+ ${bindir}/vlc \
+ ${libdir}/vlc \
+ ${datadir}/applications \
+ ${datadir}/vlc \
+ ${datadir}/icons \
+ ${datadir}/metainfo/vlc.appdata.xml \
+ "
+
+FILES:${PN}-dbg += "\
+ ${libdir}/vlc/*/.debug \
+ ${libdir}/vlc/plugins/*/.debug \
+ "
+
+FILES:${PN}-staticdev += "\
+ ${libdir}/vlc/plugins/*/*.a \
+ ${libdir}/vlc/libcompat.a \
+ "
+
+# Only enable it for rpi class of machines
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi = "(.*)"
+
+INSANE_SKIP:${PN} = "dev-so"
diff --git a/dynamic-layers/networking-layer/recipes-support/drbd/drbd_%.bbappend b/dynamic-layers/networking-layer/recipes-support/drbd/drbd_%.bbappend
index 933aaf6..d15f34c 100644
--- a/dynamic-layers/networking-layer/recipes-support/drbd/drbd_%.bbappend
+++ b/dynamic-layers/networking-layer/recipes-support/drbd/drbd_%.bbappend
@@ -1,3 +1,3 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
-COMPATIBLE_MACHINE_rpi = "(null)"
+COMPATIBLE_MACHINE:rpi = "(null)"
diff --git a/dynamic-layers/openembedded-layer/recipes-core/packagegroups/packagegroup-meta-oe.bbappend b/dynamic-layers/openembedded-layer/recipes-core/packagegroups/packagegroup-meta-oe.bbappend
index 25423d0..8d458c8 100644
--- a/dynamic-layers/openembedded-layer/recipes-core/packagegroups/packagegroup-meta-oe.bbappend
+++ b/dynamic-layers/openembedded-layer/recipes-core/packagegroups/packagegroup-meta-oe.bbappend
@@ -1 +1 @@
-RDEPENDS_packagegroup-meta-oe-kernel_remove_rpi = "bpftool"
+RDEPENDS:packagegroup-meta-oe-kernel:remove:rpi = "bpftool"
diff --git a/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb
index 7d3120f..9e1e357 100644
--- a/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-blinka_6.2.2.bb
@@ -3,7 +3,7 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_Blinka"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=660e614bc7efb0697cc793d8a22a55c2"
-SRC_URI = "git://github.com/adafruit/Adafruit_Blinka.git"
+SRC_URI = "git://github.com/adafruit/Adafruit_Blinka.git;branch=main;protocol=https"
SRCREV = "dc688f354fe779c9267c208b99f310af87e79272"
S = "${WORKDIR}/git"
@@ -12,7 +12,7 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-do_install_append() {
+do_install:append() {
# it ships ./bcm283x/pulseio/libgpiod_pulsein which is a prebuilt
# 32bit binary therefore we should make this specific to 32bit rpi machines (based on bcm283x) only
if [ ${@bb.utils.contains('TUNE_FEATURES', 'callconvention-hard', '1', '0', d)} = "0" ]; then
@@ -20,11 +20,14 @@ do_install_append() {
fi
}
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
libgpiod \
python3-adafruit-platformdetect \
python3-adafruit-pureio \
python3-core \
"
-RDEPENDS_${PN}_append_rpi = " rpi-gpio"
+RDEPENDS:${PN}:append:rpi = " rpi-gpio"
+
+COMPATIBLE_HOST:libc-musl:class-target = "null"
+
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb
index c14d6f3..93491d4 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-busdevice_5.0.5.bb
@@ -3,7 +3,7 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_CircuitPython_BusDevice"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=6ec69d6e9e6c85adfb7799d7f8cf044e"
-SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_BusDevice.git"
+SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_BusDevice.git;branch=main;protocol=https"
SRCREV = "1bfe8005293205e2f7b2cc498ab5a946f1133b40"
S = "${WORKDIR}/git"
@@ -12,7 +12,8 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
python3-adafruit-blinka \
python3-core \
"
+COMPATIBLE_HOST:libc-musl:class-target = "null"
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb
index e05e2ab..3233c8f 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motor_3.2.6.bb
@@ -3,7 +3,7 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_CircuitPython_Motor"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=b72678307cc7c10910b5ef460216af07"
-SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_Motor.git"
+SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_Motor.git;branch=main;protocol=https"
SRCREV = "2251bfc0501d0acfb96c0a43f4f2b4c6a10ca14e"
S = "${WORKDIR}/git"
@@ -12,7 +12,8 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
python3-adafruit-blinka \
python3-core \
"
+COMPATIBLE_HOST:libc-musl:class-target = "null"
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb
index f35d48c..39fe76a 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-motorkit_1.6.1.bb
@@ -3,7 +3,7 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_CircuitPython_MotorKit"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=6ad4a8854b39ad474755ef1aea813bac"
-SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_MotorKit.git"
+SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_MotorKit.git;branch=main;protocol=https"
SRCREV = "8c1462b4129b21f6db156d1517abb017bb74b982"
S = "${WORKDIR}/git"
@@ -12,7 +12,7 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
python3-adafruit-blinka \
python3-adafruit-circuitpython-busdevice \
python3-adafruit-circuitpython-motor \
@@ -20,3 +20,4 @@ RDEPENDS_${PN} += " \
python3-adafruit-circuitpython-register \
python3-core \
"
+COMPATIBLE_HOST:libc-musl:class-target = "null"
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb
index 0b65c81..f7f0ff1 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb
+++ b/dynamic-layers/openembedded-layer/recipes-devtools/python/python3-adafruit-circuitpython-pca9685_3.3.4.bb
@@ -3,7 +3,7 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_CircuitPython_PCA9685"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=e7eb6b599fb0cfb06485c64cd4242f62"
-SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_PCA9685.git"
+SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_PCA9685.git;branch=main;protocol=https"
SRCREV = "2780c4102f4c23fbab252aa1198b61ba7e2d1b2c"
S = "${WORKDIR}/git"
@@ -12,9 +12,10 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
python3-adafruit-blinka \
python3-adafruit-circuitpython-busdevice \
python3-adafruit-circuitpython-register \
python3-core \
"
+COMPATIBLE_HOST:libc-musl:class-target = "null"
diff --git a/dynamic-layers/qt5-layer/recipes-qt/qt5/qtbase_%.bbappend b/dynamic-layers/qt5-layer/recipes-qt/qt5/qtbase_%.bbappend
index e48f31d..6d3de3f 100644
--- a/dynamic-layers/qt5-layer/recipes-qt/qt5/qtbase_%.bbappend
+++ b/dynamic-layers/qt5-layer/recipes-qt/qt5/qtbase_%.bbappend
@@ -1,19 +1,19 @@
-PACKAGECONFIG_GL_rpi = "${@bb.utils.contains('DISTRO_FEATURES', 'x11 opengl', 'gl', \
+PACKAGECONFIG_GL:rpi = "${@bb.utils.contains('DISTRO_FEATURES', 'x11 opengl', 'gl', \
bb.utils.contains('DISTRO_FEATURES', 'opengl', 'eglfs gles2', \
'', d), d)}"
-PACKAGECONFIG_GL_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', ' kms', '', d)}"
-PACKAGECONFIG_GL_append_rpi = " gbm"
-PACKAGECONFIG_FONTS_rpi = "fontconfig"
-PACKAGECONFIG_append_rpi = " libinput examples tslib xkbcommon"
-PACKAGECONFIG_remove_rpi = "tests"
+PACKAGECONFIG_GL:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', ' kms', '', d)}"
+PACKAGECONFIG_GL:append:rpi = " gbm"
+PACKAGECONFIG_FONTS:rpi = "fontconfig"
+PACKAGECONFIG:append:rpi = " libinput examples tslib xkbcommon"
+PACKAGECONFIG:remove:rpi = "tests"
-OE_QTBASE_EGLFS_DEVICE_INTEGRATION_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'eglfs_brcm', d)}"
+OE_QTBASE_EGLFS_DEVICE_INTEGRATION:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'eglfs_brcm', d)}"
-do_configure_prepend_rpi() {
+do_configure:prepend:rpi() {
# Add the appropriate EGLFS_DEVICE_INTEGRATION
if [ "${@d.getVar('OE_QTBASE_EGLFS_DEVICE_INTEGRATION')}" != "" ]; then
echo "EGLFS_DEVICE_INTEGRATION = ${OE_QTBASE_EGLFS_DEVICE_INTEGRATION}" >> ${S}/mkspecs/oe-device-extra.pri
fi
}
-RDEPENDS_${PN}_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
-DEPENDS_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
+RDEPENDS:${PN}:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
+DEPENDS:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
diff --git a/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png b/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png
new file mode 100644
index 0000000..7d009bb
--- /dev/null
+++ b/img/LF_17_02_Yocto-Badge-Update_Compatible_Final_Blank.png
Binary files differ
diff --git a/img/balena.png b/img/balena.png
index a872ce9..324c35a 100644
--- a/img/balena.png
+++ b/img/balena.png
Binary files differ
diff --git a/kas-poky-rpi.yml b/kas-poky-rpi.yml
index 3e43e2e..ce59eca 100644
--- a/kas-poky-rpi.yml
+++ b/kas-poky-rpi.yml
@@ -42,10 +42,10 @@ local_conf_header:
reduce_diskspace: |
INHERIT += "rm_work_and_downloads"
standard: |
- CONF_VERSION = "1"
+ CONF_VERSION = "2"
PACKAGE_CLASSES = "package_rpm"
SDKMACHINE = "x86_64"
- USER_CLASSES = "buildstats image-prelink"
+ USER_CLASSES = "buildstats"
PATCHRESOLVE = "noop"
debug-tweaks: |
EXTRA_IMAGE_FEATURES = "debug-tweaks"
@@ -55,7 +55,7 @@ local_conf_header:
STOPTASKS,${DL_DIR},1G,100K \
STOPTASKS,${SSTATE_DIR},1G,100K \
STOPTASKS,/tmp,100M,100K \
- ABORT,${TMPDIR},100M,1K \
- ABORT,${DL_DIR},100M,1K \
- ABORT,${SSTATE_DIR},100M,1K \
- ABORT,/tmp,10M,1K"
+ HALT,${TMPDIR},100M,1K \
+ HALT,${DL_DIR},100M,1K \
+ HALT,${SSTATE_DIR},100M,1K \
+ HALT,/tmp,10M,1K"
diff --git a/lib/oeqa/runtime/cases/parselogs_rpi.py b/lib/oeqa/runtime/cases/parselogs_rpi.py
index 4a94310..49226ae 100644
--- a/lib/oeqa/runtime/cases/parselogs_rpi.py
+++ b/lib/oeqa/runtime/cases/parselogs_rpi.py
@@ -1,12 +1,6 @@
from oeqa.runtime.cases.parselogs import *
rpi_errors = [
- 'bcmgenet fd580000.genet: failed to get enet-eee clock',
- 'bcmgenet fd580000.genet: failed to get enet-wol clock',
- 'bcmgenet fd580000.genet: failed to get enet clock',
- 'bcmgenet fd580000.ethernet: failed to get enet-eee clock',
- 'bcmgenet fd580000.ethernet: failed to get enet-wol clock',
- 'bcmgenet fd580000.ethernet: failed to get enet clock',
]
ignore_errors['raspberrypi4'] = rpi_errors + common_errors
diff --git a/recipes-bsp/bootfiles/rpi-bootfiles.bb b/recipes-bsp/bootfiles/rpi-bootfiles.bb
index f1248ee..b04f24b 100644
--- a/recipes-bsp/bootfiles/rpi-bootfiles.bb
+++ b/recipes-bsp/bootfiles/rpi-bootfiles.bb
@@ -5,7 +5,16 @@ LIC_FILES_CHKSUM = "file://LICENCE.broadcom;md5=c403841ff2837657b2ed8e5bb474ac8d
inherit deploy nopackages
-include recipes-bsp/common/raspberrypi-firmware.inc
+RPIFW_DATE ?= "20240319"
+SRCREV = "9f24f4bc2bdd07ffd158cfbb4bce88a2efc4c1f5"
+SHORTREV = "${@d.getVar("SRCREV", False).__str__()[:7]}"
+RPIFW_SRC_URI ?= "https://api.github.com/repos/raspberrypi/firmware/tarball/9f24f4bc2bdd07ffd158cfbb4bce88a2efc4c1f5;downloadfilename=raspberrypi-firmware-${SHORTREV}.tar.gz"
+RPIFW_S ?= "${WORKDIR}/raspberrypi-firmware-${SHORTREV}"
+
+SRC_URI = "${RPIFW_SRC_URI}"
+SRC_URI[sha256sum] = "4b436f8946b139c6a1202375ef55d4848e3bcd8c1a9cb47000e06d7ecec828f7"
+
+PV = "${RPIFW_DATE}"
INHIBIT_DEFAULT_DEPS = "1"
diff --git a/recipes-bsp/bootfiles/rpi-cmdline.bb b/recipes-bsp/bootfiles/rpi-cmdline.bb
index 3ebd1e6..a22f50d 100644
--- a/recipes-bsp/bootfiles/rpi-cmdline.bb
+++ b/recipes-bsp/bootfiles/rpi-cmdline.bb
@@ -7,13 +7,13 @@ INHIBIT_DEFAULT_DEPS = "1"
inherit deploy nopackages
CMDLINE_DWC_OTG ?= "dwc_otg.lpm_enable=0"
-CMDLINE_ROOTFS ?= "root=/dev/mmcblk0p2 rootfstype=ext4 rootwait"
-CMDLINE_SERIAL ?= "${@oe.utils.conditional("ENABLE_UART", "1", "console=serial0,115200", "", d)}"
+CMDLINE_ROOT_FSTYPE ?= "rootfstype=ext4"
+CMDLINE_ROOT_PARTITION ?= "/dev/mmcblk0p2"
-CMDLINE_CMA ?= "${@oe.utils.conditional("RASPBERRYPI_CAMERA_V2", "1", "cma=64M", "", d)}"
+CMDLINE_ROOTFS ?= "root=${CMDLINE_ROOT_PARTITION} ${CMDLINE_ROOT_FSTYPE} rootwait"
-CMDLINE_CMA ?= "${@oe.utils.conditional("RASPBERRYPI_HD_CAMERA", "1", "cma=64M", "", d)}"
+CMDLINE_SERIAL ?= "${@oe.utils.conditional("ENABLE_UART", "1", "console=serial0,115200", "", d)}"
CMDLINE_PITFT ?= "${@bb.utils.contains("MACHINE_FEATURES", "pitft", "fbcon=map:10 fbcon=font:VGA8x8", "", d)}"
@@ -27,7 +27,27 @@ CMDLINE_LOGO ?= '${@oe.utils.conditional("DISABLE_RPI_BOOT_LOGO", "1", "logo.nol
# to enable kernel debugging.
CMDLINE_DEBUG ?= ""
+# Add a request to isolate processors from the Linux scheduler. ISOLATED_CPUS
+# may have the form of a comma separated list of processor numbers "0,1,3", a
+# range "0-2", a combination of the two "0-1,3", or a single processor. You may
+# not specify ALL processors simultaneously.
+def setup_isolcpus(d):
+ string = ""
+ if d.getVar('ISOLATED_CPUS'):
+ string = 'isolcpus=' + d.getVar('ISOLATED_CPUS')
+ return string
+
+CMDLINE_ISOL_CPUS ?= "${@setup_isolcpus(d)}"
+
+# Add RNDIS capabilities (must be after rootwait)
+# example:
+# CMDLINE_RNDIS = "modules-load=dwc2,g_ether g_ether.host_addr=<some MAC
+# address> g_ether.dev_addr=<some MAC address>"
+# if the MAC addresses are omitted, random values will be used
+CMDLINE_RNDIS ?= ""
+
CMDLINE = " \
+ ${CMDLINE_ISOL_CPUS} \
${CMDLINE_DWC_OTG} \
${CMDLINE_SERIAL} \
${CMDLINE_ROOTFS} \
@@ -36,10 +56,11 @@ CMDLINE = " \
${CMDLINE_LOGO} \
${CMDLINE_PITFT} \
${CMDLINE_DEBUG} \
+ ${CMDLINE_RNDIS} \
"
do_compile() {
- echo "${@' '.join('${CMDLINE}'.split())}" > "${WORKDIR}/cmdline.txt"
+ echo "${@' '.join(d.getVar('CMDLINE').split())}" > "${WORKDIR}/cmdline.txt"
}
do_deploy() {
@@ -49,3 +70,5 @@ do_deploy() {
addtask deploy before do_build after do_install
do_deploy[dirs] += "${DEPLOYDIR}/${BOOTFILES_DIR_NAME}"
+
+PACKAGE_ARCH = "${MACHINE_ARCH}"
diff --git a/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch b/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch
new file mode 100644
index 0000000..c6c51c9
--- /dev/null
+++ b/recipes-bsp/bootfiles/rpi-config/0001-config.txt-reintroduce-start_x.patch
@@ -0,0 +1,55 @@
+From ce27f7e22b2cd7453a425e08780a338a71301961 Mon Sep 17 00:00:00 2001
+From: Leon Anavi <leon.anavi@konsulko.com>
+Date: Mon, 20 Nov 2023 15:19:15 +0200
+Subject: [PATCH] config.txt: reintroduce start_x
+
+Reintroduce configuration "start_x". Based on the experience with
+Yocto/OpenEmbedded layer meta-raspberrypi, it has been observed
+that Raspberry Pi 4B 4GB may fail to enable the camera if
+"start_x=1" is at the end of the file. Therefore, "start_x=1"
+is expected in config.txt template and it has been set to replace
+the original occurrence, which is at the middle of the file.
+Also update revision and date stamp.
+
+GitHub pull request: https://github.com/Evilpaul/RPi-config/pull/8
+
+Upstream-Status: Submitted
+
+Signed-off-by: Leon Anavi <leon.anavi@konsulko.com>
+---
+ config.txt | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/config.txt b/config.txt
+index 1cf7b29..e28ed02 100644
+--- a/config.txt
++++ b/config.txt
+@@ -1,7 +1,7 @@
+ ################################################################################
+ ## Raspberry Pi Configuration Settings
+ ##
+-## Revision 17, 2021/08/15
++## Revision 18, 2023/11/20
+ ##
+ ## Details taken from the eLinux wiki and official Raspberry Pi documentation.
+ ## For up-to-date information please refer to links below.
+@@ -760,6 +760,16 @@
+ ## Camera Settings
+ ################################################################################
+
++## start_x
++## Set to "1" to enable the camera module.
++##
++## Enabling the camera requires gpu_mem option to be specified with a value
++## of at least 128.
++##
++## Default 0
++##
++#start_x=0
++
+ ## disable_camera_led
+ ## Turn off the red camera led when recording video or taking a still
+ ## picture.
+--
+2.39.2
+
diff --git a/recipes-bsp/bootfiles/rpi-config_git.bb b/recipes-bsp/bootfiles/rpi-config_git.bb
index 052206a..b91668f 100644
--- a/recipes-bsp/bootfiles/rpi-config_git.bb
+++ b/recipes-bsp/bootfiles/rpi-config_git.bb
@@ -7,8 +7,9 @@ LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/MIT;md5=0835ade698e0bcf8506ecda
COMPATIBLE_MACHINE = "^rpi$"
-SRCREV = "648ffc470824c43eb0d16c485f4c24816b32cd6f"
-SRC_URI = "git://github.com/Evilpaul/RPi-config.git;protocol=git;branch=master \
+SRCREV = "6ac2d832c6c3b208e2669f50ec1abf2c20cb7ff4"
+SRC_URI = "git://github.com/Evilpaul/RPi-config.git;protocol=https;branch=master \
+ file://0001-config.txt-reintroduce-start_x.patch \
"
S = "${WORKDIR}/git"
@@ -29,6 +30,14 @@ GPIO_IR ?= "18"
GPIO_IR_TX ?= "17"
CAN_OSCILLATOR ?= "16000000"
+CAN0_INTERRUPT_PIN ?= "25"
+CAN1_INTERRUPT_PIN ?= "24"
+
+ENABLE_UART ??= ""
+
+WM8960="${@bb.utils.contains("MACHINE_FEATURES", "wm8960", "1", "0", d)}"
+
+GPIO_SHUTDOWN_PIN ??= ""
inherit deploy nopackages
@@ -103,6 +112,9 @@ do_deploy() {
if [ -n "${HDMI_MODE}" ]; then
sed -i '/#hdmi_mode=/ c\hdmi_mode=${HDMI_MODE}' $CONFIG
fi
+ if [ -n "${HDMI_CVT}" ]; then
+ echo 'hdmi_cvt=${HDMI_CVT}' >> $CONFIG
+ fi
if [ -n "${CONFIG_HDMI_BOOST}" ]; then
sed -i '/#config_hdmi_boost=/ c\config_hdmi_boost=${CONFIG_HDMI_BOOST}' $CONFIG
fi
@@ -169,9 +181,25 @@ do_deploy() {
fi
# UART support
- if [ "${ENABLE_UART}" = "1" ]; then
+ if [ "${ENABLE_UART}" = "1" ] || [ "${ENABLE_UART}" = "0" ]; then
echo "# Enable UART" >>$CONFIG
- echo "enable_uart=1" >>$CONFIG
+ echo "enable_uart=${ENABLE_UART}" >>$CONFIG
+ elif [ -n "${ENABLE_UART}" ]; then
+ bbfatal "Invalid value for ENABLE_UART [${ENABLE_UART}]. The value for ENABLE_UART can be 0 or 1."
+ fi
+
+ # U-Boot requires "enable_uart=1" for various boards to operate correctly
+ # cf https://source.denx.de/u-boot/u-boot/-/blob/v2023.04/arch/arm/mach-bcm283x/Kconfig?ref_type=tags#L65
+ if [ "${RPI_USE_U_BOOT}" = "1" ] && [ "${ENABLE_UART}" != "1" ]; then
+ case "${UBOOT_MACHINE}" in
+ rpi_0_w_defconfig|rpi_3_32b_config|rpi_4_32b_config|rpi_arm64_config)
+ if [ "${ENABLE_UART}" = "0" ]; then
+ bbfatal "Invalid configuration: RPI_USE_U_BOOT requires to enable the UART in config.txt for ${MACHINE}"
+ fi
+ echo "# U-Boot requires UART" >>$CONFIG
+ echo "enable_uart=1" >>$CONFIG
+ ;;
+ esac
fi
# Infrared support
@@ -199,6 +227,12 @@ do_deploy() {
# echo "dtoverlay=imx477" >> $CONFIG
#fi
+ # Enable the imx708 camera sensor overlay when RASPBERRYPI_CAMERA_V3 is "1"
+ if [ "${RASPBERRYPI_CAMERA_V3}" = "1" ]; then
+ echo "# Enable Sony RaspberryPi Camera(imx708)" >> $CONFIG
+ echo "dtoverlay=imx708" >> $CONFIG
+ fi
+
# Waveshare "C" 1024x600 7" Rev2.1 IPS capacitive touch (http://www.waveshare.com/7inch-HDMI-LCD-C.htm)
if [ "${WAVESHARE_1024X600_C_2_1}" = "1" ]; then
echo "# Waveshare \"C\" 1024x600 7\" Rev2.1 IPS capacitive touch screen" >> $CONFIG
@@ -210,7 +244,7 @@ do_deploy() {
fi
# DWC2 USB peripheral support
- if [ "${ENABLE_DWC2_PERIPHERAL}" = "1" ]; then
+ if ([ "${ENABLE_DWC2_PERIPHERAL}" = "1" ] && [ "${ENABLE_DWC2_OTG}" != "1" ]); then
echo "# Enable USB peripheral mode" >> $CONFIG
echo "dtoverlay=dwc2,dr_mode=peripheral" >> $CONFIG
fi
@@ -220,6 +254,12 @@ do_deploy() {
echo "# Enable USB host mode" >> $CONFIG
echo "dtoverlay=dwc2,dr_mode=host" >> $CONFIG
fi
+
+ # DWC2 USB OTG support
+ if ([ "${ENABLE_DWC2_OTG}" = "1" ] && [ "${ENABLE_DWC2_PERIPHERAL}" != "1" ]); then
+ echo "# Enable USB OTG mode" >> $CONFIG
+ echo "dtoverlay=dwc2,dr_mode=otg" >> $CONFIG
+ fi
# AT86RF23X support
if [ "${ENABLE_AT86RF}" = "1" ]; then
@@ -230,12 +270,28 @@ do_deploy() {
# ENABLE DUAL CAN
if [ "${ENABLE_DUAL_CAN}" = "1" ]; then
echo "# Enable DUAL CAN" >>$CONFIG
- echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=25" >>$CONFIG
- echo "dtoverlay=mcp2515-can1,oscillator=${CAN_OSCILLATOR},interrupt=24" >>$CONFIG
+ echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=${CAN0_INTERRUPT_PIN}" >>$CONFIG
+ echo "dtoverlay=mcp2515-can1,oscillator=${CAN_OSCILLATOR},interrupt=${CAN1_INTERRUPT_PIN}" >>$CONFIG
# ENABLE CAN
elif [ "${ENABLE_CAN}" = "1" ]; then
echo "# Enable CAN" >>$CONFIG
- echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=25" >>$CONFIG
+ echo "dtoverlay=mcp2515-can0,oscillator=${CAN_OSCILLATOR},interrupt=${CAN0_INTERRUPT_PIN}" >>$CONFIG
+ fi
+
+
+ if [ "${ENABLE_GPIO_SHUTDOWN}" = "1" ]; then
+ if ([ "${ENABLE_I2C}" = "1" ] || [ "${PITFT}" = "1" ]) && [ -z "${GPIO_SHUTDOWN_PIN}" ]; then
+ # By default GPIO shutdown uses the same pin as the (master) I2C SCL.
+ # If I2C is configured and an alternative pin is not configured for
+ # gpio-shutdown, there is a configuration conflict.
+ bbfatal "I2C and gpio-shutdown are both enabled and using the same pins!"
+ fi
+ echo "# Enable gpio-shutdown" >> $CONFIG
+ if [ -z "${GPIO_SHUTDOWN_PIN}" ]; then
+ echo "dtoverlay=gpio-shutdown" >> $CONFIG
+ else
+ echo "dtoverlay=gpio-shutdown,gpio_pin=${GPIO_SHUTDOWN_PIN}" >> $CONFIG
+ fi
fi
# Append extra config if the user has provided any
@@ -251,9 +307,28 @@ do_deploy() {
;;
esac
fi
+
+ # WM8960 support
+ if [ "${WM8960}" = "1" ]; then
+ echo "# Enable WM8960" >> $CONFIG
+ echo "dtoverlay=wm8960-soundcard" >> $CONFIG
+ fi
+
+ # W1-GPIO - One-Wire Interface
+ if [ "${ENABLE_W1}" = "1" ]; then
+ echo "# Enable One-Wire Interface" >> $CONFIG
+ echo "dtoverlay=w1-gpio" >> $CONFIG
+ fi
+
+ # Strip the "##" lines to reduce the config.txt file size; this
+ # avoids corruption and lets Raspberry Pi 5 boot successfully.
+ # The issue has been reported to related projects:
+ # https://github.com/raspberrypi/firmware/issues/1848
+ # https://github.com/Evilpaul/RPi-config/issues/9
+ sed -i '/^##/d' $CONFIG
}
-do_deploy_append_raspberrypi3-64() {
+do_deploy:append:raspberrypi3-64() {
echo "# have a properly sized image" >> $CONFIG
echo "disable_overscan=1" >> $CONFIG
@@ -261,6 +336,12 @@ do_deploy_append_raspberrypi3-64() {
echo "dtparam=audio=on" >> $CONFIG
}
+do_deploy:append() {
+ if grep -q -E '^.{81,}' ${DEPLOYDIR}/${BOOTFILES_DIR_NAME}/config.txt; then
+ bbwarn "config.txt contains lines longer than 80 characters, this is not supported"
+ fi
+}
+
addtask deploy before do_build after do_install
do_deploy[dirs] += "${DEPLOYDIR}/${BOOTFILES_DIR_NAME}"
diff --git a/recipes-bsp/common/raspberrypi-firmware.inc b/recipes-bsp/common/raspberrypi-firmware.inc
index 6358fba..311da21 100644
--- a/recipes-bsp/common/raspberrypi-firmware.inc
+++ b/recipes-bsp/common/raspberrypi-firmware.inc
@@ -1,9 +1,9 @@
-RPIFW_DATE ?= "20210421"
-SRCREV ?= "2ac4de4eaac5c1d1b25acec4a5e0a9fdb16f0c91"
-RPIFW_SRC_URI ?= "https://github.com/raspberrypi/firmware/archive/${SRCREV}.tar.gz;downloadfilename=raspberrypi-firmware-${SRCREV}.tar.gz"
-RPIFW_S ?= "${WORKDIR}/firmware-${SRCREV}"
+RPIFW_DATE ?= "20230509~buster"
+
+RPIFW_SRC_URI ?= "https://archive.raspberrypi.com/debian/pool/main/r/raspberrypi-firmware/raspberrypi-firmware_1.${RPIFW_DATE}.orig.tar.xz"
+RPIFW_S ?= "${WORKDIR}/raspberrypi-firmware-1.${RPIFW_DATE}"
SRC_URI = "${RPIFW_SRC_URI}"
-SRC_URI[sha256sum] = "c687aa1b5127a8dc0773e8aefb1f009f24bf71ccb4c9e8b40a1d46cbbb7bee0c"
+SRC_URI[sha256sum] = "1d9eb83111826b708f461101766fd2000d45f1c171ad573936d000f623ca8098"
PV = "${RPIFW_DATE}"
diff --git a/recipes-bsp/common/raspberrypi-tools.inc b/recipes-bsp/common/raspberrypi-tools.inc
index 7879c0a..c88e7e4 100644
--- a/recipes-bsp/common/raspberrypi-tools.inc
+++ b/recipes-bsp/common/raspberrypi-tools.inc
@@ -1,9 +1,11 @@
-RPITOOLS_DATE ?= "20201008"
-SRCREV ?= "fc0e73c13865450e95edd046200e42a6e52d8256"
-RPITOOLS_SRC_URI ?= "https://github.com/raspberrypi/tools/archive/${SRCREV}.tar.gz;downloadfilename=raspberrypi-tools-${SRCREV}.tar.gz"
-RPITOOLS_S ?= "${WORKDIR}/tools-${SRCREV}"
+RPITOOLS_DATE ?= "20220711"
+SRCREV ?= "439b6198a9b340de5998dd14a26a0d9d38a6bcac"
+RPITOOLS_SRC_URI ?= "git://github.com/raspberrypi/tools;protocol=https;branch=master"
+RPITOOLS_S ?= "${WORKDIR}/git"
SRC_URI = "${RPITOOLS_SRC_URI}"
-SRC_URI[sha256sum] = "05217b942150830225e8ee04a8f16b8ecc2ffbbe5dd815541b15333f783e805e"
PV = "${RPITOOLS_DATE}"
+
+BB_GIT_SHALLOW = "1"
+
diff --git a/recipes-bsp/formfactor/formfactor_%.bbappend b/recipes-bsp/formfactor/formfactor_%.bbappend
index 72d991c..4fc41d0 100644
--- a/recipes-bsp/formfactor/formfactor_%.bbappend
+++ b/recipes-bsp/formfactor/formfactor_%.bbappend
@@ -1 +1 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
diff --git a/recipes-bsp/gpio-shutdown/files/bind_gpio_shutdown.tab b/recipes-bsp/gpio-shutdown/files/bind_gpio_shutdown.tab
new file mode 100644
index 0000000..00cc485
--- /dev/null
+++ b/recipes-bsp/gpio-shutdown/files/bind_gpio_shutdown.tab
@@ -0,0 +1,2 @@
+# Action on special keypress (Key Power)
+kb::kbrequest:/sbin/shutdown -t1 -a -h -P now
diff --git a/recipes-bsp/gpio-shutdown/files/gpio-shutdown-keymap.sh b/recipes-bsp/gpio-shutdown/files/gpio-shutdown-keymap.sh
new file mode 100644
index 0000000..bae50da
--- /dev/null
+++ b/recipes-bsp/gpio-shutdown/files/gpio-shutdown-keymap.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+##
+# Init hook for gpio-shutdown: when invoked with "start", bind keycode 116
+# to the KeyboardSignal action by loading a one-line keymap entry.
+# Any other argument (or none) is a no-op.
+##
+
+# Pipe the single keymap entry for the gpio keycode into loadkeys.
+bind_shutdown_keycode() {
+    echo "keycode 116 = KeyboardSignal" | loadkeys
+}
+
+if [ "$1" = "start" ]; then
+    bind_shutdown_keycode
+fi
diff --git a/recipes-bsp/gpio-shutdown/gpio-shutdown.bb b/recipes-bsp/gpio-shutdown/gpio-shutdown.bb
new file mode 100644
index 0000000..d690840
--- /dev/null
+++ b/recipes-bsp/gpio-shutdown/gpio-shutdown.bb
@@ -0,0 +1,31 @@
+# Recipe metadata for gpio-shutdown: packages the two local files listed in
+# SRC_URI (an inittab fragment and an init script).
+SUMMARY = "GPIO shutdown bindings for SysV init"
+LICENSE = "GPL-2.0-only"
+LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/GPL-2.0-only;md5=801f80980d171dd6425610833a22dbe6"
+
+# Local sources only; do_install picks both files up from ${WORKDIR}.
+SRC_URI = "file://bind_gpio_shutdown.tab \
+ file://gpio-shutdown-keymap.sh \
+"
+
+# INITSCRIPT_NAME / INITSCRIPT_PARAMS below are the settings read by the
+# update-rc.d class inherited here.
+inherit update-rc.d
+
+INITSCRIPT_NAME = "gpio-shutdown-keymap.sh"
+# Run only once during startup
+INITSCRIPT_PARAMS = "start 99 S ."
+
+# Install the gpio-shutdown integration files, depending on the init system
+# selected in DISTRO_FEATURES:
+#   sysvinit - init script into ${sysconfdir}/init.d and the inittab
+#              fragment into ${sysconfdir}/inittab.d
+#   systemd  - only ${sysconfdir} is created
+#   neither  - nothing is installed and a warning is emitted
+do_install() {
+    # The files are only needed if using SysV init.
+    if ${@bb.utils.contains('DISTRO_FEATURES', 'sysvinit', 'true', 'false', d)}; then
+        install -d ${D}${sysconfdir} \
+                   ${D}${sysconfdir}/inittab.d \
+                   ${D}${sysconfdir}/init.d
+
+        # The init script has to be executable; the inittab fragment is plain
+        # configuration text, so it is installed without execute bits.
+        install -m 0755 ${WORKDIR}/gpio-shutdown-keymap.sh ${D}${sysconfdir}/init.d/
+        install -m 0644 ${WORKDIR}/bind_gpio_shutdown.tab ${D}${sysconfdir}/inittab.d/
+    elif ${@bb.utils.contains('DISTRO_FEATURES', 'systemd', 'true', 'false', d)}; then
+        # Systemd init does not require any configuration.
+        # Note: cannot have an empty branch, hence the redundant dir install.
+        install -d ${D}${sysconfdir}
+    else
+        bbwarn "Not using sysvinit or systemd. The gpio-shutdown may require additional configuration."
+    fi
+}
diff --git a/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb b/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb
new file mode 100644
index 0000000..dd0bfa4
--- /dev/null
+++ b/recipes-bsp/rpi-eeprom/rpi-eeprom_git.bb
@@ -0,0 +1,62 @@
+# Recipe metadata for rpi-eeprom: fetches the upstream git repository at a
+# pinned revision and declares the runtime tools the installed scripts need.
+SUMMARY = "Installation scripts and binaries for the Raspberry Pi 4 EEPROM"
+DESCRIPTION = "This repository contains the rpi4 bootloader and scripts \
+for updating it in the spi eeprom"
+LICENSE = "BSD-3-Clause & Broadcom-RPi"
+LIC_FILES_CHKSUM = "file://LICENSE;md5=f546ed4f47e9d4c1fe954ecc9d3ef4f3"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/rpi-eeprom.git;protocol=https;branch=master \
+"
+
+# NOTE(review): PV carries a "-2712" suffix while do_install copies from
+# firmware-2711/ -- confirm the version string matches the pinned SRCREV.
+SRCREV = "759460850c2cb69e19567947a42fbed996e7bf61"
+PV = "v.2024.01.05-2712"
+
+S = "${WORKDIR}/git"
+
+# Runtime dependencies of the installed package; presumably the tools the
+# rpi-eeprom-* scripts invoke on the target -- verify against the scripts.
+RDEPENDS:${PN} += " \
+ coreutils \
+ python3 \
+ python3-pycryptodomex \
+ openssl \
+ xxd \
+ pciutils \
+"
+
+inherit python3native
+
+# Install the rpi-eeprom tools, the bootloader firmware images and the
+# default update configuration into the image staging directory ${D}.
+do_install() {
+    install -d ${D}${bindir}
+
+    # install executables
+    install -m 0755 ${S}/tools/vl805 ${D}${bindir}
+    install -m 0755 ${S}/rpi-eeprom-update ${D}${bindir}
+    install -m 0755 ${S}/rpi-eeprom-config ${D}${bindir}
+    install -m 0755 ${S}/rpi-eeprom-digest ${D}${bindir}
+
+    # copy firmware files
+    install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader/default
+    install -d ${D}${base_libdir}/firmware/raspberrypi/bootloader/latest
+
+    install -m 644 ${S}/firmware-2711/default/* ${D}${base_libdir}/firmware/raspberrypi/bootloader/default
+    install -m 644 ${S}/firmware-2711/latest/* ${D}${base_libdir}/firmware/raspberrypi/bootloader/latest
+
+    # Release-channel aliases: "critical" points at default/, while
+    # "stable" and "beta" both point at latest/.
+    ln -s default ${D}${base_libdir}/firmware/raspberrypi/bootloader/critical
+    ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader/stable
+    ln -s latest ${D}${base_libdir}/firmware/raspberrypi/bootloader/beta
+
+    # copy default config
+    # An explicit 0644 is required: without -m, install applies its default
+    # executable mode, which is wrong for a plain settings file.
+    install -d ${D}${sysconfdir}/default
+    install -m 0644 ${S}/rpi-eeprom-update-default ${D}${sysconfdir}/default/rpi-eeprom-update
+}
+
+# Ship everything installed under the bootloader firmware directory
+# (default/, latest/ and the channel symlinks) in the main package.
+FILES:${PN} += "${base_libdir}/firmware/raspberrypi/bootloader/*"
+
+# Leave the installed files unstripped and do not split out debug data;
+# presumably because the payload is prebuilt -- confirm.
+INHIBIT_PACKAGE_STRIP = "1"
+INHIBIT_PACKAGE_DEBUG_SPLIT = "1"
+
+# The vl805 tool sources are not available (yet); it ships as a precompiled
+# binary only. The binary is built for ARM, whereas the target machine may be
+# AArch64, so the arch QA check has to be disabled for it; otherwise it
+# cannot be packaged.
+QAPATHTEST[arch] = ""
+
+COMPATIBLE_MACHINE = "raspberrypi4|raspberrypi4-64"
diff --git a/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in b/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
index 627d181..58fd86a 100644
--- a/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
+++ b/recipes-bsp/rpi-u-boot-scr/files/boot.cmd.in
@@ -1,4 +1,4 @@
fdt addr ${fdt_addr} && fdt get value bootargs /chosen bootargs
-fatload mmc 0:1 ${kernel_addr_r} @@KERNEL_IMAGETYPE@@
-if test ! -e mmc 0:1 uboot.env; then saveenv; fi;
+fatload @@BOOT_MEDIA@@ 0:1 ${kernel_addr_r} @@KERNEL_IMAGETYPE@@
+if test ! -e @@BOOT_MEDIA@@ 0:1 uboot.env; then saveenv; fi;
@@KERNEL_BOOTCMD@@ ${kernel_addr_r} - ${fdt_addr}
diff --git a/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb b/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
index 9108f71..1dff808 100644
--- a/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
+++ b/recipes-bsp/rpi-u-boot-scr/rpi-u-boot-scr.bb
@@ -9,9 +9,12 @@ INHIBIT_DEFAULT_DEPS = "1"
SRC_URI = "file://boot.cmd.in"
+BOOT_MEDIA ?= "mmc"
+
do_compile() {
sed -e 's/@@KERNEL_IMAGETYPE@@/${KERNEL_IMAGETYPE}/' \
-e 's/@@KERNEL_BOOTCMD@@/${KERNEL_BOOTCMD}/' \
+ -e 's/@@BOOT_MEDIA@@/${BOOT_MEDIA}/' \
"${WORKDIR}/boot.cmd.in" > "${WORKDIR}/boot.cmd"
mkimage -A ${UBOOT_ARCH} -T script -C none -n "Boot script" -d "${WORKDIR}/boot.cmd" boot.scr
}
diff --git a/recipes-bsp/u-boot/files/0001-dm-core-Move-ofdata_to_platdata-call-earlier.patch b/recipes-bsp/u-boot/files/0001-dm-core-Move-ofdata_to_platdata-call-earlier.patch
deleted file mode 100644
index 996ad10..0000000
--- a/recipes-bsp/u-boot/files/0001-dm-core-Move-ofdata_to_platdata-call-earlier.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-From 336d86ebd146905cf4384912f4f27699b6e37c72 Mon Sep 17 00:00:00 2001
-From: Simon Glass <sjg@chromium.org>
-Date: Sun, 29 Dec 2019 21:19:17 -0700
-Subject: [PATCH] dm: core: Move ofdata_to_platdata() call earlier
-
-This method is supposed to extract platform data from the device tree. It
-should be done before the device itself is probed. Move it earlier in the
-device_probe() function.
-
-Upstream-Status: Backport
-
-Signed-off-by: Simon Glass <sjg@chromium.org>
----
- drivers/core/device.c | 14 +++++++-------
- 1 file changed, 7 insertions(+), 7 deletions(-)
-
-diff --git a/drivers/core/device.c b/drivers/core/device.c
-index 4e03708..291ff4c 100644
---- a/drivers/core/device.c
-+++ b/drivers/core/device.c
-@@ -375,6 +375,13 @@ int device_probe(struct udevice *dev)
- return 0;
- }
-
-+ if (drv->ofdata_to_platdata &&
-+ (CONFIG_IS_ENABLED(OF_PLATDATA) || dev_has_of_node(dev))) {
-+ ret = drv->ofdata_to_platdata(dev);
-+ if (ret)
-+ goto fail;
-+ }
-+
- seq = uclass_resolve_seq(dev);
- if (seq < 0) {
- ret = seq;
-@@ -411,13 +418,6 @@ int device_probe(struct udevice *dev)
- goto fail;
- }
-
-- if (drv->ofdata_to_platdata &&
-- (CONFIG_IS_ENABLED(OF_PLATDATA) || dev_has_of_node(dev))) {
-- ret = drv->ofdata_to_platdata(dev);
-- if (ret)
-- goto fail;
-- }
--
- /* Only handle devices that have a valid ofnode */
- if (dev_of_valid(dev)) {
- /*
---
-2.7.4
-
diff --git a/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch b/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch
new file mode 100644
index 0000000..9ea8f85
--- /dev/null
+++ b/recipes-bsp/u-boot/files/0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch
@@ -0,0 +1,52 @@
+From: Mauro Salvini <m.salvini@koansoftware.com>
+To: u-boot@lists.denx.de
+Subject: [PATCH] rpi: always set fdt_addr with firmware-provided FDT address
+Date: Wed, 12 May 2021 14:39:45 +0200 [thread overview]
+Message-ID: <20210512123945.25649-1-m.salvini@koansoftware.com> (raw)
+
+Raspberry firmware prepares the FDT blob in memory at an address
+that depends on both the memory size and the blob size [1].
+After commit ade243a211d6 ("rpi: passthrough of the firmware provided FDT
+blob") this FDT is passed to kernel through fdt_addr environment variable,
+handled in set_fdt_addr() function in board file.
+
+When u-boot environment is persistently saved, if a change happens
+in loaded FDT (e.g. for a new overlay applied), firmware produces a FDT
+address different from the saved one, but u-boot still use the saved
+one because set_fdt_addr() function does not overwrite the fdt_addr
+variable. So, for example, if there is a script that uses fdt commands for
+e.g. manipulate the bootargs, boot hangs with error
+
+libfdt fdt_check_header(): FDT_ERR_BADMAGIC
+
+Removing the fdt_addr variable in saved environment allows to boot.
+
+With this patch set_fdt_addr() function always overwrite fdt_addr value.
+
+[1] https://www.raspberrypi.org/forums//viewtopic.php?f=107&t=134018
+
+Signed-off-by: Mauro Salvini <m.salvini@koansoftware.com>
+Cc: C?dric Schieli <cschieli@gmail.com>
+Cc: Matthias Brugger <mbrugger@suse.com>
+---
+Upstream-Status: Pending
+
+ board/raspberrypi/rpi/rpi.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+diff --git a/board/raspberrypi/rpi/rpi.c b/board/raspberrypi/rpi/rpi.c
+index df52a4689f..611013471e 100644
+--- a/board/raspberrypi/rpi/rpi.c
++++ b/board/raspberrypi/rpi/rpi.c
+@@ -318,9 +318,6 @@ static void set_fdtfile(void)
+ */
+ static void set_fdt_addr(void)
+ {
+- if (env_get("fdt_addr"))
+- return;
+-
+ if (fdt_magic(fw_dtb_pointer) != FDT_MAGIC)
+ return;
+
+--
+2.17.1 \ No newline at end of file
diff --git a/recipes-bsp/u-boot/u-boot_%.bbappend b/recipes-bsp/u-boot/u-boot_%.bbappend
index cdfe5bb..78b3e48 100644
--- a/recipes-bsp/u-boot/u-boot_%.bbappend
+++ b/recipes-bsp/u-boot/u-boot_%.bbappend
@@ -1,15 +1,17 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/files:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/files:"
-SRC_URI_append_rpi = " \
+SRC_URI:append:rpi = " \
file://fw_env.config \
"
-# special fix for raspberrypi-cm3
-SRC_URI_append_raspberrypi-cm3 = " file://0001-dm-core-Move-ofdata_to_platdata-call-earlier.patch"
+SRC_URI:append:rpi = " file://0001-rpi-always-set-fdt_addr-with-firmware-provided-FDT-address.patch"
-DEPENDS_append_rpi = " u-boot-default-script"
+DEPENDS:append:rpi = " u-boot-default-script"
-do_install_append_rpi () {
+do_install:append:rpi () {
install -d ${D}${sysconfdir}
install -m 0644 ${WORKDIR}/fw_env.config ${D}${sysconfdir}/fw_env.config
}
+
+# Temporarily exclude Raspberry Pi 5 because U-Boot has not been ported to it yet
+COMPATIBLE_MACHINE:raspberrypi5 = "(-)"
diff --git a/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch b/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
index 3bc02c4..b019743 100644
--- a/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
+++ b/recipes-connectivity/bluez5/bluez5/0001-bcm43xx-Add-bcm43xx-3wire-variant.patch
@@ -1,17 +1,19 @@
-From b4f2b77472aeb967d3a7595e8a965785c7a37c87 Mon Sep 17 00:00:00 2001
+From 8e8321cd597d3d9d342a8a3533ad10751dde5885 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Tue, 16 Feb 2016 16:40:46 +0000
-Subject: [PATCH 1/4] bcm43xx: Add bcm43xx-3wire variant
+Subject: [PATCH] bcm43xx: Add bcm43xx-3wire variant
---
+Upstream-Status: Pending
+
tools/hciattach.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/hciattach.c b/tools/hciattach.c
-index 59a76a7..5861d33 100644
+index 276a4e56e..7d01d8b74 100644
--- a/tools/hciattach.c
+++ b/tools/hciattach.c
-@@ -1144,6 +1144,9 @@ struct uart_t uart[] = {
+@@ -1078,6 +1078,9 @@ struct uart_t uart[] = {
{ "bcm43xx", 0x0000, 0x0000, HCI_UART_H4, 115200, 3000000,
FLOW_CTL, DISABLE_PM, NULL, bcm43xx, NULL },
@@ -21,6 +23,3 @@ index 59a76a7..5861d33 100644
{ "ath3k", 0x0000, 0x0000, HCI_UART_ATH3K, 115200, 115200,
FLOW_CTL, DISABLE_PM, NULL, ath3k_ps, ath3k_pm },
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch b/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
index 5a0a434..6c13490 100644
--- a/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
+++ b/recipes-connectivity/bluez5/bluez5/0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch
@@ -1,17 +1,20 @@
-From e145c9621f976063e5c573db1f2053d906f63427 Mon Sep 17 00:00:00 2001
+From 96e5e5eef04c6c4ae83d4d822a536cfa87605ae2 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Tue, 16 Feb 2016 16:39:09 +0000
-Subject: [PATCH 2/4] bcm43xx: The UART speed must be reset after the firmware download
+Subject: [PATCH] bcm43xx: The UART speed must be reset after the firmware
+ download
---
+Upstream-Status: Pending
+
tools/hciattach_bcm43xx.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/tools/hciattach_bcm43xx.c b/tools/hciattach_bcm43xx.c
-index 81f38cb..0b792e0 100644
+index b89fc1b50..de01a6aea 100644
--- a/tools/hciattach_bcm43xx.c
+++ b/tools/hciattach_bcm43xx.c
-@@ -366,11 +366,8 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
+@@ -350,11 +350,8 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
return -1;
if (bcm43xx_locate_patch(FIRMWARE_DIR, chip_name, fw_path)) {
@@ -24,7 +27,7 @@ index 81f38cb..0b792e0 100644
if (bcm43xx_load_firmware(fd, fw_path))
return -1;
-@@ -380,6 +377,7 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
+@@ -364,6 +361,7 @@ int bcm43xx_init(int fd, int def_speed, int speed, struct termios *ti,
return -1;
}
@@ -32,6 +35,3 @@ index 81f38cb..0b792e0 100644
if (bcm43xx_reset(fd))
return -1;
}
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch b/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
index f9f09eb..1529023 100644
--- a/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
+++ b/recipes-connectivity/bluez5/bluez5/0003-Increase-firmware-load-timeout-to-30s.patch
@@ -1,17 +1,19 @@
-From d41dc2046dd08d8c95197f677e224506f5b39bdd Mon Sep 17 00:00:00 2001
+From 05c3e145b5aa62e7e759932ea99f94d495b651c3 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Wed, 20 Jan 2016 16:00:37 +0000
-Subject: [PATCH 3/4] Increase firmware load timeout to 30s
+Subject: [PATCH] Increase firmware load timeout to 30s
---
+Upstream-Status: Pending
+
tools/hciattach.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/hciattach.c b/tools/hciattach.c
-index 5861d33..4141796 100644
+index 7d01d8b74..465bb17dd 100644
--- a/tools/hciattach.c
+++ b/tools/hciattach.c
-@@ -1293,7 +1293,7 @@ int main(int argc, char *argv[])
+@@ -1227,7 +1227,7 @@ int main(int argc, char *argv[])
{
struct uart_t *u = NULL;
int detach, printpid, raw, opt, i, n, ld, err;
@@ -20,6 +22,3 @@ index 5861d33..4141796 100644
int init_speed = 0;
int send_break = 0;
pid_t pid;
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch b/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch
deleted file mode 100644
index dadce35..0000000
--- a/recipes-connectivity/bluez5/bluez5/0004-Move-the-43xx-firmware-into-lib-firmware.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From 76681284b0ea49852041fdb97a35175089a08781 Mon Sep 17 00:00:00 2001
-From: Phil Elwell <phil@raspberrypi.org>
-Date: Tue, 23 Feb 2016 17:52:29 +0000
-Subject: [PATCH 4/4] Move the 43xx firmware into /lib/firmware
-
----
- tools/hciattach_bcm43xx.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/hciattach_bcm43xx.c b/tools/hciattach_bcm43xx.c
-index 0b792e0..207f668 100644
---- a/tools/hciattach_bcm43xx.c
-+++ b/tools/hciattach_bcm43xx.c
-@@ -43,7 +43,7 @@
- #include "hciattach.h"
-
- #ifndef FIRMWARE_DIR
--#define FIRMWARE_DIR "/etc/firmware"
-+#define FIRMWARE_DIR "/lib/firmware"
- #endif
-
- #define FW_EXT ".hcd"
---
-1.9.1
-
diff --git a/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch b/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch
new file mode 100644
index 0000000..9cf03ed
--- /dev/null
+++ b/recipes-connectivity/bluez5/bluez5/0004-Move-the-hciattach-firmware-into-lib-firmware.patch
@@ -0,0 +1,31 @@
+From 744f894e42d05b1dee917cc221ed3c1815990459 Mon Sep 17 00:00:00 2001
+From: Phil Elwell <phil@raspberrypi.org>
+Date: Tue, 23 Feb 2016 17:52:29 +0000
+Subject: [PATCH] Move the hciattach firmware into /lib/firmware
+
+* FIRMWARE_DIR is now used by all hciattach firmware (not just bcm43xx) since 5.66 with:
+ commit d9253248363b995e44c1f5e393ed1c7aa4ec81ce
+ Author: Marek Vasut <marex@denx.de>
+ Date: Tue Nov 1 12:53:33 2022 +0100
+ Subject: tools: Make hciattach_* firmware path build-time configurable
+
+Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
+---
+Upstream-Status: Pending
+
+ tools/hciattach.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/hciattach.h b/tools/hciattach.h
+index dfa4c1e7a..e88484766 100644
+--- a/tools/hciattach.h
++++ b/tools/hciattach.h
+@@ -41,7 +41,7 @@
+ #define HCI_UART_VND_DETECT 5
+
+ #ifndef FIRMWARE_DIR
+-#define FIRMWARE_DIR "/etc/firmware"
++#define FIRMWARE_DIR "/lib/firmware"
+ #endif
+
+ int read_hci_event(int fd, unsigned char *buf, int size);
diff --git a/recipes-connectivity/bluez5/bluez5_%.bbappend b/recipes-connectivity/bluez5/bluez5_%.bbappend
index d1a07a4..c5d905b 100644
--- a/recipes-connectivity/bluez5/bluez5_%.bbappend
+++ b/recipes-connectivity/bluez5/bluez5_%.bbappend
@@ -1,10 +1,10 @@
-FILESEXTRAPATHS_prepend_rpi := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend:rpi := "${THISDIR}/${PN}:"
-SRC_URI_append_rpi = "\
+SRC_URI:append:rpi = "\
file://0001-bcm43xx-Add-bcm43xx-3wire-variant.patch \
file://0002-bcm43xx-The-UART-speed-must-be-reset-after-the-firmw.patch \
file://0003-Increase-firmware-load-timeout-to-30s.patch \
- file://0004-Move-the-43xx-firmware-into-lib-firmware.patch \
+ file://0004-Move-the-hciattach-firmware-into-lib-firmware.patch \
"
-RDEPENDS_${PN}_append_rpi = " pi-bluetooth"
+RDEPENDS:${PN}:append:rpi = " pi-bluetooth"
diff --git a/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch b/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
index 079377e..8766a77 100644
--- a/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
+++ b/recipes-connectivity/pi-bluetooth/pi-bluetooth/0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch
@@ -3,7 +3,7 @@ From: "Peter A. Bigot" <pab@pabigot.com>
Date: Wed, 14 Nov 2018 09:19:51 -0600
Subject: [PATCH] bthelper: correct path for hciconfig under Yocto
-Upstream-Status: Inapproprate [OE-specific]
+Upstream-Status: Inappropriate [OE-specific]
Signed-off-by: Peter A. Bigot <pab@pabigot.com>
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
diff --git a/recipes-connectivity/pi-bluetooth/pi-bluetooth_0.1.17.bb b/recipes-connectivity/pi-bluetooth/pi-bluetooth_0.1.17.bb
index a1eb97f..1c3daff 100644
--- a/recipes-connectivity/pi-bluetooth/pi-bluetooth_0.1.17.bb
+++ b/recipes-connectivity/pi-bluetooth/pi-bluetooth_0.1.17.bb
@@ -7,19 +7,21 @@ LIC_FILES_CHKSUM = "\
"
SRC_URI = "\
- git://github.com/RPi-Distro/pi-bluetooth \
+ git://github.com/RPi-Distro/pi-bluetooth;branch=master;protocol=https \
file://0001-bthelper-correct-path-for-hciconfig-under-Yocto.patch \
"
SRCREV = "fd4775bf90e037551532fc214a958074830bb80d"
S = "${WORKDIR}/git"
+inherit ${@bb.utils.contains('DISTRO_FEATURES', 'systemd', 'systemd', 'update-rc.d', d)}
# hciuart.service replaces what was brcm43438.service
-inherit systemd
-SYSTEMD_SERVICE_${PN} = "\
+SYSTEMD_SERVICE:${PN} = "\
hciuart.service \
bthelper@.service \
"
+INITSCRIPT_NAME = "btuart"
+INITSCRIPT_PARAMS = "start 18 2 3 4 5 ."
do_install() {
install -d ${D}${sysconfdir}/udev/rules.d
@@ -33,15 +35,23 @@ do_install() {
install -d ${D}${systemd_system_unitdir}
install -m 0644 ${S}/debian/pi-bluetooth.bthelper@.service ${D}${systemd_system_unitdir}/bthelper@.service
install -m 0644 ${S}/debian/pi-bluetooth.hciuart.service ${D}${systemd_system_unitdir}/hciuart.service
+ else
+ install -d ${D}${sysconfdir}/init.d/
+ cat > ${WORKDIR}/btuart.init << EOF
+#!/bin/sh
+/usr/bin/btuart
+EOF
+ install -m 0755 ${WORKDIR}/btuart.init ${D}${sysconfdir}/init.d/btuart
+ sed -i -e 's:TAG+="systemd".*$:RUN+="/usr/bin/bthelper %k":' ${D}${sysconfdir}/udev/rules.d/90-pi-bluetooth.rules
fi
}
-FILES_${PN} = "\
+FILES:${PN} = "\
${bindir} \
${sysconfdir} \
${systemd_unitdir}/system \
"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
udev-rules-rpi \
"
diff --git a/recipes-core/images/rpi-basic-image.bb b/recipes-core/images/rpi-basic-image.bb
deleted file mode 100644
index 77a3d7b..0000000
--- a/recipes-core/images/rpi-basic-image.bb
+++ /dev/null
@@ -1,15 +0,0 @@
-# Base this image on core-image-minimal
-include recipes-core/images/core-image-minimal.bb
-
-# Include modules in rootfs
-IMAGE_INSTALL += " \
- kernel-modules \
- "
-
-SPLASH = "psplash-raspberrypi"
-
-IMAGE_FEATURES += "ssh-server-dropbear splash"
-
-do_image_prepend() {
- bb.warn("The image 'rpi-basic-image' is deprecated, please use 'core-image-base' instead")
-}
diff --git a/recipes-core/images/rpi-hwup-image.bb b/recipes-core/images/rpi-hwup-image.bb
deleted file mode 100644
index 86e9b6d..0000000
--- a/recipes-core/images/rpi-hwup-image.bb
+++ /dev/null
@@ -1,11 +0,0 @@
-# Base this image on core-image-minimal
-include recipes-core/images/core-image-minimal.bb
-
-# Include modules in rootfs
-IMAGE_INSTALL += " \
- kernel-modules \
- "
-
-do_image_prepend() {
- bb.warn("The image 'rpi-hwup-image' is deprecated, please use 'core-image-base' instead")
-}
diff --git a/recipes-core/images/rpi-test-image.bb b/recipes-core/images/rpi-test-image.bb
index c2f5f73..360b20d 100644
--- a/recipes-core/images/rpi-test-image.bb
+++ b/recipes-core/images/rpi-test-image.bb
@@ -3,4 +3,4 @@ include recipes-core/images/core-image-base.bb
COMPATIBLE_MACHINE = "^rpi$"
-IMAGE_INSTALL_append = " packagegroup-rpi-test"
+IMAGE_INSTALL:append = " packagegroup-rpi-test"
diff --git a/recipes-core/packagegroups/packagegroup-core-tools-testapps.bbappend b/recipes-core/packagegroups/packagegroup-core-tools-testapps.bbappend
index 500d871..1cc1dfc 100644
--- a/recipes-core/packagegroups/packagegroup-core-tools-testapps.bbappend
+++ b/recipes-core/packagegroups/packagegroup-core-tools-testapps.bbappend
@@ -1,2 +1,2 @@
# mesa-demos needs gles1 and userland driver does not have it, works ok with vc4 graphics driver
-X11GLTOOLS_remove_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mesa-demos', d)}"
+X11GLTOOLS:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mesa-demos', d)}"
diff --git a/recipes-core/packagegroups/packagegroup-rpi-test.bb b/recipes-core/packagegroups/packagegroup-rpi-test.bb
index a3f4ac9..ff8f8a4 100644
--- a/recipes-core/packagegroups/packagegroup-rpi-test.bb
+++ b/recipes-core/packagegroups/packagegroup-rpi-test.bb
@@ -10,12 +10,16 @@ COMPATIBLE_MACHINE = "^rpi$"
OMXPLAYER = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'omxplayer', d)}"
-RDEPENDS_${PN} = "\
+RDEPENDS:${PN} = "\
${OMXPLAYER} \
bcm2835-tests \
+ raspi-gpio \
rpio \
rpi-gpio \
pi-blaster \
+ python3-adafruit-circuitpython-register \
+ python3-adafruit-platformdetect \
+ python3-adafruit-pureio \
python3-rtimu \
connman \
connman-client \
@@ -23,7 +27,7 @@ RDEPENDS_${PN} = "\
bluez5 \
"
-RRECOMMENDS_${PN} = "\
+RRECOMMENDS:${PN} = "\
${@bb.utils.contains("BBFILE_COLLECTIONS", "meta-multimedia", "bigbuckbunny-1080p bigbuckbunny-480p bigbuckbunny-720p", "", d)} \
${MACHINE_EXTRA_RRECOMMENDS} \
"
diff --git a/recipes-core/psplash/files/framebuf.conf b/recipes-core/psplash/files/framebuf.conf
new file mode 100644
index 0000000..44e1ded
--- /dev/null
+++ b/recipes-core/psplash/files/framebuf.conf
@@ -0,0 +1,4 @@
+[Unit]
+Requires=sys-devices-platform-gpu-graphics-fb0.device
+After=sys-devices-platform-gpu-graphics-fb0.device
+
diff --git a/recipes-core/psplash/psplash_%.bbappend b/recipes-core/psplash/psplash_%.bbappend
index 41622aa..57cade8 100644
--- a/recipes-core/psplash/psplash_%.bbappend
+++ b/recipes-core/psplash/psplash_%.bbappend
@@ -1,2 +1,12 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/files:"
-SPLASH_IMAGES_rpi = "file://psplash-raspberrypi-img.h;outsuffix=raspberrypi"
+FILESEXTRAPATHS:prepend := "${THISDIR}/files:"
+SPLASH_IMAGES:rpi = "file://psplash-raspberrypi-img.h;outsuffix=raspberrypi"
+
+SRC_URI:append:rpi = " file://framebuf.conf"
+
+# Raspberry Pi only: when 'systemd' is in DISTRO_FEATURES, install
+# framebuf.conf as a drop-in under psplash-start.service.d.
+do_install:append:rpi() {
+ # The filter expression is non-empty only when systemd is enabled.
+ if [ "${@bb.utils.filter('DISTRO_FEATURES', 'systemd', d)}" ]; then
+ install -Dm 0644 ${WORKDIR}/framebuf.conf ${D}${systemd_system_unitdir}/psplash-start.service.d/framebuf.conf
+ fi
+}
+
+FILES:${PN}:append:rpi = " ${systemd_system_unitdir}/psplash-start.service.d"
diff --git a/recipes-core/udev/udev-rules-rpi.bb b/recipes-core/udev/udev-rules-rpi.bb
index 42cfcdd..3ae4385 100644
--- a/recipes-core/udev/udev-rules-rpi.bb
+++ b/recipes-core/udev/udev-rules-rpi.bb
@@ -3,16 +3,17 @@ LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://${COMMON_LICENSE_DIR}/MIT;md5=0835ade698e0bcf8506ecda2f7b4f302"
SRC_URI = " \
- file://99-com.rules \
+ git://github.com/RPi-Distro/raspberrypi-sys-mods;protocol=https;branch=master \
file://can.rules \
"
+SRCREV = "5ce3ef2b7f377c23fea440ca9df0e30f3f8447cf"
-S = "${WORKDIR}"
+S = "${WORKDIR}/git"
INHIBIT_DEFAULT_DEPS = "1"
do_install () {
install -d ${D}${sysconfdir}/udev/rules.d
- install -m 0644 ${WORKDIR}/99-com.rules ${D}${sysconfdir}/udev/rules.d/
+ install -m 0644 ${S}/etc.armhf/udev/rules.d/99-com.rules ${D}${sysconfdir}/udev/rules.d/
install -m 0644 ${WORKDIR}/can.rules ${D}${sysconfdir}/udev/rules.d/
}
diff --git a/recipes-core/udev/udev-rules-rpi/99-com.rules b/recipes-core/udev/udev-rules-rpi/99-com.rules
deleted file mode 100644
index ddd1e17..0000000
--- a/recipes-core/udev/udev-rules-rpi/99-com.rules
+++ /dev/null
@@ -1,21 +0,0 @@
-KERNEL=="ttyAMA[01]", PROGRAM="/bin/sh -c '\
- ALIASES=/proc/device-tree/aliases; \
- if cmp -s $$ALIASES/uart0 $$ALIASES/serial0; then \
- echo 0;\
- elif cmp -s $$ALIASES/uart0 $$ALIASES/serial1; then \
- echo 1; \
- else \
- exit 1; \
- fi\
-'", SYMLINK+="serial%c"
-
-KERNEL=="ttyS0", PROGRAM="/bin/sh -c '\
- ALIASES=/proc/device-tree/aliases; \
- if cmp -s $$ALIASES/uart1 $$ALIASES/serial0; then \
- echo 0; \
- elif cmp -s $$ALIASES/uart1 $$ALIASES/serial1; then \
- echo 1; \
- else \
- exit 1; \
- fi \
-'", SYMLINK+="serial%c"
diff --git a/recipes-core/udev/udev-rules-udisks-rpi_1.0.bb b/recipes-core/udev/udev-rules-udisks-rpi_1.0.bb
index ae35521..d5d4589 100644
--- a/recipes-core/udev/udev-rules-udisks-rpi_1.0.bb
+++ b/recipes-core/udev/udev-rules-udisks-rpi_1.0.bb
@@ -10,4 +10,4 @@ do_install () {
install -m 644 ${WORKDIR}/80-udisks-rpi.rules ${D}${base_libdir}/udev/rules.d
}
-FILES_${PN} = "${base_libdir}/udev/rules.d"
+FILES:${PN} = "${base_libdir}/udev/rules.d"
diff --git a/recipes-devtools/bcm2835/bcm2835_1.52.bb b/recipes-devtools/bcm2835/bcm2835_1.52.bb
deleted file mode 100644
index eef6afd..0000000
--- a/recipes-devtools/bcm2835/bcm2835_1.52.bb
+++ /dev/null
@@ -1,42 +0,0 @@
-DESCRIPTION = "Package that provides access to GPIO and other IO\
-functions on the Broadcom BCM 2835 chip, allowing access to the\
-GPIO pins on the 26 pin IDE plug on the RPi board"
-SECTION = "base"
-HOMEPAGE = "http://www.open.com.au/mikem/bcm2835"
-AUTHOR = "Mike McCauley (mikem@open.com.au)"
-
-LICENSE = "GPLv2"
-LIC_FILES_CHKSUM = "file://COPYING;md5=b234ee4d69f5fce4486a80fdaf4a4263"
-
-COMPATIBLE_MACHINE = "^rpi$"
-
-SRC_URI = "http://www.airspayce.com/mikem/bcm2835/bcm2835-${PV}.tar.gz"
-
-SRC_URI[md5sum] = "b5dc426b4ff258bb1397442f98e40236"
-SRC_URI[sha256sum] = "b9fd10f7a80aadaed28a77168709b7c519568a63b6e98d0a50e9c5fe31bea6bb"
-
-inherit autotools
-
-do_compile_append() {
- # Now compiling the examples provided by the package
- mkdir -p ${B}/examples
- for file in `ls ${S}/examples`; do
- ${CC} ${LDFLAGS} ${S}/examples/${file}/${file}.c -o ${B}/examples/${file} -Bstatic -L${B}/src -lbcm2835 -I${S}/src
- done
-}
-
-do_install_append() {
- install -d ${D}/${libdir}/${BPN}
- for file in ${B}/examples/*
- do
- install -m 0755 ${file} ${D}/${libdir}/${BPN}
- done
-}
-
-PACKAGES += "${PN}-tests"
-
-RDEPENDS_${PN}-dev = ""
-
-FILES_${PN} = ""
-FILES_${PN}-tests = "${libdir}/${BPN}"
-FILES_${PN}-dbg += "${libdir}/${BPN}/.debug"
diff --git a/recipes-devtools/bcm2835/bcm2835_1.73.bb b/recipes-devtools/bcm2835/bcm2835_1.73.bb
new file mode 100644
index 0000000..cdf2332
--- /dev/null
+++ b/recipes-devtools/bcm2835/bcm2835_1.73.bb
@@ -0,0 +1,57 @@
+DESCRIPTION = "Package that provides access to GPIO and other IO \
+functions on the Broadcom BCM 2835 chip, allowing access to the \
+GPIO pins on the 26 pin IDE plug on the RPi board"
+SECTION = "base"
+HOMEPAGE = "http://www.open.com.au/mikem/bcm2835"
+AUTHOR = "Mike McCauley (mikem@open.com.au)"
+
+LICENSE = "GPL-3.0-only"
+LIC_FILES_CHKSUM = "file://COPYING;md5=e49f4652534af377a713df3d9dec60cb"
+
+COMPATIBLE_MACHINE = "^rpi$"
+
+SRC_URI = "http://www.airspayce.com/mikem/bcm2835/bcm2835-${PV}.tar.gz"
+
+SRC_URI[sha256sum] = "e67a986462618988a5a86752e36e3ebdd7c5cae66940ff7330aea243b2762525"
+
+inherit autotools
+
+# Compile the example programs provided by the package, one per directory
+# under examples/, and link each of them against libbcm2835 from the build tree.
+do_compile:append() {
+    mkdir -p ${B}/examples/spiram
+    for file in `ls ${S}/examples`; do
+        example="$file"
+        # Start every example without extra libraries so the spiram flags
+        # set below cannot leak into any other link line.
+        EXAMPLE_LDFLAGS=""
+        if [ "$file" = "spiram" ]; then
+            # This example includes a tiny library; its program is spiram_test.c
+            EXAMPLE_LDFLAGS="-L${B}/examples/spiram -lspiram"
+            example="spiram_test"
+            ${CC} ${CFLAGS} -c ${S}/examples/spiram/spiram.c -o ${B}/examples/spiram/libspiram.o -I${S}/src -I${S}/examples/spiram
+            # The object is built with the target compiler, so archive it with the target ar
+            rm -f ${B}/examples/spiram/libspiram.a && ${AR} crD ${B}/examples/spiram/libspiram.a ${B}/examples/spiram/libspiram.o
+        fi
+        # spiram.c is built against the bcm2835 headers, so keep libspiram ahead of libbcm2835
+        ${CC} ${LDFLAGS} ${S}/examples/${file}/${example}.c -o ${B}/examples/${example} -Bstatic ${EXAMPLE_LDFLAGS} -L${B}/src -lbcm2835 -I${S}/src
+    done
+}
+
+# Install the example binaries: only regular files directly under the
+# examples build directory (the spiram object and archive sit one level deeper).
+do_install:append() {
+    install -d ${D}${libdir}/${BPN}
+    for example in $(find ${B}/examples -maxdepth 1 -type f)
+    do
+        install -m 0755 ${example} ${D}${libdir}/${BPN}
+    done
+}
+
+PACKAGES += "${PN}-tests"
+
+RDEPENDS:${PN}-dev = ""
+
+FILES:${PN} = ""
+FILES:${PN}-tests = "${libdir}/${BPN}"
+FILES:${PN}-dbg += "${libdir}/${BPN}/.debug"
diff --git a/recipes-devtools/pi-blaster/pi-blaster_git.bb b/recipes-devtools/pi-blaster/pi-blaster_git.bb
index fdaf16c..d02fa92 100644
--- a/recipes-devtools/pi-blaster/pi-blaster_git.bb
+++ b/recipes-devtools/pi-blaster/pi-blaster_git.bb
@@ -2,21 +2,21 @@ DESCRIPTION = "This project enables PWM on the GPIO pins you request of a Raspbe
HOMEPAGE = "https://github.com/sarfata/pi-blaster/"
SECTION = "devel/libs"
LICENSE = "MIT"
-LIC_FILES_CHKSUM = "file://README.md;beginline=268;endline=292;md5=86d10e4bcf4b4014d306dde7c1d2a80d"
+LIC_FILES_CHKSUM = "file://README.md;beginline=295;endline=319;md5=86d10e4bcf4b4014d306dde7c1d2a80d"
-SRC_URI = "git://github.com/sarfata/pi-blaster \
+SRC_URI = "git://github.com/sarfata/pi-blaster;branch=master;protocol=https \
file://remove-initscript-lsb-dependency.patch \
"
S = "${WORKDIR}/git"
-SRCREV = "befd8ef36e5066e4d444ef47fe4020787e541248"
+SRCREV = "fbba9a7dcef0f352a11f8a2a5f6cbc15b62c0829"
inherit update-rc.d autotools
INITSCRIPT_PACKAGES = "${PN}"
-INITSCRIPT_NAME_${PN} = "${PN}.boot.sh"
-INITSCRIPT_PARAMS_${PN} = "defaults 15 85"
+INITSCRIPT_NAME:${PN} = "${PN}.boot.sh"
+INITSCRIPT_PARAMS:${PN} = "defaults 15 85"
COMPATIBLE_MACHINE = "^rpi$"
diff --git a/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.4.bb b/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb
index f1af80b..8ff3073 100644
--- a/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.4.bb
+++ b/recipes-devtools/python/python3-adafruit-circuitpython-register_1.9.10.bb
@@ -3,13 +3,12 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_CircuitPython_Register"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=6ec69d6e9e6c85adfb7799d7f8cf044e"
-SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_Register.git"
-
+SRC_URI = "git://github.com/adafruit/Adafruit_CircuitPython_Register.git;branch=main;protocol=https"
+SRCREV = "d1e8ac7ad9dcd65ab83749db3e5c96ffee80ebb7"
S = "${WORKDIR}/git"
-SRCREV = "5fee6e0c3878110844bc51e16063eeae7d94c457"
DEPENDS += "python3-setuptools-scm-native"
inherit setuptools3
-RDEPENDS_${PN} += "python3-core"
+RDEPENDS:${PN} += "python3-core"
diff --git a/recipes-devtools/python/python3-adafruit-platformdetect_3.1.1.bb b/recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb
index 4454d24..45dc49d 100644
--- a/recipes-devtools/python/python3-adafruit-platformdetect_3.1.1.bb
+++ b/recipes-devtools/python/python3-adafruit-platformdetect_3.27.0.bb
@@ -3,13 +3,12 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_Python_PlatformDetect"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=fccd531dce4b989c05173925f0bbb76c"
-SRC_URI = "git://github.com/adafruit/Adafruit_Python_PlatformDetect.git"
-SRCREV = "e0fe1b012898fa824944d6805ca74be0fa027968"
-
+SRC_URI = "git://github.com/adafruit/Adafruit_Python_PlatformDetect.git;branch=main;protocol=https"
+SRCREV = "e1460098eeca5ea573f92814691bb378e15530d9"
S = "${WORKDIR}/git"
inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += "python3-core"
+RDEPENDS:${PN} += "python3-core"
diff --git a/recipes-devtools/python/python3-adafruit-pureio_1.1.8.bb b/recipes-devtools/python/python3-adafruit-pureio_1.1.9.bb
index 82415f9..cdbe4b4 100644
--- a/recipes-devtools/python/python3-adafruit-pureio_1.1.8.bb
+++ b/recipes-devtools/python/python3-adafruit-pureio_1.1.9.bb
@@ -3,8 +3,8 @@ HOMEPAGE = "https://github.com/adafruit/Adafruit_Python_PureIO"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=2a21fcca821a506d4c36f7bbecc0d009"
-SRC_URI = "git://github.com/adafruit/Adafruit_Python_PureIO.git"
-SRCREV = "f4d0973da05b8b21905ff6bab69cdb652128f342"
+SRC_URI = "git://github.com/adafruit/Adafruit_Python_PureIO.git;branch=main;protocol=https"
+SRCREV = "383b615ce9ff5bbefdf77652799f380016fda353"
S = "${WORKDIR}/git"
@@ -12,7 +12,7 @@ inherit setuptools3
DEPENDS += "python3-setuptools-scm-native"
-RDEPENDS_${PN} += " \
+RDEPENDS:${PN} += " \
python3-core \
python3-ctypes \
python3-fcntl \
diff --git a/recipes-devtools/python/python3-rtimu/0001-setup.py-Port-to-use-setuptools.patch b/recipes-devtools/python/python3-rtimu/0001-setup.py-Port-to-use-setuptools.patch
new file mode 100644
index 0000000..77eca99
--- /dev/null
+++ b/recipes-devtools/python/python3-rtimu/0001-setup.py-Port-to-use-setuptools.patch
@@ -0,0 +1,29 @@
+From f5ab30abd37ee884fb3ccaad0a8d21108ca2c812 Mon Sep 17 00:00:00 2001
+From: Khem Raj <raj.khem@gmail.com>
+Date: Mon, 28 Feb 2022 21:37:19 -0800
+Subject: [PATCH] setup.py: Port to use setuptools
+
+Needed to get it going with wheel; distutils has long been deprecated
+
+Upstream-Status: Pending
+Signed-off-by: Khem Raj <raj.khem@gmail.com>
+---
+ Linux/python/setup.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/Linux/python/setup.py b/Linux/python/setup.py
+index e429e6f..da96843 100644
+--- a/Linux/python/setup.py
++++ b/Linux/python/setup.py
+@@ -22,7 +22,7 @@
+ #// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ #// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+-from distutils.core import setup, Extension
++from setuptools import setup, Extension
+ import os.path
+
+ RTIMU_sources = [
+--
+2.35.1
+
diff --git a/recipes-devtools/python/python3-rtimu_git.bb b/recipes-devtools/python/python3-rtimu_7.2.1.bb
index 4f13a38..1afbb5f 100644
--- a/recipes-devtools/python/python3-rtimu_git.bb
+++ b/recipes-devtools/python/python3-rtimu_7.2.1.bb
@@ -5,11 +5,12 @@ SECTION = "devel/python"
LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://../../LICENSE;md5=96cdecb41125f498958e09b72faf318e"
-SRC_URI = "git://github.com/RPi-Distro/RTIMULib.git;protocol=http;branch=master \
+# Fetch over https; keep a single protocol= parameter so the transport is unambiguous.
+SRC_URI = "git://github.com/RPi-Distro/RTIMULib.git;protocol=https;branch=master \
file://0001-include-asm-ioctl.h-for-ioctl-define.patch;patchdir=../.. \
+ file://0001-setup.py-Port-to-use-setuptools.patch;patchdir=../.. \
"
SRCREV = "b949681af69b45f0f7f4bb53b6770037b5b02178"
S = "${WORKDIR}/git/Linux/python"
-
inherit setuptools3
diff --git a/recipes-devtools/python/rpi-gpio_0.7.0.bb b/recipes-devtools/python/rpi-gpio_0.7.0.bb
deleted file mode 100644
index 8b13d53..0000000
--- a/recipes-devtools/python/rpi-gpio_0.7.0.bb
+++ /dev/null
@@ -1,18 +0,0 @@
-DESCRIPTION = "A module to control Raspberry Pi GPIO channels"
-HOMEPAGE = "https://sourceforge.net/projects/raspberry-gpio-python/"
-SECTION = "devel/python"
-LICENSE = "MIT"
-LIC_FILES_CHKSUM = "file://LICENCE.txt;md5=9b95630a648966b142f1a0dcea001cb7"
-
-PYPI_PACKAGE = "RPi.GPIO"
-inherit pypi distutils3
-
-SRC_URI += "file://0001-Remove-nested-functions.patch"
-SRC_URI[md5sum] = "777617f9dea9a1680f9af43db0cf150e"
-SRC_URI[sha256sum] = "7424bc6c205466764f30f666c18187a0824077daf20b295c42f08aea2cb87d3f"
-
-COMPATIBLE_MACHINE = "^rpi$"
-
-# ignore issues with -fno-common from gcc-10 until it's fixed in upstream:
-# https://sourceforge.net/p/raspberry-gpio-python/tickets/187/
-CFLAGS += "-fcommon"
diff --git a/recipes-devtools/python/rpi-gpio_0.7.1.bb b/recipes-devtools/python/rpi-gpio_0.7.1.bb
new file mode 100644
index 0000000..e7a9950
--- /dev/null
+++ b/recipes-devtools/python/rpi-gpio_0.7.1.bb
@@ -0,0 +1,15 @@
+DESCRIPTION = "A module to control Raspberry Pi GPIO channels"
+HOMEPAGE = "https://sourceforge.net/projects/raspberry-gpio-python/"
+SECTION = "devel/python"
+LICENSE = "MIT"
+LIC_FILES_CHKSUM = "file://LICENCE.txt;md5=a2294b0b1daabc30dfb5b3de73b2e00a"
+
+PYPI_PACKAGE = "RPi.GPIO"
+
+inherit pypi setuptools3
+
+SRC_URI += "file://0001-Remove-nested-functions.patch \
+ "
+SRC_URI[sha256sum] = "cd61c4b03c37b62bba4a5acfea9862749c33c618e0295e7e90aa4713fb373b70"
+
+COMPATIBLE_MACHINE = "^rpi$"
diff --git a/recipes-devtools/python/rpio/0001-include-sys-types.h-explicitly-for-getting-caddr_t-d.patch b/recipes-devtools/python/rpio/0001-include-sys-types.h-explicitly-for-getting-caddr_t-d.patch
deleted file mode 100644
index bed9749..0000000
--- a/recipes-devtools/python/rpio/0001-include-sys-types.h-explicitly-for-getting-caddr_t-d.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-From c86bfacc98d58244f532626954ed00d84ecfa82d Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Sat, 30 Jan 2016 17:12:37 -0800
-Subject: [PATCH] include sys/types.h explicitly for getting caddr_t definition
-
-Helps fixing build on musl where sys/types.h is not included indirectly
-as happening on glibc
-
-Signed-off-by: Khem Raj <raj.khem@gmail.com>
----
-Upstream-Status: Submitted
-
- source/c_gpio/c_gpio.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/source/c_gpio/c_gpio.c b/source/c_gpio/c_gpio.c
-index 25a04ca..70df632 100644
---- a/source/c_gpio/c_gpio.c
-+++ b/source/c_gpio/c_gpio.c
-@@ -29,6 +29,7 @@
- #include <stdint.h>
- #include <stdlib.h>
- #include <fcntl.h>
-+#include <sys/types.h>
- #include <sys/mman.h>
- #include "c_gpio.h"
-
---
-2.7.0
-
diff --git a/recipes-devtools/python/rpio_0.10.0.bb b/recipes-devtools/python/rpio_0.10.1.bb
index d5653cb..176646d 100644
--- a/recipes-devtools/python/rpio_0.10.0.bb
+++ b/recipes-devtools/python/rpio_0.10.1.bb
@@ -2,19 +2,20 @@ DESCRIPTION = "Advanced GPIO for the Raspberry Pi. Extends RPi.GPIO with PWM, \
GPIO interrups, TCP socket interrupts, command line tools and more"
HOMEPAGE = "https://github.com/metachris/RPIO"
SECTION = "devel/python"
-LICENSE = "LGPLv3+"
-LIC_FILES_CHKSUM = "file://README.rst;beginline=41;endline=53;md5=d5d95d7486a4d98c999675c23196b25a"
+LICENSE = "LGPL-3.0-or-later"
+LIC_FILES_CHKSUM = "file://LICENSE.txt;md5=bb3ca60759f3202f1ae42e3519cd06bc"
-PYPI_PACKAGE = "RPIO"
-inherit pypi
-
-SRC_URI += "file://0001-include-sys-types.h-explicitly-for-getting-caddr_t-d.patch"
+SRC_URI = "\
+ git://github.com/metachris/RPIO.git;protocol=https;branch=master \
+ "
+SRCREV = "be1942a69b2592ddacd9dc833d2668a19aafd8d2"
+S = "${WORKDIR}/git"
inherit setuptools3
COMPATIBLE_MACHINE = "^rpi$"
-RDEPENDS_${PN} = "\
+RDEPENDS:${PN} = "\
python3-logging \
python3-threading \
"
diff --git a/recipes-devtools/raspi-gpio/raspi-gpio_git.bb b/recipes-devtools/raspi-gpio/raspi-gpio_git.bb
index 02a3f55..8ea4be1 100644
--- a/recipes-devtools/raspi-gpio/raspi-gpio_git.bb
+++ b/recipes-devtools/raspi-gpio/raspi-gpio_git.bb
@@ -8,7 +8,7 @@ COMPATIBLE_MACHINE = "^rpi$"
inherit autotools
-SRCREV = "4edfde183ff3ac9ed66cdc015ae25e45f3a5502d"
+SRCREV = "22b44e4765b4b78dc5b22394fff484e353d5914d"
SRC_URI = "git://github.com/RPi-Distro/raspi-gpio.git;protocol=https;branch=master \
"
diff --git a/recipes-graphics/cairo/cairo_%.bbappend b/recipes-graphics/cairo/cairo_%.bbappend
index 7ba13e2..5efd8fc 100644
--- a/recipes-graphics/cairo/cairo_%.bbappend
+++ b/recipes-graphics/cairo/cairo_%.bbappend
@@ -1,3 +1,3 @@
PACKAGECONFIG_GLESV2 = " ${@bb.utils.contains('DISTRO_FEATURES', 'x11', '', 'glesv2', d)}"
-PACKAGECONFIG_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' egl ${PACKAGECONFIG_GLESV2}', d)}"
+PACKAGECONFIG:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' egl ${PACKAGECONFIG_GLESV2}', d)}"
diff --git a/recipes-graphics/kmscube/kmscube_%.bbappend b/recipes-graphics/kmscube/kmscube_%.bbappend
index 9343fb0..f9c23c9 100644
--- a/recipes-graphics/kmscube/kmscube_%.bbappend
+++ b/recipes-graphics/kmscube/kmscube_%.bbappend
@@ -1,2 +1,2 @@
# userland driver EGL implementation does not have all needed bits for it so remove it from build
-COMPATIBLE_HOST_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
+COMPATIBLE_HOST:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
diff --git a/recipes-graphics/libsdl2/libsdl2_%.bbappend b/recipes-graphics/libsdl2/libsdl2_%.bbappend
index cb9c24e..28a66bc 100644
--- a/recipes-graphics/libsdl2/libsdl2_%.bbappend
+++ b/recipes-graphics/libsdl2/libsdl2_%.bbappend
@@ -1,5 +1,5 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
# when using userland graphic KHR/khrplatform.h is provided by userland but virtual/libgl is provided by mesa-gl where
# we explicitly delete KHR/khrplatform.h since its already coming from userland package
-DEPENDS_append_rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
+DEPENDS:append:rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
diff --git a/recipes-graphics/libva/libva_%.bbappend b/recipes-graphics/libva/libva_%.bbappend
index 56ff421..ebfc519 100644
--- a/recipes-graphics/libva/libva_%.bbappend
+++ b/recipes-graphics/libva/libva_%.bbappend
@@ -1,3 +1,3 @@
# when using userland graphic KHR/khrplatform.h is provided by userland but virtual/libgl is provided by mesa-gl where
# we explicitly delete KHR/khrplatform.h since its already coming from userland package
-DEPENDS_append_rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
+DEPENDS:append:rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
diff --git a/recipes-graphics/mesa/libglu_%.bbappend b/recipes-graphics/mesa/libglu_%.bbappend
index 56ff421..ebfc519 100644
--- a/recipes-graphics/mesa/libglu_%.bbappend
+++ b/recipes-graphics/mesa/libglu_%.bbappend
@@ -1,3 +1,3 @@
# when using userland graphic KHR/khrplatform.h is provided by userland but virtual/libgl is provided by mesa-gl where
# we explicitly delete KHR/khrplatform.h since its already coming from userland package
-DEPENDS_append_rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
+DEPENDS:append:rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
diff --git a/recipes-graphics/mesa/mesa-demos_%.bbappend b/recipes-graphics/mesa/mesa-demos_%.bbappend
index c187ab9..efcaf06 100644
--- a/recipes-graphics/mesa/mesa-demos_%.bbappend
+++ b/recipes-graphics/mesa/mesa-demos_%.bbappend
@@ -1,2 +1,3 @@
-# mesa-demos need libgles1 and userland driver does not have it
-COMPATIBLE_HOST_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
+# mesa-demos: the userland driver doesn't provide libgles1, and the EGL headers it provides break the mesa-demos build.
+# And enabling the `wayland` option without enabling `egl` is useless.
+PACKAGECONFIG:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'egl gles1 wayland', d)}"
diff --git a/recipes-graphics/mesa/mesa-gl_%.bbappend b/recipes-graphics/mesa/mesa-gl_%.bbappend
index e561a1b..edb75f5 100644
--- a/recipes-graphics/mesa/mesa-gl_%.bbappend
+++ b/recipes-graphics/mesa/mesa-gl_%.bbappend
@@ -1,8 +1,8 @@
-PACKAGECONFIG_append_rpi = " gbm"
-PROVIDES_append_rpi = " virtual/libgbm"
+PACKAGECONFIG:append:rpi = " gbm"
+PROVIDES:append:rpi = " virtual/libgbm"
-GALLIUMDRIVERS_append_rpi = ",swrast"
+GALLIUMDRIVERS:append:rpi = ",swrast"
-do_install_append_rpi() {
+do_install:append:rpi() {
rm -rf ${D}${includedir}/KHR/khrplatform.h
}
diff --git a/recipes-graphics/mesa/mesa_%.bbappend b/recipes-graphics/mesa/mesa_%.bbappend
index eaa46f2..7000ead 100644
--- a/recipes-graphics/mesa/mesa_%.bbappend
+++ b/recipes-graphics/mesa/mesa_%.bbappend
@@ -2,5 +2,5 @@
# With oe-core commit 8509e2e1a87578882b71948ccef3b50ccf1228b3 dri3 is set
# as default. To state out clearly that Raspi needs dri3 and to avoid surprises
# in case oe-core changes this default, we set dri3 explicitly.
-PACKAGECONFIG_append_rpi = " gallium vc4 v3d kmsro ${@bb.utils.contains('DISTRO_FEATURES', 'x11 opengl', 'x11 dri3', '', d)}"
-DRIDRIVERS_class-target_rpi = ""
+PACKAGECONFIG:append:rpi = " gallium vc4 v3d kmsro ${@bb.utils.contains('DISTRO_FEATURES', 'x11 opengl', 'x11 dri3', '', d)} ${@bb.utils.contains('DISTRO_FEATURES', 'vulkan', 'vulkan broadcom', '', d)}"
+DRIDRIVERS:class-target:rpi = ""
diff --git a/recipes-graphics/piglit/piglit_%.bbappend b/recipes-graphics/piglit/piglit_%.bbappend
index 0503fd0..8dfce27 100644
--- a/recipes-graphics/piglit/piglit_%.bbappend
+++ b/recipes-graphics/piglit/piglit_%.bbappend
@@ -1,5 +1,5 @@
# mesa-demos need libgles1 and userland driver does not have it so remove it from piglit rdeps
-RDEPENDS_${PN}_remove_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mesa-demos', d)}"
+RDEPENDS:${PN}:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mesa-demos', d)}"
# it needs EGL >= 11 but userland says it provided version 10, remove it from build
# | -- Requested 'egl >= 11.0' but version of EGL is 10
-COMPATIBLE_HOST_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
+COMPATIBLE_HOST:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '(.*)', 'null', d)}"
diff --git a/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch b/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
index de9d5c3..076ba7e 100644
--- a/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0001-gitignore-add-archives-from-lib-directory.patch
@@ -5,7 +5,7 @@ Subject: [PATCH] gitignore: add archives from lib directory
The build creates two *.a files in the lib directory, add these to .gitignore.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
.gitignore | 1 +
diff --git a/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch b/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
index c02a767..cce94a7 100644
--- a/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0002-add-install-targets-to-Makefiles.patch
@@ -3,7 +3,7 @@ From: Trevor Woerner <twoerner@gmail.com>
Date: Fri, 4 Dec 2020 01:54:37 -0500
Subject: [PATCH] add "install" targets to Makefiles
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
Makefile | 3 +++
diff --git a/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch b/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
index 7adb12b..44ed9c3 100644
--- a/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0003-switch-to-pkg-config.patch
@@ -10,7 +10,7 @@ I get a build error saying:
Therefore switch to the more common and more generic "pkg-config" instead of
using a libpng-specific tool for flags and libraries.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 4 ++--
diff --git a/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch b/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
index 908be62..aa83110 100644
--- a/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0004-add-libvchostif-to-link.patch
@@ -9,7 +9,7 @@ I end up with link errors of the type:
Which is caused by not having -lvchostif in the link.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 +-
diff --git a/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch b/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
index ceefd03..914ffb3 100644
--- a/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0005-change-library-linking-order.patch
@@ -10,7 +10,7 @@ linking so that it succeeds. Otherwise I get errors like the following:
...as well as undefined references to various other libpng objects.
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 +-
diff --git a/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch b/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
index dae847d..6d2de6c 100644
--- a/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0006-game-Makefile-install-sample-png-files.patch
@@ -3,7 +3,7 @@ From: Trevor Woerner <twoerner@gmail.com>
Date: Fri, 4 Dec 2020 03:47:17 -0500
Subject: [PATCH] game/Makefile: install sample png files
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
game/Makefile | 2 ++
diff --git a/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch b/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
index b5c743e..e466a05 100644
--- a/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
+++ b/recipes-graphics/raspidmx/raspidmx/0007-Makefile-reorganize.patch
@@ -16,7 +16,7 @@ To build simply invoke 'make' with or without a -j option.
To install simply invoke: make TARGET=install
To clean simply invoke: make TARGET=clean
-Upstream-status: submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
+Upstream-Status: Submitted [https://github.com/AndrewFromMelbourne/raspidmx/pull/29]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
Makefile | 19 +++++++------------
diff --git a/recipes-graphics/raspidmx/raspidmx_git.bb b/recipes-graphics/raspidmx/raspidmx_git.bb
index 71590e3..4729e8c 100644
--- a/recipes-graphics/raspidmx/raspidmx_git.bb
+++ b/recipes-graphics/raspidmx/raspidmx_git.bb
@@ -5,9 +5,9 @@ LICENSE = "MIT"
LIC_FILES_CHKSUM = "file://LICENSE;md5=52962875ab02c36df6cde47b1f463024"
COMPATIBLE_HOST = "null"
-COMPATIBLE_HOST_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'null', '(.*)', d)}"
+COMPATIBLE_HOST:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'null', '(.*)', d)}"
-SRC_URI = "git://github.com/AndrewFromMelbourne/raspidmx;protocol=https \
+SRC_URI = "git://github.com/AndrewFromMelbourne/raspidmx;protocol=https;branch=master \
file://0001-gitignore-add-archives-from-lib-directory.patch \
file://0002-add-install-targets-to-Makefiles.patch \
file://0003-switch-to-pkg-config.patch \
diff --git a/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch b/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
index 295309c..63f6a81 100644
--- a/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
+++ b/recipes-graphics/userland/files/0001-Allow-applications-to-set-next-resource-handle.patch
@@ -7,6 +7,8 @@ This patch adds provisions in userland to
let apps callers set the next rendereing dispmanx resource.
It's useful for implementing, say, a buffer carousel.
---
+Upstream-Status: Pending
+
interface/khronos/common/khrn_client_rpc.h | 2 ++
interface/khronos/common/khrn_int_ids.h | 2 ++
interface/khronos/egl/egl_client.c | 30 +++++++++++++++++++---
diff --git a/recipes-graphics/userland/files/0001-mmal-Do-not-use-Werror.patch b/recipes-graphics/userland/files/0001-mmal-Do-not-use-Werror.patch
new file mode 100644
index 0000000..ff00d8b
--- /dev/null
+++ b/recipes-graphics/userland/files/0001-mmal-Do-not-use-Werror.patch
@@ -0,0 +1,33 @@
+From 15fbe266af3dcc5b7660397204b06d04364a953a Mon Sep 17 00:00:00 2001
+From: Khem Raj <raj.khem@gmail.com>
+Date: Sat, 2 Apr 2022 21:37:42 -0700
+Subject: [PATCH] mmal: Do not use -Werror
+
+Clang warns about unused-but-set variables
+interface/mmal/vc/mmal_vc_api.c:395:18: error: variable 'status' set but not used
+[-Werror,-Wunused-but-set-variable]
+| MMAL_STATUS_T status;
+| ^
+
+Upstream-Status: Pending
+Signed-off-by: Khem Raj <raj.khem@gmail.com>
+---
+ interface/mmal/CMakeLists.txt | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/interface/mmal/CMakeLists.txt b/interface/mmal/CMakeLists.txt
+index 46f149d..c5c1642 100644
+--- a/interface/mmal/CMakeLists.txt
++++ b/interface/mmal/CMakeLists.txt
+@@ -3,7 +3,7 @@ if (NOT DEFINED LIBRARY_TYPE)
+ set(LIBRARY_TYPE SHARED)
+ endif (NOT DEFINED LIBRARY_TYPE)
+
+-add_definitions(-Wall -Werror)
++add_definitions(-Wall)
+
+ add_library(mmal SHARED util/mmal_util.c)
+
+--
+2.35.1
+
diff --git a/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch b/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
index 7945bff..1a9a51c 100644
--- a/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
+++ b/recipes-graphics/userland/files/0002-wayland-Add-support-for-the-Wayland-winsys.patch
@@ -19,6 +19,8 @@ vc_vchi_dispmanx.h
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
.gitignore | 1 +
CMakeLists.txt | 11 +
README.md | 4 +
diff --git a/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch b/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
index e10f9ab..a9da68a 100644
--- a/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
+++ b/recipes-graphics/userland/files/0003-wayland-Add-Wayland-example.patch
@@ -4,6 +4,8 @@ Date: Tue, 1 Oct 2013 13:19:20 +0200
Subject: [PATCH] wayland: Add Wayland example
---
+Upstream-Status: Pending
+
.../linux/apps/hello_pi/CMakeLists.txt | 1 +
.../linux/apps/hello_pi/Makefile | 2 +
.../hello_pi/hello_wayland/CMakeLists.txt | 8 +
diff --git a/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch b/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
index 19608be..5476f41 100644
--- a/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
+++ b/recipes-graphics/userland/files/0004-wayland-egl-Add-bcm_host-to-dependencies.patch
@@ -9,6 +9,8 @@ lets add the dependency on bcm_host module which should do it
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/wayland-egl/wayland-egl.pc.in | 1 +
1 file changed, 1 insertion(+)
diff --git a/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch b/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
index 2772323..8119a8c 100644
--- a/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
+++ b/recipes-graphics/userland/files/0005-interface-remove-faulty-assert-to-make-weston-happy-.patch
@@ -9,6 +9,8 @@ This was removed after a discussion on IRC with the weston guys
Signed-off-by: "Yann E. MORIN" <yann.morin.1998@free.fr>
---
+Upstream-Status: Pending
+
interface/vmcs_host/vc_vchi_dispmanx.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch b/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
index 5a1d8cf..8c37419 100644
--- a/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
+++ b/recipes-graphics/userland/files/0006-zero-out-wl-buffers-in-egl_surface_free.patch
@@ -7,6 +7,8 @@ origins from buildroot
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client_surface.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch b/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
index bae39e1..1e90126 100644
--- a/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
+++ b/recipes-graphics/userland/files/0007-initialize-front-back-wayland-buffers.patch
@@ -7,6 +7,8 @@ origins from metrological wayland support
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client_surface.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch b/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
index 1c15009..9e496c7 100644
--- a/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
+++ b/recipes-graphics/userland/files/0008-Remove-RPC_FLUSH.patch
@@ -7,6 +7,8 @@ Origins from buildroot
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/ext/gl_oes_egl_image_client.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch b/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
index 7d28453..9d8355a 100644
--- a/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
+++ b/recipes-graphics/userland/files/0009-fix-cmake-dependency-race.patch
@@ -17,6 +17,8 @@ make[2]: ***
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/vcos/pthreads/CMakeLists.txt | 8 ++++++++
interface/vmcs_host/CMakeLists.txt | 8 --------
interface/vmcs_host/vc_vchi_dispmanx.h | 2 +-
diff --git a/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch b/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
index b6a4c58..989f417 100644
--- a/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
+++ b/recipes-graphics/userland/files/0010-Fix-for-framerate-with-nested-composition.patch
@@ -7,6 +7,8 @@ frame rate appears irregular and lower than expected when using nested compositi
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch b/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
index 0d8ccd1..691f476 100644
--- a/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
+++ b/recipes-graphics/userland/files/0011-build-shared-library-for-vchostif.patch
@@ -7,6 +7,8 @@ Fixes #149
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/vmcs_host/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch b/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
index e652cc2..87d7161 100644
--- a/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
+++ b/recipes-graphics/userland/files/0012-implement-buffer-wrapping-interface-for-dispmanx.patch
@@ -7,6 +7,8 @@ Courtesy: Zan Dobersek
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/ext/egl_wayland.c | 42 +++++++++++++++++++++++++++++
interface/wayland/dispmanx.xml | 10 +++++++
2 files changed, 52 insertions(+)
diff --git a/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch b/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
index b60928a..16cbbd7 100644
--- a/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
+++ b/recipes-graphics/userland/files/0013-Implement-triple-buffering-for-wayland.patch
@@ -12,6 +12,8 @@ to two vertical intervals
Signed-off-by: Jeff Wannamaker <jeff_wannamaker@cable.comcast.com>
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/egl/egl_client.c | 3 ++-
interface/khronos/egl/egl_client_surface.c | 8 ++++++++
interface/khronos/egl/egl_client_surface.h | 11 +++++++++++
diff --git a/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch b/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
index fa7984c..37ca456 100644
--- a/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
+++ b/recipes-graphics/userland/files/0016-Allow-multiple-wayland-compositor-state-data-per-pro.patch
@@ -13,6 +13,8 @@ via embedded composition e.g. westeros
Signed-off-by: Jeff Wannamaker <jeff_wannamaker@cable.comcast.com>
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/common/khrn_client.c | 2 +-
interface/khronos/common/khrn_client.h | 11 +++++-
interface/khronos/ext/egl_wayland.c | 50 ++++++++++++++++++++++----
diff --git a/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch b/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
index 8843489..94566dc 100644
--- a/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
+++ b/recipes-graphics/userland/files/0018-Add-EGL_IMG_context_priority-related-defines.patch
@@ -8,6 +8,8 @@ taken from Khronos headers
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/include/EGL/eglext.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch b/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
index 841341e..4f91c71 100644
--- a/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
+++ b/recipes-graphics/userland/files/0019-libfdt-Undefine-__wordsize-if-already-defined.patch
@@ -8,6 +8,8 @@ for multiple versions of glibc even ones which does not have this define
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
opensrc/helpers/libfdt/libfdt_env.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch b/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
index 65fc5eb..e23f4d9 100644
--- a/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
+++ b/recipes-graphics/userland/files/0020-openmaxil-add-pkg-config-file.patch
@@ -4,6 +4,8 @@ Date: Wed, 13 Jun 2018 18:22:22 +0000
Subject: [PATCH] openmaxil: add pkg-config file
---
+Upstream-Status: Pending
+
CMakeLists.txt | 2 +-
pkgconfig/openmaxil.pc.in | 10 ++++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch b/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
index 595eefb..7e3de5f 100644
--- a/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
+++ b/recipes-graphics/userland/files/0022-all-host_applications-remove-non-existent-projects.patch
@@ -7,7 +7,7 @@ The ALL_APPS symbol will optionally build an additional set of projects,
however, several of them don't exist anymore. Remove them from the list of
ALL_APPS.
-Upstream-status: submitted [https://github.com/raspberrypi/userland/pull/661]
+Upstream-Status: Submitted [https://github.com/raspberrypi/userland/pull/661]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
host_applications/linux/CMakeLists.txt | 4 ----
diff --git a/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch b/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
index 642ee86..e3b093a 100644
--- a/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
+++ b/recipes-graphics/userland/files/0023-hello_pi-optionally-build-wayland-specific-app.patch
@@ -5,7 +5,7 @@ Subject: [PATCH] hello_pi: optionally build wayland-specific app
Only build the wayland-specific hello_pi app when building for wayland.
-Upstream-status: inappropriate [the wayland example is not part of upstream]
+Upstream-Status: Inappropriate [the wayland example is not part of upstream]
Signed-off-by: Trevor Woerner <twoerner@gmail.com>
---
host_applications/linux/apps/hello_pi/CMakeLists.txt | 4 +++-
diff --git a/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch b/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
index ec74cc2..aff95b7 100644
--- a/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
+++ b/recipes-graphics/userland/files/0024-userland-Sync-needed-defines-for-weston-build.patch
@@ -8,6 +8,8 @@ therefore import needed defines and typedefs from latest mesa
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
interface/khronos/include/EGL/eglext.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch b/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch
new file mode 100644
index 0000000..6f4c722
--- /dev/null
+++ b/recipes-graphics/userland/files/0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch
@@ -0,0 +1,725 @@
+From 8f7fba136391e2020cd0fc9dca76932d3faa21eb Mon Sep 17 00:00:00 2001
+From: Martin Jansa <martin.jansa@gmail.com>
+Date: Fri, 8 Mar 2024 16:29:22 +0100
+Subject: [PATCH] CMakeLists.txt, *.pc: respect CMAKE_INSTALL_LIBDIR
+
+* and CMAKE_INSTALL_BINDIR, CMAKE_INSTALL_INCLUDEDIR as well
+* fixes installation paths with multilib
+ lib32-userland fails with:
+
+ERROR: QA Issue: lib32-userland: Files/directories were installed but not shipped in any package:
+ /usr/lib/libbrcmEGL.so
+ /usr/lib/libvchiq_arm.so
+...
+ /usr/lib/pkgconfig/wayland-egl.pc
+Please set FILES such that these items are packaged. Alternatively if they are unneeded, avoid installing them or delete them within do_install.
+lib32-userland: 66 installed and not shipped files. [installed-vs-shipped]
+
+Signed-off-by: Martin Jansa <martin.jansa@gmail.com>
+---
+Upstream-Status: Pending
+
+ CMakeLists.txt | 2 +-
+ containers/CMakeLists.txt | 2 +-
+ containers/test/CMakeLists.txt | 24 +++++++++----------
+ helpers/dtoverlay/CMakeLists.txt | 2 +-
+ .../linux/apps/dtmerge/CMakeLists.txt | 2 +-
+ .../linux/apps/dtoverlay/CMakeLists.txt | 6 ++---
+ .../linux/apps/gencmd/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_audio/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_dispmanx/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_encode/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_font/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_jpeg/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_teapot/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_tiger/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_triangle/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_triangle2/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_video/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_videocube/CMakeLists.txt | 2 +-
+ .../hello_pi/hello_wayland/CMakeLists.txt | 2 +-
+ .../apps/hello_pi/hello_world/CMakeLists.txt | 2 +-
+ .../linux/apps/raspicam/CMakeLists.txt | 2 +-
+ .../linux/apps/smem/CMakeLists.txt | 2 +-
+ .../linux/apps/tvservice/CMakeLists.txt | 2 +-
+ .../linux/apps/vcmailbox/CMakeLists.txt | 2 +-
+ .../linux/libs/bcm_host/CMakeLists.txt | 2 +-
+ .../linux/libs/debug_sym/CMakeLists.txt | 6 ++---
+ .../linux/libs/sm/CMakeLists.txt | 4 ++--
+ interface/khronos/CMakeLists.txt | 10 ++++----
+ interface/mmal/CMakeLists.txt | 4 ++--
+ interface/mmal/components/CMakeLists.txt | 2 +-
+ interface/mmal/core/CMakeLists.txt | 4 ++--
+ interface/mmal/util/CMakeLists.txt | 4 ++--
+ interface/mmal/vc/CMakeLists.txt | 6 ++---
+ interface/vchiq_arm/CMakeLists.txt | 4 ++--
+ interface/vcos/CMakeLists.txt | 2 +-
+ interface/vcos/generic/CMakeLists.txt | 2 +-
+ interface/vcos/pthreads/CMakeLists.txt | 4 ++--
+ interface/vmcs_host/CMakeLists.txt | 2 +-
+ makefiles/cmake/vmcs.cmake | 2 +-
+ middleware/openmaxil/CMakeLists.txt | 2 +-
+ pkgconfig/bcm_host.pc.in | 2 +-
+ pkgconfig/brcmegl.pc.in | 2 +-
+ pkgconfig/brcmglesv2.pc.in | 2 +-
+ pkgconfig/brcmvg.pc.in | 2 +-
+ pkgconfig/mmal.pc.in | 2 +-
+ pkgconfig/vcsm.pc.in | 2 +-
+ 46 files changed, 73 insertions(+), 73 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 3e3c90e..0bb54b7 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -136,7 +136,7 @@ if(PKG_CONFIG_FOUND)
+ foreach(PCFILE bcm_host.pc brcmegl.pc brcmglesv2.pc brcmvg.pc vcsm.pc mmal.pc openmaxil.pc)
+ configure_file("pkgconfig/${PCFILE}.in" "${PCFILE}" @ONLY)
+ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PCFILE}"
+- DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig")
++ DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+ endforeach()
+ endif()
+ # Remove cache entry, if one added by command line
+diff --git a/containers/CMakeLists.txt b/containers/CMakeLists.txt
+index 5570038..6c3d39c 100644
+--- a/containers/CMakeLists.txt
++++ b/containers/CMakeLists.txt
+@@ -66,7 +66,7 @@ set(packetizers_SRCS ${packetizers_SRCS} ${SOURCE_DIR}/h264/avc1_packetizer.c)
+
+ add_library(containers ${LIBRARY_TYPE} ${core_SRCS} ${io_SRCS} ${net_SRCS} ${packetizers_SRCS})
+ target_link_libraries(containers vcos)
+-install(TARGETS containers DESTINATION lib)
++install(TARGETS containers DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ set(container_readers)
+ set(container_writers)
+diff --git a/containers/test/CMakeLists.txt b/containers/test/CMakeLists.txt
+index 7d36352..832ad0f 100644
+--- a/containers/test/CMakeLists.txt
++++ b/containers/test/CMakeLists.txt
+@@ -1,17 +1,17 @@
+ # Generate test application
+ add_executable(containers_test test.c)
+ target_link_libraries(containers_test -Wl,--no-whole-archive containers)
+-install(TARGETS containers_test DESTINATION bin)
++install(TARGETS containers_test DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate test application
+ add_executable(containers_check_frame_int check_frame_int.c)
+ target_link_libraries(containers_check_frame_int -Wl,--no-whole-archive containers)
+-install(TARGETS containers_check_frame_int DESTINATION bin)
++install(TARGETS containers_check_frame_int DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate autotest application
+ #add_executable(containers_autotest autotest.cpp crc_32.c)
+ #target_link_libraries(containers_autotest -Wl,--no-whole-archive containers})
+-#install(TARGETS containers_autotest DESTINATION bin)
++#install(TARGETS containers_autotest DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Helper code to provide non-blocking console input
+ if (WIN32)
+@@ -28,39 +28,39 @@ add_dependencies(containers_test containers_test_extra)
+ # Generate net test applications
+ add_executable(containers_stream_client stream_client.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_stream_client containers)
+-install(TARGETS containers_stream_client DESTINATION bin)
++install(TARGETS containers_stream_client DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_stream_server stream_server.c)
+ target_link_libraries(containers_stream_server containers)
+-install(TARGETS containers_stream_server DESTINATION bin)
++install(TARGETS containers_stream_server DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_datagram_sender datagram_sender.c)
+ target_link_libraries(containers_datagram_sender containers)
+-install(TARGETS containers_datagram_sender DESTINATION bin)
++install(TARGETS containers_datagram_sender DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_datagram_receiver datagram_receiver.c)
+ target_link_libraries(containers_datagram_receiver containers)
+-install(TARGETS containers_datagram_receiver DESTINATION bin)
++install(TARGETS containers_datagram_receiver DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ add_executable(containers_rtp_decoder rtp_decoder.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_rtp_decoder containers)
+-install(TARGETS containers_rtp_decoder DESTINATION bin)
++install(TARGETS containers_rtp_decoder DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate URI test application
+ add_executable(containers_test_uri test_uri.c)
+ target_link_libraries(containers_test_uri containers)
+-install(TARGETS containers_test_uri DESTINATION bin)
++install(TARGETS containers_test_uri DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate URI pipe application
+ add_executable(containers_uri_pipe uri_pipe.c ${NB_IO_SOURCE})
+ target_link_libraries(containers_uri_pipe containers)
+-install(TARGETS containers_uri_pipe DESTINATION bin)
++install(TARGETS containers_uri_pipe DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate bit stream test application
+ add_executable(containers_test_bits test_bits.c)
+ target_link_libraries(containers_test_bits containers)
+-install(TARGETS containers_test_bits DESTINATION bin)
++install(TARGETS containers_test_bits DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+ # Generate packet file dump application
+ add_executable(containers_dump_pktfile dump_pktfile.c)
+-install(TARGETS containers_dump_pktfile DESTINATION bin)
++install(TARGETS containers_dump_pktfile DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/helpers/dtoverlay/CMakeLists.txt b/helpers/dtoverlay/CMakeLists.txt
+index b3bd30f..7e83780 100644
+--- a/helpers/dtoverlay/CMakeLists.txt
++++ b/helpers/dtoverlay/CMakeLists.txt
+@@ -22,4 +22,4 @@ add_library (dtovl ${SHARED}
+
+ target_link_libraries(dtovl fdt)
+
+-install (TARGETS dtovl DESTINATION lib)
++install (TARGETS dtovl DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/host_applications/linux/apps/dtmerge/CMakeLists.txt b/host_applications/linux/apps/dtmerge/CMakeLists.txt
+index d3f7e36..daa91e5 100755
+--- a/host_applications/linux/apps/dtmerge/CMakeLists.txt
++++ b/host_applications/linux/apps/dtmerge/CMakeLists.txt
+@@ -17,5 +17,5 @@ include_directories (
+ add_executable(dtmerge dtmerge.c)
+ target_link_libraries(dtmerge dtovl)
+
+-install(TARGETS dtmerge RUNTIME DESTINATION bin)
++install(TARGETS dtmerge RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtmerge.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/dtoverlay/CMakeLists.txt b/host_applications/linux/apps/dtoverlay/CMakeLists.txt
+index 97bcadc..238296d 100755
+--- a/host_applications/linux/apps/dtoverlay/CMakeLists.txt
++++ b/host_applications/linux/apps/dtoverlay/CMakeLists.txt
+@@ -16,12 +16,12 @@ include_directories (
+
+ add_executable(dtoverlay dtoverlay_main.c utils.c)
+ target_link_libraries(dtoverlay dtovl)
+-install(TARGETS dtoverlay RUNTIME DESTINATION bin)
++install(TARGETS dtoverlay RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtoverlay.1 DESTINATION man/man1)
+
+ add_custom_command(TARGET dtoverlay POST_BUILD COMMAND ln;-sf;dtoverlay;dtparam)
+-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dtparam DESTINATION bin)
++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dtparam DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES dtparam.1 DESTINATION man/man1)
+
+ set(DTOVERLAY_SCRIPTS dtoverlay-pre dtoverlay-post)
+-install(PROGRAMS ${DTOVERLAY_SCRIPTS} DESTINATION bin)
++install(PROGRAMS ${DTOVERLAY_SCRIPTS} DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/gencmd/CMakeLists.txt b/host_applications/linux/apps/gencmd/CMakeLists.txt
+index 0c2c32a..fdd2f00 100644
+--- a/host_applications/linux/apps/gencmd/CMakeLists.txt
++++ b/host_applications/linux/apps/gencmd/CMakeLists.txt
+@@ -16,5 +16,5 @@ include_directories( ../../../..
+
+ add_executable(vcgencmd gencmd.c)
+ target_link_libraries(vcgencmd vcos vchiq_arm vchostif)
+-install(TARGETS vcgencmd RUNTIME DESTINATION bin)
++install(TARGETS vcgencmd RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES vcgencmd.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
+index 03207c5..8f4d06c 100644
+--- a/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_audio/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
+index 0471a1d..fd8b85e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_dispmanx/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
+index 147623b..98a197a 100644
+--- a/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_encode/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
+index 448d2cf..1d89f4c 100644
+--- a/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_font/CMakeLists.txt
+@@ -6,4 +6,4 @@ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+ target_link_libraries(${EXEC} vgfont freetype z)
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
+index a56dda5..f611f8e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_jpeg/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
+index cdb8413..a60da3e 100644
+--- a/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_teapot/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
+index b253f3f..1104a8b 100644
+--- a/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_tiger/CMakeLists.txt
+@@ -6,4 +6,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
+index 4e8128e..4b738bb 100644
+--- a/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_triangle/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
+index 390980a..c8c534f 100644
+--- a/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_triangle2/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
+index 42187af..6b15ca2 100644
+--- a/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_video/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
+index d7fb059..9612ffe 100644
+--- a/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_videocube/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
+index 9a2f75c..9a468a6 100644
+--- a/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_wayland/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS} -lwayland-client -lwayland-egl)
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt b/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
+index b0120fe..97d90f6 100644
+--- a/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
++++ b/host_applications/linux/apps/hello_pi/hello_world/CMakeLists.txt
+@@ -5,4 +5,4 @@ add_executable(${EXEC} ${SRCS})
+ target_link_libraries(${EXEC} ${HELLO_PI_LIBS})
+
+ install(TARGETS ${EXEC}
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/host_applications/linux/apps/raspicam/CMakeLists.txt b/host_applications/linux/apps/raspicam/CMakeLists.txt
+index f73a4d0..4a9cd88 100644
+--- a/host_applications/linux/apps/raspicam/CMakeLists.txt
++++ b/host_applications/linux/apps/raspicam/CMakeLists.txt
+@@ -66,6 +66,6 @@ target_link_libraries(raspiyuv ${MMAL_LIBS} vcos bcm_host m)
+ target_link_libraries(raspivid ${MMAL_LIBS} vcos bcm_host m)
+ target_link_libraries(raspividyuv ${MMAL_LIBS} vcos bcm_host m)
+
+-install(TARGETS raspistill raspiyuv raspivid raspividyuv RUNTIME DESTINATION bin)
++install(TARGETS raspistill raspiyuv raspivid raspividyuv RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES raspistill.1 raspiyuv.1 raspivid.1 raspividyuv.1 DESTINATION man/man1)
+ install(FILES raspicam.7 DESTINATION man/man7)
+diff --git a/host_applications/linux/apps/smem/CMakeLists.txt b/host_applications/linux/apps/smem/CMakeLists.txt
+index 0fa8328..60c9c61 100644
+--- a/host_applications/linux/apps/smem/CMakeLists.txt
++++ b/host_applications/linux/apps/smem/CMakeLists.txt
+@@ -16,5 +16,5 @@ include_directories (
+ add_executable(vcsmem smem.c)
+ target_link_libraries(vcsmem vcos vcsm vchostif)
+
+-install(TARGETS vcsmem RUNTIME DESTINATION bin)
++install(TARGETS vcsmem RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+diff --git a/host_applications/linux/apps/tvservice/CMakeLists.txt b/host_applications/linux/apps/tvservice/CMakeLists.txt
+index 0190774..fad5a6b 100644
+--- a/host_applications/linux/apps/tvservice/CMakeLists.txt
++++ b/host_applications/linux/apps/tvservice/CMakeLists.txt
+@@ -3,5 +3,5 @@ add_executable(tvservice tvservice.c)
+ target_link_libraries(tvservice vchostif bcm_host)
+
+ install(TARGETS tvservice
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES tvservice.1 DESTINATION man/man1)
+diff --git a/host_applications/linux/apps/vcmailbox/CMakeLists.txt b/host_applications/linux/apps/vcmailbox/CMakeLists.txt
+index d153363..2731724 100644
+--- a/host_applications/linux/apps/vcmailbox/CMakeLists.txt
++++ b/host_applications/linux/apps/vcmailbox/CMakeLists.txt
+@@ -2,6 +2,6 @@ add_executable(vcmailbox vcmailbox.c)
+ target_link_libraries(vcmailbox vchostif)
+
+ install(TARGETS vcmailbox
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ install(FILES vcmailbox.1 DESTINATION man/man1)
+ install(FILES vcmailbox.7 raspiotp.7 raspirev.7 DESTINATION man/man7)
+diff --git a/host_applications/linux/libs/bcm_host/CMakeLists.txt b/host_applications/linux/libs/bcm_host/CMakeLists.txt
+index 7a4ab06..3614943 100644
+--- a/host_applications/linux/libs/bcm_host/CMakeLists.txt
++++ b/host_applications/linux/libs/bcm_host/CMakeLists.txt
+@@ -19,5 +19,5 @@ add_library(bcm_host ${SHARED} bcm_host.c)
+
+ target_link_libraries(bcm_host vcos vchostif)
+
+-install(TARGETS bcm_host DESTINATION lib)
++install(TARGETS bcm_host DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/host_applications/linux/libs/debug_sym/CMakeLists.txt b/host_applications/linux/libs/debug_sym/CMakeLists.txt
+index d437b99..37eb759 100644
+--- a/host_applications/linux/libs/debug_sym/CMakeLists.txt
++++ b/host_applications/linux/libs/debug_sym/CMakeLists.txt
+@@ -11,6 +11,6 @@ include_directories (
+ add_library(debug_sym ${SHARED} debug_sym.c)
+ add_library(debug_sym_static STATIC debug_sym.c)
+
+-install(TARGETS debug_sym DESTINATION lib)
+-install(TARGETS debug_sym_static DESTINATION lib)
+-install(FILES debug_sym.h DESTINATION include/interface/debug_sym)
++install(TARGETS debug_sym DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(TARGETS debug_sym_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(FILES debug_sym.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/debug_sym)
+diff --git a/host_applications/linux/libs/sm/CMakeLists.txt b/host_applications/linux/libs/sm/CMakeLists.txt
+index 5ce5aca..84d8123 100644
+--- a/host_applications/linux/libs/sm/CMakeLists.txt
++++ b/host_applications/linux/libs/sm/CMakeLists.txt
+@@ -14,5 +14,5 @@ add_library(vcsm ${SHARED} user-vcsm.c)
+
+ target_link_libraries(vcsm vcos)
+
+-install(TARGETS vcsm DESTINATION lib)
+-install(FILES user-vcsm.h DESTINATION include/interface/vcsm)
++install(TARGETS vcsm DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(FILES user-vcsm.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcsm)
+diff --git a/interface/khronos/CMakeLists.txt b/interface/khronos/CMakeLists.txt
+index 95c0e11..00316a5 100644
+--- a/interface/khronos/CMakeLists.txt
++++ b/interface/khronos/CMakeLists.txt
+@@ -94,11 +94,11 @@ if (BUILD_WAYLAND)
+ )
+
+ add_library(wayland-egl ${SHARED} ${WAYLAND_EGL_SOURCE})
+- install(TARGETS wayland-egl DESTINATION lib)
++ install(TARGETS wayland-egl DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ configure_file ("wayland-egl/wayland-egl.pc.in" "wayland-egl/wayland-egl.pc" @ONLY)
+ install (FILES "${CMAKE_CURRENT_BINARY_DIR}/wayland-egl/wayland-egl.pc"
+- DESTINATION lib/pkgconfig)
++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+ endif ()
+
+ add_library(EGL ${SHARED} ${EGL_SOURCE})
+@@ -126,8 +126,8 @@ target_link_libraries(GLESv2 EGL khrn_client vcos)
+ target_link_libraries(WFC EGL)
+ target_link_libraries(OpenVG EGL)
+
+-install(TARGETS EGL GLESv2 OpenVG WFC khrn_client DESTINATION lib)
+-install(TARGETS EGL_static GLESv2_static khrn_static DESTINATION lib)
++install(TARGETS EGL GLESv2 OpenVG WFC khrn_client DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(TARGETS EGL_static GLESv2_static khrn_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+ # recommended names to use to avoid conflicts with mesa libs
+ add_library(brcmEGL ${SHARED} ${EGL_SOURCE})
+@@ -140,4 +140,4 @@ target_link_libraries(brcmGLESv2 brcmEGL khrn_client vcos)
+ target_link_libraries(brcmWFC brcmEGL)
+ target_link_libraries(brcmOpenVG brcmEGL)
+
+-install(TARGETS brcmEGL brcmGLESv2 brcmOpenVG brcmWFC DESTINATION lib)
++install(TARGETS brcmEGL brcmGLESv2 brcmOpenVG brcmWFC DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/interface/mmal/CMakeLists.txt b/interface/mmal/CMakeLists.txt
+index c5c1642..fe784e8 100644
+--- a/interface/mmal/CMakeLists.txt
++++ b/interface/mmal/CMakeLists.txt
+@@ -16,7 +16,7 @@ add_subdirectory(client)
+
+ target_link_libraries(mmal mmal_core mmal_util mmal_vc_client vcos mmal_components)
+
+-install(TARGETS mmal DESTINATION lib)
++install(TARGETS mmal DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal.h
+ mmal_buffer.h
+@@ -36,7 +36,7 @@ install(FILES
+ mmal_pool.h mmal_port.h
+ mmal_queue.h
+ mmal_types.h
+- DESTINATION include/interface/mmal
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal
+ )
+
+ # Test apps
+diff --git a/interface/mmal/components/CMakeLists.txt b/interface/mmal/components/CMakeLists.txt
+index d65fa37..4c85de0 100644
+--- a/interface/mmal/components/CMakeLists.txt
++++ b/interface/mmal/components/CMakeLists.txt
+@@ -30,5 +30,5 @@ set(container_libs ${container_libs} containers)
+ target_link_libraries(mmal_components ${container_libs} mmal_util)
+ target_link_libraries(mmal_components mmal_core)
+
+-install(TARGETS mmal_components DESTINATION lib)
++install(TARGETS mmal_components DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/interface/mmal/core/CMakeLists.txt b/interface/mmal/core/CMakeLists.txt
+index efa14d9..4fe0779 100644
+--- a/interface/mmal/core/CMakeLists.txt
++++ b/interface/mmal/core/CMakeLists.txt
+@@ -13,7 +13,7 @@ add_library (mmal_core ${LIBRARY_TYPE}
+
+ target_link_libraries (mmal_core vcos mmal_vc_client)
+
+-install(TARGETS mmal_core DESTINATION lib)
++install(TARGETS mmal_core DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_buffer_private.h
+ mmal_clock_private.h
+@@ -21,5 +21,5 @@ install(FILES
+ mmal_core_private.h
+ mmal_port_private.h
+ mmal_events_private.h
+- DESTINATION include/interface/mmal/core
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/core
+ )
+diff --git a/interface/mmal/util/CMakeLists.txt b/interface/mmal/util/CMakeLists.txt
+index b2a6858..e51afd0 100644
+--- a/interface/mmal/util/CMakeLists.txt
++++ b/interface/mmal/util/CMakeLists.txt
+@@ -12,7 +12,7 @@ add_library (mmal_util ${LIBRARY_TYPE}
+
+ target_link_libraries (mmal_util vcos)
+
+-install(TARGETS mmal_util DESTINATION lib)
++install(TARGETS mmal_util DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_component_wrapper.h
+ mmal_connection.h
+@@ -24,5 +24,5 @@ install(FILES
+ mmal_util.h
+ mmal_util_params.h
+ mmal_util_rational.h
+- DESTINATION include/interface/mmal/util
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/util
+ )
+diff --git a/interface/mmal/vc/CMakeLists.txt b/interface/mmal/vc/CMakeLists.txt
+index d6e80db..3b9ec64 100644
+--- a/interface/mmal/vc/CMakeLists.txt
++++ b/interface/mmal/vc/CMakeLists.txt
+@@ -8,12 +8,12 @@ target_link_libraries(mmal_vc_client vchiq_arm vcos vcsm)
+ if(BUILD_MMAL_APPS)
+ add_executable(mmal_vc_diag mmal_vc_diag.c)
+ target_link_libraries(mmal_vc_diag mmal mmal_vc_client debug_sym vcos)
+-install(TARGETS mmal_vc_diag RUNTIME DESTINATION bin)
++install(TARGETS mmal_vc_diag RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif(BUILD_MMAL_APPS)
+
+ include_directories ( ../../../host_applications/linux/libs/sm )
+
+-install(TARGETS mmal_vc_client DESTINATION lib)
++install(TARGETS mmal_vc_client DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ install(FILES
+ mmal_vc_api.h
+ mmal_vc_api_drm.h
+@@ -22,5 +22,5 @@ install(FILES
+ mmal_vc_msgs.h
+ mmal_vc_opaque_alloc.h
+ mmal_vc_shm.h
+- DESTINATION include/interface/mmal/vc
++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/mmal/vc
+ )
+diff --git a/interface/vchiq_arm/CMakeLists.txt b/interface/vchiq_arm/CMakeLists.txt
+index 7af383d..e5a3224 100644
+--- a/interface/vchiq_arm/CMakeLists.txt
++++ b/interface/vchiq_arm/CMakeLists.txt
+@@ -5,7 +5,7 @@ add_library(vchiq_arm SHARED
+ # pull in VCHI cond variable emulation
+ target_link_libraries(vchiq_arm vcos)
+
+-install(TARGETS vchiq_arm DESTINATION lib)
++install(TARGETS vchiq_arm DESTINATION ${CMAKE_INSTALL_LIBDIR})
+ #install(FILES etc/10-vchiq.rules DESTINATION /etc/udev/rules.d)
+
+ include_directories(../..)
+@@ -17,4 +17,4 @@ target_link_libraries(vchiq_test
+ vchiq_arm
+ vcos)
+
+-install(TARGETS vchiq_test RUNTIME DESTINATION bin)
++install(TARGETS vchiq_test RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/interface/vcos/CMakeLists.txt b/interface/vcos/CMakeLists.txt
+index 23a8d72..b0924a4 100644
+--- a/interface/vcos/CMakeLists.txt
++++ b/interface/vcos/CMakeLists.txt
+@@ -65,4 +65,4 @@ if (WIN32)
+ configure_file (build_all.bat.in build_all.bat @ONLY)
+ endif ()
+
+-#install (FILES ${HEADERS} DESTINATION include/interface/vcos)
++#install (FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcos)
+diff --git a/interface/vcos/generic/CMakeLists.txt b/interface/vcos/generic/CMakeLists.txt
+index c09f376..8af98fd 100644
+--- a/interface/vcos/generic/CMakeLists.txt
++++ b/interface/vcos/generic/CMakeLists.txt
+@@ -18,4 +18,4 @@ foreach (header ${HEADERS})
+ configure_file ("${header}" "${VCOS_HEADERS_BUILD_DIR}/generic/${header}" COPYONLY)
+ endforeach ()
+
+-install (FILES ${HEADERS} DESTINATION include/interface/vcos/generic)
++install (FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/interface/vcos/generic)
+diff --git a/interface/vcos/pthreads/CMakeLists.txt b/interface/vcos/pthreads/CMakeLists.txt
+index d6cd415..821b3f3 100644
+--- a/interface/vcos/pthreads/CMakeLists.txt
++++ b/interface/vcos/pthreads/CMakeLists.txt
+@@ -50,5 +50,5 @@ else ()
+ endif ()
+
+
+-#install(FILES ${HEADERS} DESTINATION include)
+-install(TARGETS vcos DESTINATION lib)
++#install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
++install(TARGETS vcos DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/interface/vmcs_host/CMakeLists.txt b/interface/vmcs_host/CMakeLists.txt
+index 76813c9..0984d8a 100755
+--- a/interface/vmcs_host/CMakeLists.txt
++++ b/interface/vmcs_host/CMakeLists.txt
+@@ -35,5 +35,5 @@ target_link_libraries(vchostif vchiq_arm vcos)
+
+ #target_link_libraries(bufman WFC)
+
+-install(TARGETS ${INSTALL_TARGETS} DESTINATION lib)
++install(TARGETS ${INSTALL_TARGETS} DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+diff --git a/makefiles/cmake/vmcs.cmake b/makefiles/cmake/vmcs.cmake
+index 7c97463..a1eb911 100644
+--- a/makefiles/cmake/vmcs.cmake
++++ b/makefiles/cmake/vmcs.cmake
+@@ -16,7 +16,7 @@ endif()
+ SET(CMAKE_INSTALL_PREFIX "${VMCS_INSTALL_PREFIX}" CACHE INTERNAL "Prefix
+ prepended to install directories" FORCE)
+ if(NOT DEFINED VMCS_PLUGIN_DIR)
+- SET(VMCS_PLUGIN_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_SHARED_LIBRARY_PREFIX}/plugins)
++ SET(VMCS_PLUGIN_DIR ${CMAKE_INSTALL_LIBDIR}/plugins)
+ endif()
+
+ # What kind of system are we?
+diff --git a/middleware/openmaxil/CMakeLists.txt b/middleware/openmaxil/CMakeLists.txt
+index 3e9c5f9..c063740 100644
+--- a/middleware/openmaxil/CMakeLists.txt
++++ b/middleware/openmaxil/CMakeLists.txt
+@@ -49,4 +49,4 @@ else ()
+
+ endif ()
+
+-install (TARGETS openmaxil DESTINATION lib)
++install (TARGETS openmaxil DESTINATION ${CMAKE_INSTALL_LIBDIR})
+diff --git a/pkgconfig/bcm_host.pc.in b/pkgconfig/bcm_host.pc.in
+index c7237c5..2988b42 100644
+--- a/pkgconfig/bcm_host.pc.in
++++ b/pkgconfig/bcm_host.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: bcm_host
+diff --git a/pkgconfig/brcmegl.pc.in b/pkgconfig/brcmegl.pc.in
+index 5dd3d5b..a45bf22 100644
+--- a/pkgconfig/brcmegl.pc.in
++++ b/pkgconfig/brcmegl.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmEGL
+diff --git a/pkgconfig/brcmglesv2.pc.in b/pkgconfig/brcmglesv2.pc.in
+index e0e36f5..902fbf3 100644
+--- a/pkgconfig/brcmglesv2.pc.in
++++ b/pkgconfig/brcmglesv2.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmGLESv2
+diff --git a/pkgconfig/brcmvg.pc.in b/pkgconfig/brcmvg.pc.in
+index 763a44b..98489ee 100644
+--- a/pkgconfig/brcmvg.pc.in
++++ b/pkgconfig/brcmvg.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: brcmOpenVG
+diff --git a/pkgconfig/mmal.pc.in b/pkgconfig/mmal.pc.in
+index 37d344c..1ffa4f5 100644
+--- a/pkgconfig/mmal.pc.in
++++ b/pkgconfig/mmal.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: MMAL
+diff --git a/pkgconfig/vcsm.pc.in b/pkgconfig/vcsm.pc.in
+index b12c56f..6f762cb 100644
+--- a/pkgconfig/vcsm.pc.in
++++ b/pkgconfig/vcsm.pc.in
+@@ -1,6 +1,6 @@
+ prefix=@CMAKE_INSTALL_PREFIX@
+ exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
++libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+ includedir=${prefix}/include
+
+ Name: VCSM
diff --git a/recipes-graphics/userland/userland_git.bb b/recipes-graphics/userland/userland_git.bb
index ead81f0..bd50bf8 100644
--- a/recipes-graphics/userland/userland_git.bb
+++ b/recipes-graphics/userland/userland_git.bb
@@ -8,19 +8,20 @@ LIC_FILES_CHKSUM = "file://LICENCE;md5=0448d6488ef8cc380632b1569ee6d196"
PROVIDES += "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "", "virtual/libgles2 virtual/egl", d)}"
PROVIDES += "virtual/libomxil"
-RPROVIDES_${PN} += "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "", "libgles2 egl libegl libegl1 libglesv2-2", d)}"
+RPROVIDES:${PN} += "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "", "libgles2 egl libegl libegl1 libglesv2-2", d)}"
COMPATIBLE_MACHINE = "^rpi$"
SRCBRANCH = "master"
SRCFORK = "raspberrypi"
-SRCREV = "3fd8527eefd8790b4e8393458efc5f94eb21a615"
+SRCREV = "cc1ca18fb0689b01cc2ca2aa4b400dcee624a213"
# Use the date of the above commit as the package version. Update this when
# SRCREV is changed.
-PV = "20210319"
+PV = "20230419"
SRC_URI = "\
- git://github.com/${SRCFORK}/userland.git;protocol=git;branch=${SRCBRANCH} \
+ git://github.com/${SRCFORK}/userland.git;protocol=https;branch=${SRCBRANCH} \
+ file://0001-mmal-Do-not-use-Werror.patch \
file://0001-Allow-applications-to-set-next-resource-handle.patch \
file://0002-wayland-Add-support-for-the-Wayland-winsys.patch \
file://0003-wayland-Add-Wayland-example.patch \
@@ -45,9 +46,10 @@ SRC_URI = "\
file://0022-all-host_applications-remove-non-existent-projects.patch \
file://0023-hello_pi-optionally-build-wayland-specific-app.patch \
file://0024-userland-Sync-needed-defines-for-weston-build.patch \
+ file://0025-CMakeLists.txt-.pc-respect-CMAKE_INSTALL_LIBDIR.patch \
"
-SRC_URI_remove_toolchain-clang = "file://0021-cmake-Disable-format-overflow-warning-as-error.patch"
+SRC_URI:remove:toolchain-clang = "file://0021-cmake-Disable-format-overflow-warning-as-error.patch"
S = "${WORKDIR}/git"
@@ -59,7 +61,7 @@ EXTRA_OECMAKE = "-DCMAKE_BUILD_TYPE=Release -DCMAKE_EXE_LINKER_FLAGS='-Wl,--no-a
-DVMCS_INSTALL_PREFIX=${exec_prefix} \
"
-EXTRA_OECMAKE_append_aarch64 = " -DARM64=ON "
+EXTRA_OECMAKE:append:aarch64 = " -DARM64=ON "
PACKAGECONFIG ?= "${@bb.utils.contains('DISTRO_FEATURES', 'wayland', 'wayland', '', d)}"
@@ -67,9 +69,9 @@ PACKAGECONFIG ?= "${@bb.utils.contains('DISTRO_FEATURES', 'wayland', 'wayland',
PACKAGECONFIG[wayland] = "-DBUILD_WAYLAND=TRUE -DWAYLAND_SCANNER_EXECUTABLE:FILEPATH=${STAGING_BINDIR_NATIVE}/wayland-scanner,,wayland-native wayland"
PACKAGECONFIG[allapps] = "-DALL_APPS=true,,,"
-CFLAGS_append = " -fPIC"
+CFLAGS:append = " -fPIC -Wno-unused-but-set-variable"
-do_install_append () {
+do_install:append () {
for f in `find ${D}${includedir}/interface/vcos/ -name "*.h"`; do
sed -i 's/include "vcos_platform.h"/include "pthreads\/vcos_platform.h"/g' ${f}
sed -i 's/include "vcos_futex_mutex.h"/include "pthreads\/vcos_futex_mutex.h"/g' ${f}
@@ -88,21 +90,25 @@ do_install_append () {
ln -sf brcmegl.pc ${D}${libdir}/pkgconfig/egl.pc
ln -sf brcmvg.pc ${D}${libdir}/pkgconfig/vg.pc
fi
+ # Currently man files are installed in /usr/man instead of /usr/share/man, see comments in:
+ # https://github.com/raspberrypi/userland/commit/45a0022ac64b4d0788def3c5230c972430f6fc23
+ mkdir -pv ${D}${datadir}
+ mv -v ${D}${prefix}/man ${D}${mandir}
}
# Shared libs from userland package build aren't versioned, so we need
# to force the .so files into the runtime package (and keep them
# out of -dev package).
FILES_SOLIBSDEV = ""
-INSANE_SKIP_${PN} += "dev-so"
+INSANE_SKIP:${PN} += "dev-so"
-FILES_${PN} += " \
+FILES:${PN} += " \
${libdir}/*.so \
${libdir}/plugins"
-FILES_${PN}-dev += "${includedir} \
+FILES:${PN}-dev += "${includedir} \
${prefix}/src"
-FILES_${PN}-doc += "${datadir}/install"
-FILES_${PN}-dbg += "${libdir}/plugins/.debug"
+FILES:${PN}-doc += "${datadir}/install"
+FILES:${PN}-dbg += "${libdir}/plugins/.debug"
-RDEPENDS_${PN} += "bash"
-RDEPENDS_${PN} += "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "libegl-mesa", "", d)}"
+RDEPENDS:${PN} += "bash"
+RDEPENDS:${PN} += "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "libegl-mesa", "", d)}"
diff --git a/recipes-graphics/vc-graphics/vc-graphics.inc b/recipes-graphics/vc-graphics/vc-graphics.inc
index 540e289..e5e8f53 100644
--- a/recipes-graphics/vc-graphics/vc-graphics.inc
+++ b/recipes-graphics/vc-graphics/vc-graphics.inc
@@ -40,15 +40,15 @@ do_install () {
}
# These are proprietary binaries generated elsewhere so don't check ldflags
-INSANE_SKIP_${PN} = "ldflags"
+INSANE_SKIP:${PN} = "ldflags"
INITSCRIPT_NAME = "vchiq.sh"
INITSCRIPT_PARAMS = "start 03 S ."
-FILES_${PN} = "${bindir}/* \
+FILES:${PN} = "${bindir}/* \
${libdir}/lib*.so \
${sysconfdir}/init.d \
${libdir}/plugins"
-FILES_${PN}-dev = "${libdir}/pkgconfig \
+FILES:${PN}-dev = "${libdir}/pkgconfig \
${includedir}"
-FILES_${PN}-dbg += "${libdir}/plugins/.debug"
+FILES:${PN}-dbg += "${libdir}/plugins/.debug"
diff --git a/recipes-graphics/wayland/wayland_%.bbappend b/recipes-graphics/wayland/wayland_%.bbappend
index e5bbf4a..ca2b296 100644
--- a/recipes-graphics/wayland/wayland_%.bbappend
+++ b/recipes-graphics/wayland/wayland_%.bbappend
@@ -1,5 +1,5 @@
# until fully tested, prefer `libwayland-egl` provided by `userland` instead of `wayland` when not using vc4graphics
-do_install_append_rpi () {
+do_install:append:rpi () {
if [ "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "1", "0", d)}" = "0" ]; then
rm -f ${D}${libdir}/libwayland-egl*
rm -f ${D}${libdir}/pkgconfig/wayland-egl.pc
diff --git a/recipes-graphics/wayland/weston-init.bbappend b/recipes-graphics/wayland/weston-init.bbappend
new file mode 100644
index 0000000..b6d6f2e
--- /dev/null
+++ b/recipes-graphics/wayland/weston-init.bbappend
@@ -0,0 +1,10 @@
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
+
+do_install:append:rpi() {
+ if [ -e ${D}/${sysconfdir}/init.d/weston ]; then
+ sed -i 's#weston-start --#weston-start -- --continue-without-input#' ${D}/${sysconfdir}/init.d/weston
+ fi
+ if [ -e ${D}${systemd_system_unitdir}/weston.service ]; then
+ sed -i 's#ExecStart=/usr/bin/weston#ExecStart=/usr/bin/weston --continue-without-input#' ${D}${systemd_system_unitdir}/weston.service
+ fi
+}
diff --git a/recipes-graphics/wayland/weston_%.bbappend b/recipes-graphics/wayland/weston_%.bbappend
index 55cf700..f9ed06a 100644
--- a/recipes-graphics/wayland/weston_%.bbappend
+++ b/recipes-graphics/wayland/weston_%.bbappend
@@ -1,6 +1,6 @@
-PACKAGECONFIG_remove_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'fbdev', '', d)}"
+PACKAGECONFIG:remove:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'fbdev', 'egl clients', d)}"
-EXTRA_OECONF_append_rpi = " \
+EXTRA_OECONF:append:rpi = " \
--disable-xwayland-test \
--disable-simple-egl-clients \
${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' \
diff --git a/recipes-graphics/xorg-xserver/xserver-xf86-config_%.bbappend b/recipes-graphics/xorg-xserver/xserver-xf86-config_%.bbappend
index 71e0adc..dfcd072 100644
--- a/recipes-graphics/xorg-xserver/xserver-xf86-config_%.bbappend
+++ b/recipes-graphics/xorg-xserver/xserver-xf86-config_%.bbappend
@@ -1,10 +1,10 @@
-FILESEXTRAPATHS_prepend := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend := "${THISDIR}/${PN}:"
-SRC_URI_append_rpi = " \
+SRC_URI:append:rpi = " \
file://xorg.conf.d/98-pitft.conf \
file://xorg.conf.d/99-calibration.conf \
"
-do_install_append_rpi () {
+do_install:append:rpi () {
PITFT="${@bb.utils.contains("MACHINE_FEATURES", "pitft", "1", "0", d)}"
if [ "${PITFT}" = "1" ]; then
install -d ${D}/${sysconfdir}/X11/xorg.conf.d/
@@ -13,4 +13,4 @@ do_install_append_rpi () {
fi
}
-FILES_${PN}_rpi += "${sysconfdir}/X11/xorg.conf ${sysconfdir}/X11/xorg.conf.d/*"
+FILES:${PN}:append:rpi = " ${sysconfdir}/X11/xorg.conf.d/*"
diff --git a/recipes-graphics/xorg-xserver/xserver-xorg_%.bbappend b/recipes-graphics/xorg-xserver/xserver-xorg_%.bbappend
index 9574fa5..ee4812f 100644
--- a/recipes-graphics/xorg-xserver/xserver-xorg_%.bbappend
+++ b/recipes-graphics/xorg-xserver/xserver-xorg_%.bbappend
@@ -1,5 +1,5 @@
-OPENGL_PKGCONFIGS_rpi = "dri glx ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'dri3 xshmfence glamor', '', d)}"
+OPENGL_PKGCONFIGS:rpi = "dri glx ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', 'dri3 glamor', '', d)}"
# when using userland graphic KHR/khrplatform.h is provided by userland but virtual/libgl is provided by mesa-gl where
# we explicitly delete KHR/khrplatform.h since its already coming from userland package
-DEPENDS_append_rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
+DEPENDS:append:rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'userland', d)}"
diff --git a/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb b/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
index d28e2c1..bd5ed62 100644
--- a/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
+++ b/recipes-kernel/bluez-firmware-rpidistro/bluez-firmware-rpidistro_git.bb
@@ -16,16 +16,18 @@ SECTION = "kernel"
# [^1]: https://github.com/RPi-Distro/bluez-firmware/issues/1
LICENSE = "Firmware-cypress-rpidistro"
LIC_FILES_CHKSUM = "\
- file://LICENCE.cypress-rpidistro;md5=c5d12ae0b24ef7177902a8e288751a4e \
+ file://LICENCE.cypress-rpidistro;md5=be80828daf682762f392131141288a74 \
"
# These are not common licenses, set NO_GENERIC_LICENSE for them
# so that the license files will be copied from fetched source
NO_GENERIC_LICENSE[Firmware-cypress-rpidistro] = "LICENCE.cypress-rpidistro"
-SRC_URI = "git://github.com/RPi-Distro/bluez-firmware"
-SRCREV = "e7fd166981ab4bb9a36c2d1500205a078a35714d"
-PV = "1.2-4+rpt8"
+SRC_URI = " \
+ git://github.com/RPi-Distro/bluez-firmware;branch=bookworm;protocol=https \
+"
+SRCREV = "78d6a07730e2d20c035899521ab67726dc028e1c"
+PV = "1.2-9+rpt3"
S = "${WORKDIR}/git"
@@ -39,7 +41,7 @@ do_extract_lic() {
}
# Must be before both do_install and do_populate_lic. Putting it before
# their common ancestor works; other approaches do not.
-addtask extract_lic after do_unpack before do_patch
+addtask extract_lic after do_unpack before do_patch do_create_spdx
do_compile() {
:
@@ -49,51 +51,59 @@ do_install() {
install -d ${D}${nonarch_base_libdir}/firmware/brcm
cp LICENCE.cypress-rpidistro ${D}${nonarch_base_libdir}/firmware
- install -m 0644 broadcom/BCM434*.hcd ${D}${nonarch_base_libdir}/firmware/brcm/
+ install -m 0644 debian/firmware/broadcom/BCM434*.hcd ${D}${nonarch_base_libdir}/firmware/brcm/
}
PACKAGES = "\
${PN}-cypress-license \
${PN}-bcm43430a1-hcd \
${PN}-bcm43430b0-hcd \
+ ${PN}-bcm4343a2-hcd \
${PN}-bcm4345c0-hcd \
${PN}-bcm4345c5-hcd \
"
-LICENSE_${PN}-bcm43430a1-hcd = "Firmware-cypress-rpidistro"
-LICENSE_${PN}-bcm43430b0-hcd = "Firmware-cypress-rpidistro"
-LICENSE_${PN}-bcm4345c0-hcd = "Firmware-cypress-rpidistro"
-LICENSE_${PN}-bcm4345c5-hcd = "Firmware-cypress-rpidistro"
-LICENSE_${PN}-cypress-license = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm43430a1-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm43430b0-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm4343a2-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm4345c0-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-bcm4345c5-hcd = "Firmware-cypress-rpidistro"
+LICENSE:${PN}-cypress-license = "Firmware-cypress-rpidistro"
-FILES_${PN}-cypress-license = "\
+FILES:${PN}-cypress-license = "\
${nonarch_base_libdir}/firmware/LICENCE.cypress-rpidistro \
"
-FILES_${PN}-bcm43430a1-hcd = "\
+FILES:${PN}-bcm43430a1-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM43430A1.hcd \
"
-FILES_${PN}-bcm43430b0-hcd = "\
+FILES:${PN}-bcm43430b0-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM43430B0.hcd \
"
-FILES_${PN}-bcm4345c0-hcd = "\
+FILES:${PN}-bcm4343a2-hcd = "\
+ ${nonarch_base_libdir}/firmware/brcm/BCM4343A2.hcd \
+"
+FILES:${PN}-bcm4345c0-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM4345C0.hcd \
"
-FILES_${PN}-bcm4345c5-hcd = "\
+FILES:${PN}-bcm4345c5-hcd = "\
${nonarch_base_libdir}/firmware/brcm/BCM4345C5.hcd \
"
-RDEPENDS_${PN}-bcm43430a1-hcd += "${PN}-cypress-license"
-RDEPENDS_${PN}-bcm43430b0-hcd += "${PN}-cypress-license"
-RDEPENDS_${PN}-bcm4345c0-hcd += "${PN}-cypress-license"
-RDEPENDS_${PN}-bcm4345c5-hcd += "${PN}-cypress-license"
-RCONFLICTS_${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
-RREPLACES_${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
-RCONFLICTS_${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
-RREPLACES_${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
-RCONFLICTS_${PN}-bcm43435c0-hcd = "linux-firmware-bcm4345c0-hcd"
-RREPLACES_${PN}-bcm43435c0-hcd = "linux-firmware-bcm4345c0-hcd"
-RCONFLICTS_${PN}-bcm43435c5-hcd = "linux-firmware-bcm4345c5-hcd"
-RREPLACES_${PN}-bcm43435c5-hcd = "linux-firmware-bcm4345c5-hcd"
+RDEPENDS:${PN}-bcm43430a1-hcd += "${PN}-cypress-license"
+RDEPENDS:${PN}-bcm43430b0-hcd += "${PN}-cypress-license"
+RDEPENDS:${PN}-bcm4343a2-hcd += "${PN}-cypress-license"
+RDEPENDS:${PN}-bcm4345c0-hcd += "${PN}-cypress-license"
+RDEPENDS:${PN}-bcm4345c5-hcd += "${PN}-cypress-license"
+RCONFLICTS:${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
+RREPLACES:${PN}-bcm43430a1-hcd = "linux-firmware-bcm43430a1-hcd"
+RCONFLICTS:${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
+RREPLACES:${PN}-bcm43430b0-hcd = "linux-firmware-bcm43430b0-hcd"
+RCONFLICTS:${PN}-bcm4343a2-hcd = "linux-firmware-bcm4343a2-hcd"
+RREPLACES:${PN}-bcm4343a2-hcd = "linux-firmware-bcm4343a2-hcd"
+RCONFLICTS:${PN}-bcm4345c0-hcd = "linux-firmware-bcm4345c0-hcd"
+RREPLACES:${PN}-bcm4345c0-hcd = "linux-firmware-bcm4345c0-hcd"
+RCONFLICTS:${PN}-bcm4345c5-hcd = "linux-firmware-bcm4345c5-hcd"
+RREPLACES:${PN}-bcm4345c5-hcd = "linux-firmware-bcm4345c5-hcd"
# Firmware files are generally not run on the CPU, so they can be
# allarch despite being architecture specific
diff --git a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch
new file mode 100644
index 0000000..f67d95b
--- /dev/null
+++ b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro/0001-Default-43455-firmware-to-standard-variant.patch
@@ -0,0 +1,28 @@
+From b9db43e36ad0942d33cb4db5b394abd722862568 Mon Sep 17 00:00:00 2001
+From: Andrei Gherzan <andrei.gherzan@huawei.com>
+Date: Fri, 9 Sep 2022 20:28:06 +0200
+Subject: [PATCH] Default 43455 firmware to standard variant
+
+The firmware for 43455 is loaded as a symlink: brcmfmac43455-sdio.bin.
+This symlink is now broken as the debian package handles the right
+target of this symlink through a postinstall. We don't have that logic
+here so we default to the standard variant.
+
+Upstream-Status: Inappropriate [issue reported at https://github.com/RPi-Distro/firmware-nonfree/issues/26]
+Signed-off-by: Andrei Gherzan <andrei.gherzan@huawei.com>
+---
+ debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin b/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
+index 9c39208..b914838 120000
+--- a/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
++++ b/debian/config/brcm80211/brcm/brcmfmac43455-sdio.bin
+@@ -1 +1 @@
+-../cypress/cyfmac43455-sdio.bin
+\ No newline at end of file
++../cypress/cyfmac43455-sdio-standard.bin
+\ No newline at end of file
+--
+2.25.1
+
diff --git a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
index a091585..959513d 100644
--- a/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
+++ b/recipes-kernel/linux-firmware-rpidistro/linux-firmware-rpidistro_git.bb
@@ -5,111 +5,103 @@ to linux-firmware for general use."
HOMEPAGE = "https://github.com/RPi-Distro/firmware-nonfree"
SECTION = "kernel"
-# In maintained upstream linux-firmware:
-# * brcmfmac43430-sdio falls under LICENCE.cypress
-# * brcmfmac43455-sdio falls under LICENCE.broadcom_bcm43xx
-# * brcmfmac43456-sdio falls under LICENCE.broadcom_bcm43xx
-#
-# It is likely[^1] that both of these should be under LICENCE.cypress.
-# Further, at this time the text of LICENCE.broadcom_bcm43xx is the same
-# in linux-firmware and RPi-Distro/firmware-nonfree, but this may
-# change.
-#
-# Rather than make assumptions about what's supposed to be what, we'll
-# use the license implied by the source of these files, named to avoid
-# conflicts with linux-firmware.
-#
-# [^1]: https://github.com/RPi-Distro/bluez-firmware/issues/1
-LICENSE = "\
- Firmware-broadcom_bcm43xx-rpidistro \
- & WHENCE \
-"
+LICENSE = "GPL-2.0-only & binary-redist-Cypress-rpidistro & Synaptics-rpidistro"
LIC_FILES_CHKSUM = "\
- file://LICENCE.broadcom_bcm43xx;md5=3160c14df7228891b868060e1951dfbc \
- file://WHENCE;md5=7b12b2224438186e4c97c4c7f3a5cc28 \
+ file://debian/copyright;md5=291ee5385b4cf74b10c5fb5a46a7bbc6 \
"
+# These are not common licenses; set NO_GENERIC_LICENSE so that the
+# license files will be copied from the fetched source.
+NO_GENERIC_LICENSE[binary-redist-Cypress-rpidistro] = "debian/copyright"
+NO_GENERIC_LICENSE[Synaptics-rpidistro] = "debian/copyright"
+LICENSE_FLAGS = "synaptics-killswitch"
-# These are not common licenses, set NO_GENERIC_LICENSE for them
-# so that the license files will be copied from fetched source
-NO_GENERIC_LICENSE[Firmware-broadcom_bcm43xx-rpidistro] = "LICENCE.broadcom_bcm43xx"
-NO_GENERIC_LICENSE[WHENCE] = "WHENCE"
-
-SRC_URI = "git://github.com/RPi-Distro/firmware-nonfree"
-
-SRCREV = "83938f78ca2d5a0ffe0c223bb96d72ccc7b71ca5"
-PV = "20190114-1+rpt11"
-
+SRC_URI = "git://github.com/RPi-Distro/firmware-nonfree;branch=bookworm;protocol=https \
+ file://0001-Default-43455-firmware-to-standard-variant.patch \
+"
+SRCREV = "223ccf3a3ddb11b3ea829749fbbba4d65b380897"
+PV = "20230625-2+rpt2"
S = "${WORKDIR}/git"
inherit allarch
-CLEANBROKEN = "1"
-
-do_compile() {
- :
-}
+do_configure[noexec] = "1"
+do_compile[noexec] = "1"
do_install() {
- install -d ${D}${nonarch_base_libdir}/firmware/brcm
+ install -d ${D}${nonarch_base_libdir}/firmware/brcm ${D}${nonarch_base_libdir}/firmware/cypress
- cp ./LICENCE.broadcom_bcm43xx ${D}${nonarch_base_libdir}/firmware/LICENCE.broadcom_bcm43xx-rpidistro
+ cp debian/copyright ${D}${nonarch_base_libdir}/firmware/copyright.firmware-nonfree-rpidistro
- # Replace outdated linux-firmware files with updated ones from
- # raspbian firmware-nonfree. Raspbian adds blobs and nvram
- # definitions that are also necessary so copy those too.
- for fw in brcmfmac43430-sdio brcmfmac43455-sdio brcmfmac43456-sdio ; do
- install -m 0644 brcm/${fw}.* ${D}${nonarch_base_libdir}/firmware/brcm/
+ for fw in \
+ brcmfmac43430-sdio \
+ brcmfmac43436-sdio \
+ brcmfmac43436s-sdio \
+ brcmfmac43455-sdio \
+ brcmfmac43456-sdio; do
+ cp -R --no-dereference --preserve=mode,links -v debian/config/brcm80211/brcm/${fw}.* ${D}${nonarch_base_libdir}/firmware/brcm/
done
- # add compat links. Fixes errors like
- # brcmfmac mmc1:0001:1: Direct firmware load for brcm/brcmfmac43455-sdio.raspberrypi,4-model-b.txt failed with error -2
- ln -s brcmfmac43455-sdio.txt ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43455-sdio.raspberrypi,4-model-b.txt
- ln -s brcmfmac43455-sdio.txt ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43455-sdio.raspberrypi,3-model-b-plus.txt
- ln -s brcmfmac43430-sdio.txt ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43430-sdio.raspberrypi,3-model-b.txt
- ln -s brcmfmac43430-sdio.txt ${D}${nonarch_base_libdir}/firmware/brcm/brcmfmac43430-sdio.raspberrypi,model-zero-w.txt
+
+ cp -R --no-dereference --preserve=mode,links -v debian/config/brcm80211/cypress/* ${D}${nonarch_base_libdir}/firmware/cypress/
+
+ rm ${D}${nonarch_base_libdir}/firmware/cypress/README.txt
}
PACKAGES = "\
- ${PN}-broadcom-license \
${PN}-bcm43430 \
+ ${PN}-bcm43436 \
+ ${PN}-bcm43436s \
+ ${PN}-bcm43439 \
${PN}-bcm43455 \
${PN}-bcm43456 \
+ ${PN}-license \
"
-LICENSE_${PN}-bcm43430 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE_${PN}-bcm43455 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE_${PN}-bcm43456 = "Firmware-broadcom_bcm43xx-rpidistro"
-LICENSE_${PN}-broadcom-license = "Firmware-broadcom_bcm43xx-rpidistro"
-FILES_${PN}-broadcom-license = "${nonarch_base_libdir}/firmware/LICENCE.broadcom_bcm43xx-rpidistro"
-FILES_${PN}-bcm43430 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43430*"
-FILES_${PN}-bcm43455 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43455*"
-FILES_${PN}-bcm43456 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43456*"
-RDEPENDS_${PN}-bcm43430 += "${PN}-broadcom-license"
-RDEPENDS_${PN}-bcm43455 += "${PN}-broadcom-license"
-RDEPENDS_${PN}-bcm43456 += "${PN}-broadcom-license"
-RCONFLICTS_${PN}-bcm43430 = "\
- linux-firmware-bcm43430 \
- linux-firmware-raspbian-bcm43430 \
-"
-RREPLACES_${PN}-bcm43430 = "\
- linux-firmware-bcm43430 \
- linux-firmware-raspbian-bcm43430 \
-"
-RCONFLICTS_${PN}-bcm43455 = "\
- linux-firmware-bcm43455 \
- linux-firmware-raspbian-bcm43455 \
-"
-RREPLACES_${PN}-bcm43455 = "\
- linux-firmware-bcm43455 \
- linux-firmware-raspbian-bcm43455 \
+LICENSE:${PN}-bcm43430 = "binary-redist-Cypress-rpidistro"
+LICENSE:${PN}-bcm43436 = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43436s = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43439 = "Synaptics-rpidistro"
+LICENSE:${PN}-bcm43455 = "binary-redist-Cypress-rpidistro"
+LICENSE:${PN}-bcm43456 = "Synaptics-rpidistro"
+LICENSE:${PN}-license = "GPL-2.0-only"
+
+FILES:${PN}-bcm43430 = " \
+ ${nonarch_base_libdir}/firmware/brcm/brcmfmac43430* \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.bin \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43430-sdio.clm_blob \
"
-RCONFLICTS_${PN}-bcm43456 = "\
- linux-firmware-bcm43456 \
- linux-firmware-raspbian-bcm43456 \
+FILES:${PN}-bcm43436 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43436-*"
+FILES:${PN}-bcm43436s = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43436s*"
+FILES:${PN}-bcm43439 = " \
+ ${nonarch_base_libdir}/firmware/cypress/43439A0-7.95.49.00.combined \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43439-sdio* \
"
-RREPLACES_${PN}-bcm43456 = "\
- linux-firmware-bcm43456 \
- linux-firmware-raspbian-bcm43456 \
+FILES:${PN}-bcm43455 = " \
+ ${nonarch_base_libdir}/firmware/brcm/brcmfmac43455* \
+ ${nonarch_base_libdir}/firmware/cypress/cyfmac43455-sdio* \
"
+FILES:${PN}-bcm43456 = "${nonarch_base_libdir}/firmware/brcm/brcmfmac43456*"
+FILES:${PN}-license = "${nonarch_base_libdir}/firmware/copyright.firmware-nonfree-rpidistro"
+
+RDEPENDS:${PN}-bcm43430 += "${PN}-license"
+RDEPENDS:${PN}-bcm43436 += "${PN}-license"
+RDEPENDS:${PN}-bcm43436s += "${PN}-license"
+RDEPENDS:${PN}-bcm43439 += "${PN}-license"
+RDEPENDS:${PN}-bcm43455 += "${PN}-license"
+RDEPENDS:${PN}-bcm43456 += "${PN}-license"
+
+RCONFLICTS:${PN}-bcm43430 = "linux-firmware-raspbian-bcm43430"
+RCONFLICTS:${PN}-bcm43436 = "linux-firmware-bcm43436"
+RCONFLICTS:${PN}-bcm43436s = "linux-firmware-bcm43436s"
+RCONFLICTS:${PN}-bcm43439 = "linux-firmware-bcm43439"
+RCONFLICTS:${PN}-bcm43455 = "linux-firmware-bcm43455"
+RCONFLICTS:${PN}-bcm43456 = "linux-firmware-bcm43456"
+
+RREPLACES:${PN}-bcm43430 = "linux-firmware-bcm43430"
+RREPLACES:${PN}-bcm43436 = "linux-firmware-bcm43436"
+RREPLACES:${PN}-bcm43436s = "linux-firmware-bcm43436s"
+RREPLACES:${PN}-bcm43439 = "linux-firmware-bcm43439"
+RREPLACES:${PN}-bcm43455 = "linux-firmware-bcm43455"
+RREPLACES:${PN}-bcm43456 = "linux-firmware-bcm43456"
# Firmware files are generally not run on the CPU, so they can be
# allarch despite being architecture specific
diff --git a/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch b/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch
deleted file mode 100644
index 66efde1..0000000
--- a/recipes-kernel/linux/files/0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 754e3030788702c1f013a88a4fc8546742d84e27 Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Thu, 18 Jun 2020 13:45:04 -0700
-Subject: [PATCH] Revert "selftests/bpf: Skip perf hw events test if the setup
- disabled it"
-
-This reverts commit da43712a7262891317883d4b3a909fb18dac4b1d.
-
-Signed-off-by: Khem Raj <raj.khem@gmail.com>
----
- .../selftests/bpf/prog_tests/stacktrace_build_id_nmi.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
-diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-index 437cb93e72ac..f62aa0eb959b 100644
---- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
-@@ -49,12 +49,8 @@ void test_stacktrace_build_id_nmi(void)
- pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
- 0 /* cpu 0 */, -1 /* group id */,
- 0 /* flags */);
-- if (pmu_fd < 0 && errno == ENOENT) {
-- printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__);
-- test__skip();
-- goto close_prog;
-- }
-- if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
-+ if (CHECK(pmu_fd < 0, "perf_event_open",
-+ "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
- pmu_fd, errno))
- goto close_prog;
-
---
-2.27.0
-
diff --git a/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch b/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
new file mode 100644
index 0000000..4f64687
--- /dev/null
+++ b/recipes-kernel/linux/files/0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
@@ -0,0 +1,50 @@
+From 32f53700aeef2f5c7797ddda66348fc0b29e1047 Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Wed, 18 Jan 2023 12:21:35 -0800
+Subject: [PATCH] gcc-plugins: Reorganize gimple includes for GCC 13
+
+The gimple-iterator.h header must be included before gimple-fold.h
+starting with GCC 13. Reorganize gimple headers to work for all GCC
+versions.
+
+Reported-by: Palmer Dabbelt <palmer@rivosinc.com>
+Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
+Link: https://lore.kernel.org/all/20230113173033.4380-1-palmer@rivosinc.com/
+Cc: linux-hardening@vger.kernel.org
+Signed-off-by: Kees Cook <keescook@chromium.org>
+---
+Upstream-Status: Pending
+
+ scripts/gcc-plugins/gcc-common.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
+index 0c087614fc3e..27770c31214c 100644
+--- a/scripts/gcc-plugins/gcc-common.h
++++ b/scripts/gcc-plugins/gcc-common.h
+@@ -77,8 +77,10 @@
+ #include "varasm.h"
+ #include "stor-layout.h"
+ #include "internal-fn.h"
++#include "gimple.h"
+ #include "gimple-expr.h"
+ #include "gimple-fold.h"
++#include "gimple-iterator.h"
+ #include "context.h"
+ #include "tree-ssa-alias.h"
+ #include "tree-ssa.h"
+@@ -91,11 +93,9 @@
+ #include "tree-eh.h"
+ #include "stmt.h"
+ #include "gimplify.h"
+-#include "gimple.h"
+ #include "tree-ssa-operands.h"
+ #include "tree-phinodes.h"
+ #include "tree-cfg.h"
+-#include "gimple-iterator.h"
+ #include "gimple-ssa.h"
+ #include "ssa-iterators.h"
+
+--
+2.39.1
+
diff --git a/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch b/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch
deleted file mode 100644
index d18b942..0000000
--- a/recipes-kernel/linux/files/0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch
+++ /dev/null
@@ -1,94 +0,0 @@
-From 366487b86a8c87954fb4ab7bd88ab49a929a32f6 Mon Sep 17 00:00:00 2001
-From: Khem Raj <raj.khem@gmail.com>
-Date: Mon, 13 Apr 2020 11:25:58 -0700
-Subject: [PATCH 2/2] Revert "selftests/bpf: Fix perf_buffer test on systems w/
- offline CPUs"
-
-This reverts commit 77bb53cb094828a31cd3c5b402899810f63073c1.
----
- .../selftests/bpf/prog_tests/perf_buffer.c | 29 ++++---------------
- 1 file changed, 5 insertions(+), 24 deletions(-)
-
-diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-index cf6c87936c69..3003fddc0613 100644
---- a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
-@@ -4,7 +4,6 @@
- #include <sched.h>
- #include <sys/socket.h>
- #include <test_progs.h>
--#include "libbpf_internal.h"
-
- static void on_sample(void *ctx, int cpu, void *data, __u32 size)
- {
-@@ -20,7 +19,7 @@ static void on_sample(void *ctx, int cpu, void *data, __u32 size)
-
- void test_perf_buffer(void)
- {
-- int err, prog_fd, on_len, nr_on_cpus = 0, nr_cpus, i, duration = 0;
-+ int err, prog_fd, nr_cpus, i, duration = 0;
- const char *prog_name = "kprobe/sys_nanosleep";
- const char *file = "./test_perf_buffer.o";
- struct perf_buffer_opts pb_opts = {};
-@@ -30,27 +29,15 @@ void test_perf_buffer(void)
- struct bpf_object *obj;
- struct perf_buffer *pb;
- struct bpf_link *link;
-- bool *online;
-
- nr_cpus = libbpf_num_possible_cpus();
- if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
- return;
-
-- err = parse_cpu_mask_file("/sys/devices/system/cpu/online",
-- &online, &on_len);
-- if (CHECK(err, "nr_on_cpus", "err %d\n", err))
-- return;
--
-- for (i = 0; i < on_len; i++)
-- if (online[i])
-- nr_on_cpus++;
--
- /* load program */
- err = bpf_prog_load(file, BPF_PROG_TYPE_KPROBE, &obj, &prog_fd);
-- if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno)) {
-- obj = NULL;
-- goto out_close;
-- }
-+ if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
-+ return;
-
- prog = bpf_object__find_program_by_title(obj, prog_name);
- if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
-@@ -77,11 +64,6 @@ void test_perf_buffer(void)
- /* trigger kprobe on every CPU */
- CPU_ZERO(&cpu_seen);
- for (i = 0; i < nr_cpus; i++) {
-- if (i >= on_len || !online[i]) {
-- printf("skipping offline CPU #%d\n", i);
-- continue;
-- }
--
- CPU_ZERO(&cpu_set);
- CPU_SET(i, &cpu_set);
-
-@@ -99,8 +81,8 @@ void test_perf_buffer(void)
- if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
- goto out_free_pb;
-
-- if (CHECK(CPU_COUNT(&cpu_seen) != nr_on_cpus, "seen_cpu_cnt",
-- "expect %d, seen %d\n", nr_on_cpus, CPU_COUNT(&cpu_seen)))
-+ if (CHECK(CPU_COUNT(&cpu_seen) != nr_cpus, "seen_cpu_cnt",
-+ "expect %d, seen %d\n", nr_cpus, CPU_COUNT(&cpu_seen)))
- goto out_free_pb;
-
- out_free_pb:
-@@ -109,5 +91,4 @@ void test_perf_buffer(void)
- bpf_link__destroy(link);
- out_close:
- bpf_object__close(obj);
-- free(online);
- }
---
-2.26.0
-
diff --git a/recipes-kernel/linux/files/default-cpu-governor.cfg b/recipes-kernel/linux/files/default-cpu-governor.cfg
new file mode 100644
index 0000000..e2e201d
--- /dev/null
+++ b/recipes-kernel/linux/files/default-cpu-governor.cfg
@@ -0,0 +1,9 @@
+# The defconfigs from the RPi Kernel set "powersave" as the default CPU governor.
+# That is a bad idea as it reduces performance, so we unset that default option here.
+# The option to build the powersave governor (but not as the default) is also enabled.
+# A fix for this was sent to upstream: https://github.com/raspberrypi/linux/pull/5666
+# However, we need to carry this option override until those defconfigs are fixed on
+# *all* the kernel branches that we support. So that can be a long time depending
+# on whether the above PR gets accepted and/or backported to the stable branches.
+CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE=n
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
diff --git a/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg b/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg
new file mode 100644
index 0000000..0248162
--- /dev/null
+++ b/recipes-kernel/linux/files/raspberrypi4/rpi4-nvmem.cfg
@@ -0,0 +1 @@
+CONFIG_NVMEM_RMEM=y
diff --git a/recipes-kernel/linux/files/rpi.scc b/recipes-kernel/linux/files/rpi.scc
new file mode 100644
index 0000000..bb6fffd
--- /dev/null
+++ b/recipes-kernel/linux/files/rpi.scc
@@ -0,0 +1 @@
+patch 0001-gcc-plugins-Reorganize-gimple-includes-for-GCC-13.patch
diff --git a/recipes-kernel/linux/files/wm8960.cfg b/recipes-kernel/linux/files/wm8960.cfg
new file mode 100644
index 0000000..9360800
--- /dev/null
+++ b/recipes-kernel/linux/files/wm8960.cfg
@@ -0,0 +1,2 @@
+CONFIG_I2C_BCM2835=y
+CONFIG_SND_SOC_WM8960=y
diff --git a/recipes-kernel/linux/linux-raspberrypi-dev.bb b/recipes-kernel/linux/linux-raspberrypi-dev.bb
index 0dfa451..e0a4461 100644
--- a/recipes-kernel/linux/linux-raspberrypi-dev.bb
+++ b/recipes-kernel/linux/linux-raspberrypi-dev.bb
@@ -20,7 +20,7 @@ SRCREV_meta ?= '${@oe.utils.conditional("PREFERRED_PROVIDER_virtual/kernel", "li
KMETA = "kernel-meta"
SRC_URI = " \
- git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH} \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
file://powersave.cfg \
file://android-drivers.cfg \
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7.inc b/recipes-kernel/linux/linux-raspberrypi-v7.inc
new file mode 100644
index 0000000..77debc4
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7.inc
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+KBUILD_DEFCONFIG:raspberrypi-armv7 = "bcm2709_defconfig"
+KERNEL_PACKAGE_NAME = "${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+PROVIDES:remove = "virtual/kernel"
+
+KERNEL_IMAGETYPE_DIRECT ?= "zImage"
+
+COMPATIBLE_MACHINE = "^raspberrypi-armv7$"
+
+KERNEL_DEVICETREE = ""
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb b/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb
new file mode 100644
index 0000000..7883985
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_5.15.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_5.15.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb b/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb
new file mode 100644
index 0000000..ef77b0b
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_6.1.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_6.1.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb b/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb
new file mode 100644
index 0000000..a5695f6
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi-v7_6.6.bb
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Andrei Gherzan <andrei.gherzan@huawei.com>
+#
+# SPDX-License-Identifier: MIT
+
+require linux-raspberrypi-v7.inc
+require linux-raspberrypi_6.6.bb
diff --git a/recipes-kernel/linux/linux-raspberrypi.inc b/recipes-kernel/linux/linux-raspberrypi.inc
index 5ad9b78..e62ff3f 100644
--- a/recipes-kernel/linux/linux-raspberrypi.inc
+++ b/recipes-kernel/linux/linux-raspberrypi.inc
@@ -1,9 +1,9 @@
DESCRIPTION = "Linux Kernel for Raspberry Pi"
SECTION = "kernel"
-LICENSE = "GPLv2"
+LICENSE = "GPL-2.0-only"
LIC_FILES_CHKSUM = "file://COPYING;md5=6bc538ed5bd9a7fc9398086aedcd7e46"
-COMPATIBLE_MACHINE = "^rpi$"
+COMPATIBLE_MACHINE ?= "^rpi$"
PE = "1"
PV = "${LINUX_VERSION}+git${SRCPV}"
@@ -14,17 +14,26 @@ require recipes-kernel/linux/linux-yocto.inc
SRC_URI += " \
${@bb.utils.contains("INITRAMFS_IMAGE_BUNDLE", "1", "file://initramfs-image-bundle.cfg", "", d)} \
${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", "file://vc4graphics.cfg", "", d)} \
+ ${@bb.utils.contains("MACHINE_FEATURES", "wm8960", "file://wm8960.cfg", "", d)} \
+ file://default-cpu-governor.cfg \
"
+SRC_URI:append:raspberrypi4 = " \
+ file://rpi4-nvmem.cfg \
+"
+
KCONFIG_MODE = "--alldefconfig"
-KBUILD_DEFCONFIG_raspberrypi0-wifi ?= "bcmrpi_defconfig"
-KBUILD_DEFCONFIG_raspberrypi ?= "bcmrpi_defconfig"
-KBUILD_DEFCONFIG_raspberrypi-cm3 ?= "bcm2709_defconfig"
-KBUILD_DEFCONFIG_raspberrypi2 ?= "bcm2709_defconfig"
-KBUILD_DEFCONFIG_raspberrypi3 ?= "bcm2709_defconfig"
-KBUILD_DEFCONFIG_raspberrypi3-64 ?= "bcmrpi3_defconfig"
-KBUILD_DEFCONFIG_raspberrypi4 ?= "bcm2711_defconfig"
-KBUILD_DEFCONFIG_raspberrypi4-64 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi0-wifi ?= "bcmrpi_defconfig"
+KBUILD_DEFCONFIG:raspberrypi ?= "bcmrpi_defconfig"
+KBUILD_DEFCONFIG:raspberrypi-cm3 ?= "bcm2709_defconfig"
+KBUILD_DEFCONFIG:raspberrypi2 ?= "bcm2709_defconfig"
+KBUILD_DEFCONFIG:raspberrypi3 ?= "bcm2709_defconfig"
+KBUILD_DEFCONFIG:raspberrypi3-64 ?= "bcmrpi3_defconfig"
+KBUILD_DEFCONFIG:raspberrypi4 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi4-64 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi-armv7 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi-armv8 ?= "bcm2711_defconfig"
+KBUILD_DEFCONFIG:raspberrypi5 ?= "bcm2712_defconfig"
LINUX_VERSION_EXTENSION ?= ""
@@ -33,4 +42,7 @@ KERNEL_MODULE_AUTOLOAD += "${@bb.utils.contains("MACHINE_FEATURES", "pitft28r",
# A LOADADDR is needed when building a uImage format kernel. This value is not
# set by default in rpi-4.8.y and later branches so we need to provide it
# manually. This value unused if KERNEL_IMAGETYPE is not uImage.
-KERNEL_EXTRA_ARGS += "LOADADDR=0x00008000"
+KERNEL_EXTRA_ARGS += "LOADADDR=${UBOOT_ENTRYPOINT}"
+
+UBOOT_ENTRYPOINT = "0x00008000"
+UBOOT_LOADADDRESS = "0x00008000"
diff --git a/recipes-kernel/linux/linux-raspberrypi_5.10.bb b/recipes-kernel/linux/linux-raspberrypi_5.10.bb
deleted file mode 100644
index fc33559..0000000
--- a/recipes-kernel/linux/linux-raspberrypi_5.10.bb
+++ /dev/null
@@ -1,19 +0,0 @@
-LINUX_VERSION ?= "5.10.31"
-LINUX_RPI_BRANCH ?= "rpi-5.10.y"
-LINUX_RPI_KMETA_BRANCH ?= "yocto-5.10"
-
-SRCREV_machine = "89399e6e7e33d6260a954603ca03857df594ffd3"
-SRCREV_meta = "a19886b00ea7d874fdd60d8e3435894bb16e6434"
-
-KMETA = "kernel-meta"
-
-SRC_URI = " \
- git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH} \
- git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
- file://powersave.cfg \
- file://android-drivers.cfg \
- "
-
-require linux-raspberrypi.inc
-
-KERNEL_DTC_FLAGS += "-@ -H epapr"
diff --git a/recipes-kernel/linux/linux-raspberrypi_5.15.bb b/recipes-kernel/linux/linux-raspberrypi_5.15.bb
new file mode 100644
index 0000000..3f167bb
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi_5.15.bb
@@ -0,0 +1,32 @@
+LINUX_VERSION ?= "5.15.92"
+LINUX_RPI_BRANCH ?= "rpi-5.15.y"
+LINUX_RPI_KMETA_BRANCH ?= "yocto-5.15"
+
+SRCREV_machine = "14b35093ca68bf2c81bbc90aace5007142b40b40"
+SRCREV_meta = "509f4b9d68337f103633d48b621c1c9aa0dc975d"
+
+KMETA = "kernel-meta"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
+ git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://rpi.scc \
+ file://powersave.cfg \
+ file://android-drivers.cfg \
+ "
+
+require linux-raspberrypi.inc
+
+KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-kernel/linux/linux-raspberrypi_5.4.bb b/recipes-kernel/linux/linux-raspberrypi_5.4.bb
deleted file mode 100644
index 3432283..0000000
--- a/recipes-kernel/linux/linux-raspberrypi_5.4.bb
+++ /dev/null
@@ -1,23 +0,0 @@
-LINUX_VERSION ?= "5.4.83"
-LINUX_RPI_BRANCH ?= "rpi-5.4.y"
-LINUX_RPI_KMETA_BRANCH ?= "yocto-5.4"
-
-SRCREV_machine = "08ae2dd9e7dc89c20bff823a3ef045de09bfd090"
-SRCREV_meta = "d676bf5ff7b7071e14f44498d2482c0a596f14cd"
-
-KMETA = "kernel-meta"
-
-SRC_URI = " \
- git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH} \
- git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
- file://0001-Revert-selftests-bpf-Skip-perf-hw-events-test-if-the.patch \
- file://0002-Revert-selftests-bpf-Fix-perf_buffer-test-on-systems.patch \
- file://powersave.cfg \
- file://android-drivers.cfg \
- "
-
-require linux-raspberrypi.inc
-
-LIC_FILES_CHKSUM = "file://COPYING;md5=bbea815ee2795b2f4230826c0c6b8814"
-
-KERNEL_DTC_FLAGS += "-@ -H epapr"
diff --git a/recipes-kernel/linux/linux-raspberrypi_6.1.bb b/recipes-kernel/linux/linux-raspberrypi_6.1.bb
new file mode 100644
index 0000000..5731a81
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi_6.1.bb
@@ -0,0 +1,31 @@
+LINUX_VERSION ?= "6.1.77"
+LINUX_RPI_BRANCH ?= "rpi-6.1.y"
+LINUX_RPI_KMETA_BRANCH ?= "yocto-6.1"
+
+SRCREV_machine = "77fc1fbcb5c013329af9583307dd1ff3cd4752aa"
+SRCREV_meta = "43d1723dbe0ce7b341cf32feeb35ecbe6b0ce29a"
+
+KMETA = "kernel-meta"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
+ git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://powersave.cfg \
+ file://android-drivers.cfg \
+ "
+
+require linux-raspberrypi.inc
+
+KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-kernel/linux/linux-raspberrypi_6.6.bb b/recipes-kernel/linux/linux-raspberrypi_6.6.bb
new file mode 100644
index 0000000..b4d9953
--- /dev/null
+++ b/recipes-kernel/linux/linux-raspberrypi_6.6.bb
@@ -0,0 +1,31 @@
+LINUX_VERSION ?= "6.6.22"
+LINUX_RPI_BRANCH ?= "rpi-6.6.y"
+LINUX_RPI_KMETA_BRANCH ?= "yocto-6.6"
+
+SRCREV_machine = "c04af98514c26014a4f29ec87b3ece95626059bd"
+SRCREV_meta = "6a24861d6504575a4a9f92366285332d47c7e111"
+
+KMETA = "kernel-meta"
+
+SRC_URI = " \
+ git://github.com/raspberrypi/linux.git;name=machine;branch=${LINUX_RPI_BRANCH};protocol=https \
+ git://git.yoctoproject.org/yocto-kernel-cache;type=kmeta;name=meta;branch=${LINUX_RPI_KMETA_BRANCH};destsuffix=${KMETA} \
+ file://powersave.cfg \
+ file://android-drivers.cfg \
+ "
+
+require linux-raspberrypi.inc
+
+KERNEL_DTC_FLAGS += "-@ -H epapr"
+
+RDEPENDS:${KERNEL_PACKAGE_NAME}:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-base:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-base"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-image:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-image"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dev:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dev"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-vmlinux:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-vmlinux"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-modules:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-modules"
+RDEPENDS:${KERNEL_PACKAGE_NAME}-dbg:raspberrypi-armv7:append = " ${RASPBERRYPI_v7_KERNEL_PACKAGE_NAME}-dbg"
+
+DEPLOYDEP = ""
+DEPLOYDEP:raspberrypi-armv7 = "${RASPBERRYPI_v7_KERNEL}:do_deploy"
+do_deploy[depends] += "${DEPLOYDEP}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
index c8af7da..5e206e5 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch
@@ -27,6 +27,8 @@ arrive:
gst_omx_component_wait_message()
---
+Upstream-Status: Pending
+
omx/gstomxvideodec.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
index 4342326..db443e6 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0003-no-timeout-on-get-state.patch
@@ -3,6 +3,8 @@ From: Khem Raj <raj.khem@gmail.com>
Date: Sat, 13 Feb 2016 11:42:29 -0800
---
+Upstream-Status: Pending
+
omx/gstomxvideodec.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
index 144ced6..c0ef99d 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0004-Properly-handle-drain-requests-while-flushing.patch
@@ -7,6 +7,8 @@ Without this commit the decoder streaming thread stops without ever attending
the drain request, leaving the decoder input thread waiting forever.
---
+Upstream-Status: Pending
+
omx/gstomx.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
index 3245294..9914bb8 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx/0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch
@@ -11,7 +11,7 @@ Date: Fri, 4 Dec 2015 18:39:59 +0100
Subject: [PATCH] Don't abort gst_omx_video_dec_set_format() if there's a
timeout releasing the buffers taken by the egl_render out port
-Upstream-status: Pending
+Upstream-Status: Pending
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
---
omx/gstomxvideodec.c | 2 ++
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-omx_%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-omx_%.bbappend
index a971236..5e0b457 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-omx_%.bbappend
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-omx_%.bbappend
@@ -1,12 +1,12 @@
-FILESEXTRAPATHS_prepend_rpi := "${THISDIR}/${PN}:"
+FILESEXTRAPATHS:prepend:rpi := "${THISDIR}/${PN}:"
-SRC_URI_append_rpi = " \
+SRC_URI:append:rpi = " \
file://0001-Don-t-try-to-acquire-buffer-when-src-pad-isn-t-activ.patch \
file://0003-no-timeout-on-get-state.patch \
file://0004-Properly-handle-drain-requests-while-flushing.patch \
file://0005-Don-t-abort-gst_omx_video_dec_set_format-if-there-s-.patch \
"
-GSTREAMER_1_0_OMX_TARGET_rpi = "rpi"
-GSTREAMER_1_0_OMX_CORE_NAME_rpi = "${libdir}/libopenmaxil.so"
-EXTRA_OEMESON_append_rpi = " -Dheader_path=${STAGING_DIR_TARGET}/usr/include/IL"
+GSTREAMER_1_0_OMX_TARGET:rpi = "rpi"
+GSTREAMER_1_0_OMX_CORE_NAME:rpi = "${libdir}/libopenmaxil.so"
+EXTRA_OEMESON:append:rpi = " -Dheader_path=${STAGING_DIR_TARGET}/usr/include/IL"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
index 8ab1510..5b3f945 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-bad_%.bbappend
@@ -1,2 +1,2 @@
-PACKAGECONFIG_append_rpi = " hls libmms \
- ${@bb.utils.contains('LICENSE_FLAGS_WHITELIST', 'commercial', 'faad', '', d)}"
+PACKAGECONFIG:append:rpi = " hls \
+ ${@bb.utils.contains('LICENSE_FLAGS_ACCEPTED', 'commercial', 'faad', '', d)}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-base_%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-base_%.bbappend
index 6bed42a..834ccfb 100644
--- a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-base_%.bbappend
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-base_%.bbappend
@@ -1,6 +1,6 @@
# if using bcm driver enable dispmanx not when using VC4 driver
-PACKAGECONFIG_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' dispmanx', d)}"
-DEPENDS_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
+PACKAGECONFIG:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' dispmanx', d)}"
+DEPENDS:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' userland', d)}"
PACKAGECONFIG_GL_VC4GRAPHICS = "${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'gles2 egl', '', d)}"
-PACKAGECONFIG_GL_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '${PACKAGECONFIG_GL_VC4GRAPHICS}', 'egl gles2', d)}"
+PACKAGECONFIG_GL:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '${PACKAGECONFIG_GL_VC4GRAPHICS}', 'egl gles2', d)}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend
new file mode 100644
index 0000000..f3fb144
--- /dev/null
+++ b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_%.bbappend
@@ -0,0 +1 @@
+PACKAGECONFIG:append:rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' rpi', d)}"
diff --git a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.18.%.bbappend b/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.18.%.bbappend
deleted file mode 100644
index 80a324f..0000000
--- a/recipes-multimedia/gstreamer/gstreamer1.0-plugins-good_1.18.%.bbappend
+++ /dev/null
@@ -1 +0,0 @@
-PACKAGECONFIG_append_rpi = "${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', ' rpi', d)}"
diff --git a/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch b/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
index 37d0724..f65c421 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0001-Fix-build-with-vc4-driver.patch
@@ -5,6 +5,8 @@ Subject: [PATCH] Fix build with vc4 driver
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
SubtitleRenderer.cpp | 7 ++++++-
SubtitleRenderer.h | 1 +
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch b/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
index 82dfd3e..9e12bf3 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0001-Specify-cc-cxx-and-ld-variables-from-environment.patch
@@ -7,6 +7,8 @@ This helps in compiling with non-gcc compilers
Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
+Upstream-Status: Pending
+
Makefile.ffmpeg | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer/0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch b/recipes-multimedia/omxplayer/omxplayer/0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch
index dd1d4f3..e494650 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch
@@ -15,10 +15,8 @@ Signed-off-by: Jonathan Liu <net147@gmail.com>
Makefile.ffmpeg | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
-Index: git/Makefile
-===================================================================
---- git.orig/Makefile
-+++ git/Makefile
+--- a/Makefile
++++ b/Makefile
@@ -2,9 +2,9 @@ CFLAGS=-pipe -mfloat-abi=hard -mcpu=arm1
CFLAGS+=-std=c++0x -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS -DTARGET_POSIX -DTARGET_LINUX -fPIC -DPIC -D_REENTRANT -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_CMAKE_CONFIG -D__VIDEOCORE4__ -U_FORTIFY_SOURCE -Wall -DHAVE_OMXLIB -DUSE_EXTERNAL_FFMPEG -DHAVE_LIBAVCODEC_AVCODEC_H -DHAVE_LIBAVUTIL_OPT_H -DHAVE_LIBAVUTIL_MEM_H -DHAVE_LIBAVUTIL_AVUTIL_H -DHAVE_LIBAVFORMAT_AVFORMAT_H -DHAVE_LIBAVFILTER_AVFILTER_H -DHAVE_LIBSWRESAMPLE_SWRESAMPLE_H -DOMX -DOMX_SKIP64BIT -ftree-vectorize -DUSE_EXTERNAL_OMX -DTARGET_RASPBERRY_PI -DUSE_EXTERNAL_LIBBCM_HOST
@@ -27,11 +25,11 @@ Index: git/Makefile
+LDFLAGS+=-L./ -Lffmpeg_compiled/usr/lib/ -lc -lbrcmGLESv2 -lbrcmEGL -lbcm_host -lopenmaxil -lfreetype -lz -lasound
-INCLUDES+=-I./ -Ilinux -Iffmpeg_compiled/usr/local/include/ -I /usr/include/dbus-1.0 -I /usr/lib/arm-linux-gnueabihf/dbus-1.0/include -I/usr/include/freetype2 -isystem$(SDKSTAGE)/opt/vc/include -isystem$(SDKSTAGE)/opt/vc/include/interface/vcos/pthreads
-+INCLUDES+=-I./ -Ilinux
++INCLUDES+=-I./ -Ilinux -I./ffmpeg
DIST ?= omxplayer-dist
STRIP ?= strip
-@@ -90,7 +90,7 @@ dist: omxplayer.bin omxplayer.1
+@@ -91,7 +91,7 @@ dist: omxplayer.bin omxplayer.1
cp COPYING $(DIST)/usr/share/doc/omxplayer
cp README.md $(DIST)/usr/share/doc/omxplayer/README
cp omxplayer.1 $(DIST)/usr/share/man/man1
@@ -40,10 +38,8 @@ Index: git/Makefile
cd $(DIST); tar -czf ../$(DIST).tgz *
install:
-Index: git/Makefile.ffmpeg
-===================================================================
---- git.orig/Makefile.ffmpeg
-+++ git/Makefile.ffmpeg
+--- a/Makefile.ffmpeg
++++ b/Makefile.ffmpeg
@@ -238,7 +238,8 @@ configure:
--disable-decoder=xbin \
--disable-decoder=idf \
diff --git a/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch b/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
index f6abd7b..0dd8c62 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0005-Don-t-require-internet-connection-during-build.patch
@@ -10,7 +10,7 @@ The following issues break offline builds:
* Makefile.ffmpeg explicitly does a "git clone" from the internet.
Signed-off-by: Paul Barker <pbarker@toganlabs.com>
-Upstream-status: Inappropriate
+Upstream-Status: Inappropriate
---
Makefile | 6 ++----
diff --git a/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch b/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
index 890adde..81dab07 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0006-Prevent-ffmpeg-configure-compile-race-condition.patch
@@ -7,7 +7,7 @@ Additional dependency information is needed in Makefile.ffmpeg to ensure that
the configure stage is finished before the compile stage starts.
Signed-off-by: Paul Barker <pbarker@toganlabs.com>
-Upstream-status: Pending
+Upstream-Status: Pending
---
Makefile.ffmpeg | 4 ++--
diff --git a/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch b/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
index a8c51d5..02844db 100644
--- a/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/0007-Remove-Makefile-hardcoded-arch-tune.patch
@@ -1,3 +1,5 @@
+Upstream-Status: Pending
+
--- a/Makefile 2019-06-20 15:04:53.390282996 +0200
+++ b/Makefile 2019-06-20 15:03:45.538763872 +0200
@@ -1,4 +1,4 @@
diff --git a/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch b/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
index 20ed7c7..5d7e1e0 100644
--- a/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/cross-crompile-ffmpeg.patch
@@ -1,3 +1,5 @@
+Upstream-Status: Pending
+
Index: git/Makefile.ffmpeg
===================================================================
--- git.orig/Makefile.ffmpeg
diff --git a/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch b/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
index e580470..e778561 100644
--- a/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
+++ b/recipes-multimedia/omxplayer/omxplayer/use-native-pkg-config.patch
@@ -9,6 +9,8 @@ to the default value which obviously is wrong.
Signed-off-by: Andrei Gherzan <andrei@gherzan.ro>
---
+Upstream-Status: Pending
+
Makefile.ffmpeg | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/recipes-multimedia/omxplayer/omxplayer_git.bb b/recipes-multimedia/omxplayer/omxplayer_git.bb
index 55e5a1c..b7eaf40 100644
--- a/recipes-multimedia/omxplayer/omxplayer_git.bb
+++ b/recipes-multimedia/omxplayer/omxplayer_git.bb
@@ -4,14 +4,16 @@ Raspberry PI implementation and is quite handy to use standalone"
HOMEPAGE = "https://github.com/popcornmix/omxplayer"
SECTION = "console/utils"
-LICENSE = "GPLv2"
+LICENSE = "GPL-2.0-only"
LIC_FILES_CHKSUM = "file://COPYING;md5=00a27da7ac0f9bcd17320ec29ef4bbf6"
-DEPENDS = "libpcre libav virtual/egl boost freetype dbus openssl libssh virtual/libomxil coreutils-native curl-native userland"
+DEPENDS = "alsa-lib libpcre virtual/egl boost freetype dbus openssl libssh virtual/libomxil coreutils-native curl-native userland"
-PR = "r5"
+PR = "r6"
-SRCREV_default = "f543a0d0e707ab56415f17b0ca6d397394ee8b63"
+SRCREV_FORMAT = "_ffmpeg"
+
+SRCREV_default = "1f1d0ccd65d3a1caa86dc79d2863a8f067c8e3f8"
# omxplayer builds its own copy of ffmpeg from source instead of using the
# system's ffmpeg library. This isn't ideal but it's ok for now. We do however
@@ -22,8 +24,8 @@ SRCREV_default = "f543a0d0e707ab56415f17b0ca6d397394ee8b63"
# This SRCREV corresponds to the v4.0.3 release of ffmpeg.
SRCREV_ffmpeg = "fcbd117df3077bad495e99e20f01cf93737bce76"
-SRC_URI = "git://github.com/popcornmix/omxplayer.git;protocol=git;branch=master \
- git://github.com/FFmpeg/FFmpeg;branch=release/4.0;protocol=git;depth=1;name=ffmpeg;destsuffix=git/ffmpeg \
+SRC_URI = "git://github.com/popcornmix/omxplayer.git;protocol=https;branch=master \
+ git://github.com/FFmpeg/FFmpeg;branch=release/4.0;protocol=https;depth=1;name=ffmpeg;destsuffix=git/ffmpeg \
file://0002-Libraries-and-headers-from-ffmpeg-are-installed-in-u.patch \
file://0003-Remove-strip-step-in-Makefile.patch \
file://0004-Add-FFMPEG_EXTRA_CFLAGS-and-FFMPEG_EXTRA_LDFLAGS.patch \
@@ -36,12 +38,12 @@ SRC_URI = "git://github.com/popcornmix/omxplayer.git;protocol=git;branch=master
file://0007-Remove-Makefile-hardcoded-arch-tune.patch \
"
-SRC_URI_append = "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", " file://0001-Fix-build-with-vc4-driver.patch ", "", d)}"
+SRC_URI:append = "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", " file://0001-Fix-build-with-vc4-driver.patch ", "", d)}"
S = "${WORKDIR}/git"
COMPATIBLE_MACHINE = "^rpi$"
-COMPATIBLE_HOST_aarch64 = "null"
+COMPATIBLE_HOST:aarch64 = "null"
def cpu(d):
for arg in (d.getVar('TUNE_CCARGS') or '').split():
@@ -83,6 +85,8 @@ export INCLUDES = "${@bb.utils.contains("MACHINE_FEATURES", "vc4graphics", " -D_
export DIST = "${D}"
do_compile() {
+ bbwarn "omxplayer is being deprecated and resources are directed at improving vlc."
+
# Needed for compiler test in ffmpeg's configure
mkdir -p tmp
@@ -99,10 +103,10 @@ do_install() {
install ${S}/fonts/* ${D}${datadir}/fonts/truetype/freefont/
}
-FILES_${PN} = "${bindir}/omxplayer* \
+FILES:${PN} = "${bindir}/omxplayer* \
${libdir}/omxplayer/lib*${SOLIBS} \
${datadir}/fonts"
-FILES_${PN}-dev += "${libdir}/omxplayer/*.so"
+FILES:${PN}-dev += "${libdir}/omxplayer/*.so"
-RDEPENDS_${PN} += "bash procps userland"
+RDEPENDS:${PN} += "bash procps userland"
diff --git a/recipes-multimedia/picamera-libs/picamera-libs.bb b/recipes-multimedia/picamera-libs/picamera-libs.bb
new file mode 100644
index 0000000..f873a19
--- /dev/null
+++ b/recipes-multimedia/picamera-libs/picamera-libs.bb
@@ -0,0 +1,26 @@
+SUMMARY = "Raspberrypi firmware libraries which are required by picamera library"
+DESCRIPTION = "Raspberrypi firmware libraries required by picamera library"
+LICENSE = "Broadcom-RPi"
+
+LIC_FILES_CHKSUM = "file://opt/vc/LICENCE;md5=86e53f5f5909ee66900418028de11780"
+
+include recipes-bsp/common/raspberrypi-firmware.inc
+
+S = "${RPIFW_S}"
+
+do_install(){
+ install -m 0755 -d ${D}${libdir}
+ install -m 0755 ${S}/opt/vc/lib/*.so ${D}${libdir}
+}
+
+FILES:${PN} = "${libdir}"
+
+# Skip the ldflags QA check because we install precompiled binaries directly
+INSANE_SKIP:${PN} = "ldflags"
+INHIBIT_PACKAGE_STRIP = "1"
+INHIBIT_SYSROOT_STRIP = "1"
+SOLIBS = ".so"
+FILES_SOLIBSDEV = ""
+
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi:libc-glibc = "(arm.*)-linux"
diff --git a/recipes-multimedia/python3-picamera/python3-picamera_git.bb b/recipes-multimedia/python3-picamera/python3-picamera_git.bb
new file mode 100644
index 0000000..f14941b
--- /dev/null
+++ b/recipes-multimedia/python3-picamera/python3-picamera_git.bb
@@ -0,0 +1,22 @@
+SUMMARY = "Python interface to the Raspberry Pi camera module"
+DESCRIPTION = "This package provides a pure Python interface to the Raspberry Pi camera module for Python 2.7 (or above) or Python 3.2 (or above)."
+HOMEPAGE = "https://github.com/waveform80/picamera"
+
+LICENSE = "BSD-3-Clause"
+LIC_FILES_CHKSUM = "file://LICENSE.txt;md5=4de8aab427192e4a8322a71375d20e21"
+
+RDEPENDS:${PN} = "python3-numbers \
+ python3-ctypes \
+ python3-colorzero \
+ picamera-libs \
+"
+
+SRC_URI = "git://git@github.com/waveform80/picamera.git;protocol=ssh;branch=master"
+SRCREV = "7e4f1d379d698c44501fb84b886fadf3fc164b70"
+
+S = "${WORKDIR}/git"
+
+inherit setuptools3
+
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi:libc-glibc = "(arm.*)-linux"
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
new file mode 100644
index 0000000..d9c07dd
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch
@@ -0,0 +1,292 @@
+From: James Cowgill <jcowgill@debian.org>
+Date: Sun, 11 Aug 2019 16:50:56 +0100
+Subject: avcodec/arm/sbcenc: avoid callee preserved vfp registers
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+When compiling FFmpeg with GCC-9, some very random segfaults were
+observed in code which had previously called down into the SBC encoder
+NEON assembly routines. This was caused by these functions clobbering
+some of the vfp callee saved registers (d8 - d15 aka q4 - q7). GCC was
+using these registers to save local variables, but after these
+functions returned, they would contain garbage.
+
+Fix by reallocating the registers in the two affected functions in
+the following way:
+ ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
+ ff_sbc_analyze_8_neon: q2-q9 => q8-q15
+
+The reason for using these replacements is to keep closely related
+sets of registers consecutively numbered which hopefully makes the
+code more easy to follow. Since this commit only reallocates
+registers, it should have no performance impact.
+
+Signed-off-by: James Cowgill <jcowgill@debian.org>
+---
+ libavcodec/arm/sbcdsp_neon.S | 220 +++++++++++++++++++++----------------------
+ 1 file changed, 110 insertions(+), 110 deletions(-)
+
+diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
+index d83d21d..914abfb 100644
+--- a/libavcodec/arm/sbcdsp_neon.S
++++ b/libavcodec/arm/sbcdsp_neon.S
+@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+- vld1.16 {d4, d5}, [r0, :64]!
+- vld1.16 {d8, d9}, [r2, :128]!
++ vld1.16 {d16, d17}, [r0, :64]!
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmull.s16 q0, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmull.s16 q1, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
++ vmull.s16 q0, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmull.s16 q1, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
+
+- vmlal.s16 q0, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q1, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
++ vmlal.s16 q0, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q1, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmlal.s16 q0, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q1, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
++ vmlal.s16 q0, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q1, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
+
+- vmlal.s16 q0, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q1, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
++ vmlal.s16 q0, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q1, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
+
+- vmlal.s16 q0, d4, d8
+- vmlal.s16 q1, d5, d9
++ vmlal.s16 q0, d16, d20
++ vmlal.s16 q1, d17, d21
+
+ vpadd.s32 d0, d0, d1
+ vpadd.s32 d1, d2, d3
+
+ vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE
+
+- vld1.16 {d2, d3, d4, d5}, [r2, :128]!
++ vld1.16 {d16, d17, d18, d19}, [r2, :128]!
+
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+- vmull.s16 q3, d2, d0
+- vmull.s16 q4, d3, d0
+- vmlal.s16 q3, d4, d1
+- vmlal.s16 q4, d5, d1
++ vmull.s16 q10, d16, d0
++ vmull.s16 q11, d17, d0
++ vmlal.s16 q10, d18, d1
++ vmlal.s16 q11, d19, d1
+
+- vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */
+- vpadd.s32 d1, d8, d9 /* TODO: can be eliminated */
++ vpadd.s32 d0, d20, d21 /* TODO: can be eliminated */
++ vpadd.s32 d1, d22, d23 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1}, [r1, :128]
+
+@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
+ /* TODO: merge even and odd cases (or even merge all four calls to this
+ * function) in order to have only aligned reads from 'in' array
+ * and reduce number of load instructions */
+- vld1.16 {d4, d5}, [r0, :64]!
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmull.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmull.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmull.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmull.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+- vmlal.s16 q8, d6, d10
+- vld1.16 {d4, d5}, [r0, :64]!
+- vmlal.s16 q9, d7, d11
+- vld1.16 {d8, d9}, [r2, :128]!
+-
+- vmlal.s16 q6, d4, d8
+- vld1.16 {d6, d7}, [r0, :64]!
+- vmlal.s16 q7, d5, d9
+- vld1.16 {d10, d11}, [r2, :128]!
+-
+- vmlal.s16 q8, d6, d10
+- vmlal.s16 q9, d7, d11
+-
+- vpadd.s32 d0, d12, d13
+- vpadd.s32 d1, d14, d15
+- vpadd.s32 d2, d16, d17
+- vpadd.s32 d3, d18, d19
++ vld1.16 {d16, d17}, [r0, :64]!
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmull.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmull.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmull.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmull.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++ vmlal.s16 q14, d18, d22
++ vld1.16 {d16, d17}, [r0, :64]!
++ vmlal.s16 q15, d19, d23
++ vld1.16 {d20, d21}, [r2, :128]!
++
++ vmlal.s16 q12, d16, d20
++ vld1.16 {d18, d19}, [r0, :64]!
++ vmlal.s16 q13, d17, d21
++ vld1.16 {d22, d23}, [r2, :128]!
++
++ vmlal.s16 q14, d18, d22
++ vmlal.s16 q15, d19, d23
++
++ vpadd.s32 d0, d24, d25
++ vpadd.s32 d1, d26, d27
++ vpadd.s32 d2, d28, d29
++ vpadd.s32 d3, d30, d31
+
+ vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE
+ vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE
+@@ -153,38 +153,38 @@ function ff_sbc_analyze_8_neon, export=1
+ vdup.i32 d1, d0[1] /* TODO: can be eliminated */
+ vdup.i32 d0, d0[0] /* TODO: can be eliminated */
+
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmull.s16 q6, d4, d0
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmull.s16 q7, d5, d0
+- vmull.s16 q8, d6, d0
+- vmull.s16 q9, d7, d0
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d1
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d1
+- vmlal.s16 q8, d6, d1
+- vmlal.s16 q9, d7, d1
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d2
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d2
+- vmlal.s16 q8, d6, d2
+- vmlal.s16 q9, d7, d2
+-
+- vld1.16 {d4, d5}, [r2, :128]!
+- vmlal.s16 q6, d4, d3
+- vld1.16 {d6, d7}, [r2, :128]!
+- vmlal.s16 q7, d5, d3
+- vmlal.s16 q8, d6, d3
+- vmlal.s16 q9, d7, d3
+-
+- vpadd.s32 d0, d12, d13 /* TODO: can be eliminated */
+- vpadd.s32 d1, d14, d15 /* TODO: can be eliminated */
+- vpadd.s32 d2, d16, d17 /* TODO: can be eliminated */
+- vpadd.s32 d3, d18, d19 /* TODO: can be eliminated */
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmull.s16 q12, d16, d0
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmull.s16 q13, d17, d0
++ vmull.s16 q14, d18, d0
++ vmull.s16 q15, d19, d0
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d1
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d1
++ vmlal.s16 q14, d18, d1
++ vmlal.s16 q15, d19, d1
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d2
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d2
++ vmlal.s16 q14, d18, d2
++ vmlal.s16 q15, d19, d2
++
++ vld1.16 {d16, d17}, [r2, :128]!
++ vmlal.s16 q12, d16, d3
++ vld1.16 {d18, d19}, [r2, :128]!
++ vmlal.s16 q13, d17, d3
++ vmlal.s16 q14, d18, d3
++ vmlal.s16 q15, d19, d3
++
++ vpadd.s32 d0, d24, d25 /* TODO: can be eliminated */
++ vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */
++ vpadd.s32 d2, d28, d29 /* TODO: can be eliminated */
++ vpadd.s32 d3, d30, d31 /* TODO: can be eliminated */
+
+ vst1.32 {d0, d1, d2, d3}, [r1, :128]
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch
new file mode 100644
index 0000000..f398791
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0002-Fix-build-on-powerpc-and-ppc64.patch
@@ -0,0 +1,34 @@
+From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
+Date: Tue, 19 Jan 2021 20:35:29 +0100
+Subject: Fix build on powerpc and ppc64
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+---
+ libswscale/ppc/yuv2rgb_altivec.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
+index 5365452..930ef6b 100644
+--- a/libswscale/ppc/yuv2rgb_altivec.c
++++ b/libswscale/ppc/yuv2rgb_altivec.c
+@@ -283,6 +283,16 @@ static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
+ * ------------------------------------------------------------------------------
+ */
+
++#if !HAVE_VSX
++static inline vector unsigned char vec_xl(signed long long offset, const ubyte *addr)
++{
++ const vector unsigned char *v_addr = (const vector unsigned char *) (addr + offset);
++ vector unsigned char align_perm = vec_lvsl(offset, addr);
++
++ return (vector unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
++}
++#endif /* !HAVE_VSX */
++
+ #define DEFCSP420_CVT(name, out_pixels) \
+ static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
+ int *instrides, int srcSliceY, int srcSliceH, \
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
new file mode 100644
index 0000000..11e3383
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch
@@ -0,0 +1,30 @@
+From: Paul B Mahol <onemda@gmail.com>
+Date: Sun, 14 Feb 2021 17:20:03 +0100
+Subject: avcodec/pngenc: remove monowhite from apng formats
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Monowhite pixel format is not supported, and it does not make sense
+to add support for it.
+
+Fixes #7989
+---
+ libavcodec/pngenc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/pngenc.c b/libavcodec/pngenc.c
+index efcae8c..eebb164 100644
+--- a/libavcodec/pngenc.c
++++ b/libavcodec/pngenc.c
+@@ -1174,7 +1174,7 @@ AVCodec ff_apng_encoder = {
+ AV_PIX_FMT_PAL8,
+ AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY8A,
+ AV_PIX_FMT_GRAY16BE, AV_PIX_FMT_YA16BE,
+- AV_PIX_FMT_MONOBLACK, AV_PIX_FMT_NONE
++ AV_PIX_FMT_NONE
+ },
+ .priv_class = &apngenc_class,
+ };
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch
new file mode 100644
index 0000000..740ac0e
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0004-ffmpeg-4.3.4-rpi_14.patch
@@ -0,0 +1,68341 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+--- a/configure
++++ b/configure
+@@ -207,6 +207,7 @@ External library support:
+ --disable-bzlib disable bzlib [autodetect]
+ --disable-coreimage disable Apple CoreImage framework [autodetect]
+ --enable-chromaprint enable audio fingerprinting with chromaprint [no]
++ --disable-epoxy disable epoxy [autodetect]
+ --enable-frei0r enable frei0r video filtering [no]
+ --enable-gcrypt enable gcrypt, needed for rtmp(t)e support
+ if openssl, librtmp or gmp is not used [no]
+@@ -274,6 +275,7 @@ External library support:
+ --enable-libtls enable LibreSSL (via libtls), needed for https support
+ if openssl, gnutls or mbedtls is not used [no]
+ --enable-libtwolame enable MP2 encoding via libtwolame [no]
++ --disable-libudev disable libudev [autodetect]
+ --enable-libv4l2 enable libv4l2/v4l-utils [no]
+ --enable-libvidstab enable video stabilization using vid.stab [no]
+ --enable-libvmaf enable vmaf filter via libvmaf [no]
+@@ -336,12 +338,17 @@ External library support:
+ --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
+ --enable-libnpp enable Nvidia Performance Primitives-based code [no]
+ --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
++ --enable-rpi enable other rpi specific stuff [no]
++ --enable-sand enable sand video formats [rpi]
++ --enable-vout-drm enable the vout_drm module - for internal testing only [no]
++ --enable-vout-egl enable the vout_egl module - for internal testing only [no]
+ --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
+ --disable-nvenc disable Nvidia video encoding code [autodetect]
+ --enable-omx enable OpenMAX IL code [no]
+ --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no]
+ --enable-rkmpp enable Rockchip Media Process Platform code [no]
+ --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect]
++ --enable-v4l2-request enable V4L2 request API code [no]
+ --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
+ --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
+ --disable-videotoolbox disable VideoToolbox code [autodetect]
+@@ -1699,7 +1706,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
+ avfoundation
+ bzlib
+ coreimage
++ epoxy
+ iconv
++ libudev
+ libxcb
+ libxcb_shm
+ libxcb_shape
+@@ -1861,7 +1870,10 @@ HWACCEL_LIBRARY_LIST="
+ mmal
+ omx
+ opencl
++ v4l2_request
+ vulkan
++ rpi4_8
++ rpi4_10
+ "
+
+ DOCUMENT_LIST="
+@@ -1877,12 +1889,16 @@ FEATURE_LIST="
+ gray
+ hardcoded_tables
+ omx_rpi
++ rpi
+ runtime_cpudetect
+ safe_bitstream_reader
++ sand
+ shared
+ small
+ static
+ swscale_alpha
++ vout_drm
++ vout_egl
+ "
+
+ # this list should be kept in linking order
+@@ -1923,6 +1939,7 @@ SUBSYSTEM_LIST="
+ pixelutils
+ network
+ rdft
++ rpi
+ "
+
+ # COMPONENT_LIST needs to come last to ensure correct dependency checking
+@@ -2405,9 +2422,11 @@ CONFIG_EXTRA="
+ rangecoder
+ riffdec
+ riffenc
++ rpi
+ rtpdec
+ rtpenc_chain
+ rv34dsp
++ sand
+ scene_sad
+ sinewin
+ snappy
+@@ -2737,6 +2756,8 @@ hap_decoder_select="snappy texturedsp"
+ hap_encoder_deps="libsnappy"
+ hap_encoder_select="texturedspenc"
+ hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
++hevc_rpi_decoder_deps="rpi"
++hevc_rpi_decoder_select="hevc_decoder sand"
+ huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
+ huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
+ hymt_decoder_select="huffyuv_decoder"
+@@ -2903,6 +2924,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder
+ dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
+ ffnvcodec_deps_any="libdl LoadLibrary"
+ nvdec_deps="ffnvcodec"
++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
+ vaapi_x11_deps="xlib"
+ videotoolbox_hwaccel_deps="videotoolbox pthreads"
+ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
+@@ -2934,6 +2956,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicP
+ hevc_dxva2_hwaccel_select="hevc_decoder"
+ hevc_nvdec_hwaccel_deps="nvdec"
+ hevc_nvdec_hwaccel_select="hevc_decoder"
++hevc_v4l2request_hwaccel_deps="v4l2_request"
++hevc_v4l2request_hwaccel_select="hevc_decoder"
++hevc_rpi4_10_hwaccel_deps="rpi"
++hevc_rpi4_10_hwaccel_select="hevc_decoder"
++hevc_rpi4_8_hwaccel_deps="rpi"
++hevc_rpi4_8_hwaccel_select="hevc_decoder"
+ hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
+ hevc_vaapi_hwaccel_select="hevc_decoder"
+ hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
+@@ -3401,8 +3429,13 @@ sndio_indev_deps="sndio"
+ sndio_outdev_deps="sndio"
+ v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_indev_suggest="libv4l2"
++v4l2_outdev_deps="libdrm"
+ v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_outdev_suggest="libv4l2"
++vout_drm_outdev_deps="libdrm"
++vout_egl_outdev_deps="xlib epoxy"
++vout_rpi_outdev_deps="rpi"
++vout_rpi_outdev_select="sand"
+ vfwcap_indev_deps="vfw32 vfwcap_defines"
+ xcbgrab_indev_deps="libxcb"
+ xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
+@@ -3618,6 +3651,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcF
+ tonemap_opencl_filter_deps="opencl const_nan"
+ transpose_opencl_filter_deps="opencl"
+ transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
++unsand_filter_select="sand"
+ unsharp_opencl_filter_deps="opencl"
+ uspp_filter_deps="gpl avcodec"
+ vaguedenoiser_filter_deps="gpl"
+@@ -6102,6 +6136,12 @@ check_func_headers glob.h glob
+ enabled xlib &&
+ check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
+
++enabled libudev &&
++ check_pkg_config libudev libudev libudev.h udev_new
++
++enabled epoxy &&
++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
++
+ check_headers direct.h
+ check_headers dirent.h
+ check_headers dxgidebug.h
+@@ -6430,11 +6470,12 @@ enabled mbedtls && { check_pkg
+ check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
++( enabled rpi ||
++ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+ { ! enabled cross_compile &&
+ add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+ add_ldflags -L/opt/vc/lib/ &&
+- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+ enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
+@@ -6475,8 +6516,16 @@ enabled rkmpp && { require_p
+ { enabled libdrm ||
+ die "ERROR: rkmpp requires --enable-libdrm"; }
+ }
++enabled v4l2_request && { enabled libdrm ||
++ die "ERROR: v4l2-request requires --enable-libdrm"; } &&
++ { enabled libudev ||
++ die "ERROR: v4l2-request requires libudev"; }
+ enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+
++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
++
++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
++ { enabled xlib || die "ERROR: vout_egl requires xlib"; }
+
+ if enabled gcrypt; then
+ GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
+@@ -6556,6 +6605,8 @@ if enabled v4l2_m2m; then
+ check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
+ fi
+
++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
+ check_headers sys/videoio.h
+ test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
+
+--- a/fftools/ffmpeg.c
++++ b/fftools/ffmpeg.c
+@@ -2119,8 +2119,8 @@ static int ifilter_send_frame(InputFilte
+ ifilter->channel_layout != frame->channel_layout;
+ break;
+ case AVMEDIA_TYPE_VIDEO:
+- need_reinit |= ifilter->width != frame->width ||
+- ifilter->height != frame->height;
++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
++ ifilter->height != av_frame_cropped_height(frame);
+ break;
+ }
+
+@@ -2131,6 +2131,9 @@ static int ifilter_send_frame(InputFilte
+ (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
+ need_reinit = 1;
+
++ if (no_cvt_hw && fg->graph)
++ need_reinit = 0;
++
+ if (need_reinit) {
+ ret = ifilter_parameters_from_frame(ifilter, frame);
+ if (ret < 0)
+@@ -2401,8 +2404,7 @@ static int decode_video(InputStream *ist
+ decoded_frame->top_field_first = ist->top_field_first;
+
+ ist->frames_decoded++;
+-
+- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+ err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
+ if (err < 0)
+ goto fail;
+@@ -2600,7 +2602,12 @@ static int process_input_packet(InputStr
+ case AVMEDIA_TYPE_VIDEO:
+ ret = decode_video (ist, repeating ? NULL : &avpkt, &got_output, &duration_pts, !pkt,
+ &decode_failed);
+- if (!repeating || !pkt || got_output) {
++ // Pi: Do not inc dts if no_cvt_hw set
++ // V4L2 H264 decode has long latency and sometimes spits out a long
++ // stream of output without input. In this case incrementing DTS is wrong.
++ // There may be cases where the condition as written is correct so only
++ // "fix" in the cases which cause problems
++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
+ if (pkt && pkt->duration) {
+ duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
+ } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
+@@ -2820,6 +2827,16 @@ static enum AVPixelFormat get_format(AVC
+ } else {
+ const HWAccel *hwaccel = NULL;
+ int i;
++
++ if (no_cvt_hw) {
++ config = avcodec_get_hw_config(s->codec, 0);
++ if (config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
++ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
++ ist->hwaccel_pix_fmt = *p;
++ break;
++ }
++ }
++
+ for (i = 0; hwaccels[i].name; i++) {
+ if (hwaccels[i].pix_fmt == *p) {
+ hwaccel = &hwaccels[i];
+@@ -2914,6 +2931,15 @@ static int init_input_stream(int ist_ind
+ return ret;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER
++ ret = -1;
++ if (strcmp(codec->name, "hevc_rpi") == 0 &&
++ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
++ ist->dec = codec = avcodec_find_decoder_by_name("hevc");
++ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
++ }
++ if (ret < 0)
++#endif
+ if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
+ if (ret == AVERROR_EXPERIMENTAL)
+ abort_codec_experimental(codec, 0);
+--- a/fftools/ffmpeg.h
++++ b/fftools/ffmpeg.h
+@@ -61,6 +61,7 @@ enum HWAccelID {
+ HWACCEL_GENERIC,
+ HWACCEL_VIDEOTOOLBOX,
+ HWACCEL_QSV,
++ HWACCEL_RPI,
+ };
+
+ typedef struct HWAccel {
+@@ -590,6 +591,7 @@ extern int video_sync_method;
+ extern float frame_drop_threshold;
+ extern int do_benchmark;
+ extern int do_benchmark_all;
++extern int no_cvt_hw;
+ extern int do_deinterlace;
+ extern int do_hex_dump;
+ extern int do_pkt_dump;
+--- a/fftools/ffmpeg_filter.c
++++ b/fftools/ffmpeg_filter.c
+@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputF
+
+ ifilter->format = frame->format;
+
+- ifilter->width = frame->width;
+- ifilter->height = frame->height;
++ ifilter->width = av_frame_cropped_width(frame);
++ ifilter->height = av_frame_cropped_height(frame);
+ ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
+
+ ifilter->sample_rate = frame->sample_rate;
+--- a/fftools/ffmpeg_hw.c
++++ b/fftools/ffmpeg_hw.c
+@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum
+ char *name;
+ size_t index_pos;
+ int index, index_limit = 1000;
++ if (!type_name)
++ return NULL;
+ index_pos = strlen(type_name);
+ name = av_malloc(index_pos + 4);
+ if (!name)
+--- a/fftools/ffmpeg_opt.c
++++ b/fftools/ffmpeg_opt.c
+@@ -130,6 +130,12 @@ static const char *opt_name_enc_time_bas
+ }\
+ }
+
++#if CONFIG_RPI
++static int rpi_init(AVCodecContext *avctx) { /* init hook referenced by the "rpi" hwaccels[] entries; avctx is not used */
++ return 0; /* nothing to set up here: unconditionally returns 0 */
++}
++#endif
++
+ const HWAccel hwaccels[] = {
+ #if CONFIG_VIDEOTOOLBOX
+ { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
+@@ -137,6 +143,10 @@ const HWAccel hwaccels[] = {
+ #if CONFIG_LIBMFX
+ { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV },
+ #endif
++#if CONFIG_RPI
++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
++#endif
+ { 0 },
+ };
+ HWDevice *filter_hw_device;
+@@ -155,6 +165,7 @@ float frame_drop_threshold = 0;
+ int do_deinterlace = 0;
+ int do_benchmark = 0;
+ int do_benchmark_all = 0;
++int no_cvt_hw = 0;
+ int do_hex_dump = 0;
+ int do_pkt_dump = 0;
+ int copy_ts = 0;
+@@ -3460,6 +3471,8 @@ const OptionDef options[] = {
+ "add timings for benchmarking" },
+ { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all },
+ "add timings for each task" },
++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw },
++ "do not auto-convert hw frames to sw" },
+ { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress },
+ "write program-readable progress information", "url" },
+ { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction },
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h
+ mediacodec.h \
+ packet.h \
+ qsv.h \
++ rpi_zc.h \
+ vaapi.h \
+ vdpau.h \
+ version.h \
+@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC) +
+ OBJS-$(CONFIG_QSVENC) += qsvenc.o
+ OBJS-$(CONFIG_RANGECODER) += rangecoder.o
+ OBJS-$(CONFIG_RDFT) += rdft.o
++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o
+ OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
+ OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
+ OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o
+@@ -152,7 +154,10 @@ OBJS-$(CONFIG_VIDEODSP) +
+ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
+ OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
+ OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
+-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
++OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
++ weak_link.o
++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
++ v4l2_req_devscan.o weak_link.o
+ OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
+ OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o
+
+@@ -391,6 +396,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER) +
+ OBJS-$(CONFIG_HEVC_QSV_ENCODER) += qsvenc_hevc.o hevc_ps_enc.o \
+ hevc_data.o
+ OBJS-$(CONFIG_HEVC_RKMPP_DECODER) += rkmppdec.o
++OBJS-$(CONFIG_RPI) += rpi_mem.o \
++ rpi_mailbox.o rpi_zc.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \
++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \
++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \
++ rpi_hevc_shader.o rpi_hevc_shader_template.o \
++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
++ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
+ OBJS-$(CONFIG_HEVC_VAAPI_ENCODER) += vaapi_encode_h265.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER) += v4l2_m2m_dec.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER) += v4l2_m2m_enc.o
+@@ -909,6 +922,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)
+ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
+ OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o
++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o\
++ v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o v4l2_req_hevc_v4.o
+ OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o
+ OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
+@@ -1261,3 +1278,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
+ endif
++
++ifdef CONFIG_HEVC_RPI_DECODER
++QASM_PY := ../local/bin/qasm.py
++VASMVIDCORE := ../local/bin/vasmvidcore_std
++
++ifneq ("$(wildcard $(QASM_PY))","")
++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++
++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++endif
++
++ifneq ("$(wildcard $(VASMVIDCORE))","")
++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
++
++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
++ python pi-util/make_array.py $<
++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
++ python pi-util/make_array.py $<
++endif
++# Rebuild these objects when the generated headers change; object names must match the OBJS-$(CONFIG_RPI) / OBJS-$(CONFIG_HEVC_RPI_DECODER) lists.
++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_hevc_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
++endif
+--- a/libavcodec/aarch64/Makefile
++++ b/libavcodec/aarch64/Makefile
+@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)
+ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
+ aarch64/hpeldsp_neon.o
+ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
+-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o
++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
++ aarch64/simple_idct_neon.o
+ NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
+ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
+ NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
+ NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
+
+ # decoders/encoders
+--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -27,19 +27,29 @@
+ #include "libavcodec/idctdsp.h"
+ #include "idct.h"
+
++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++
+ av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+ {
+ int cpu_flags = av_get_cpu_flags();
+
+- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
+- if (avctx->idct_algo == FF_IDCT_AUTO ||
+- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+- avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+- c->idct_put = ff_simple_idct_put_neon;
+- c->idct_add = ff_simple_idct_add_neon;
+- c->idct = ff_simple_idct_neon;
+- c->perm_type = FF_IDCT_PERM_PARTTRANS;
++ if (have_neon(cpu_flags)) {
++ if (!avctx->lowres && !high_bit_depth) {
++ if (avctx->idct_algo == FF_IDCT_AUTO ||
++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
++ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
++ c->idct_put = ff_simple_idct_put_neon;
++ c->idct_add = ff_simple_idct_add_neon;
++ c->idct = ff_simple_idct_neon;
++ c->perm_type = FF_IDCT_PERM_PARTTRANS;
++ }
+ }
++
++ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
++ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+ }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/idctdsp_neon.S
+@@ -0,0 +1,130 @@
++/*
++ * IDCT AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// Clamp 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit results
++// x2 = row stride for results, bytes
++function ff_put_pixels_clamped_neon, export=1
++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // coefficient rows 0-3 (8 x 16-bit each); x0 += 64
++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] // coefficient rows 4-7
++ sqxtun v0.8b, v0.8h // row 0: saturate signed 16-bit to unsigned 8-bit
++ sqxtun v1.8b, v1.8h // row 1
++ sqxtun v2.8b, v2.8h // row 2
++ sqxtun v3.8b, v3.8h // row 3
++ sqxtun v4.8b, v4.8h // row 4
++ st1 {v0.8b}, [x1], x2 // store row 0; x1 += stride
++ sqxtun v0.8b, v5.8h // row 5 (v0 is free again after the store above)
++ st1 {v1.8b}, [x1], x2 // store row 1
++ sqxtun v1.8b, v6.8h // row 6 (reuses v1)
++ st1 {v2.8b}, [x1], x2 // store row 2
++ sqxtun v2.8b, v7.8h // row 7 (reuses v2)
++ st1 {v3.8b}, [x1], x2 // store row 3
++ st1 {v4.8b}, [x1], x2 // store row 4
++ st1 {v0.8b}, [x1], x2 // store row 5
++ st1 {v1.8b}, [x1], x2 // store row 6
++ st1 {v2.8b}, [x1] // store row 7 (no post-increment needed)
++ ret
++endfunc
++
++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit results
++// x2 = row stride for results, bytes
++function ff_put_signed_pixels_clamped_neon, export=1
++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // coefficient rows 0-3; x0 += 64
++ movi v4.8b, #128 // bias added to every narrowed sample
++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] // coefficient rows 4-7
++ sqxtn v0.8b, v0.8h // row 0: saturate signed 16-bit to signed 8-bit
++ sqxtn v1.8b, v1.8h // row 1
++ sqxtn v2.8b, v2.8h // row 2
++ sqxtn v3.8b, v3.8h // row 3
++ sqxtn v5.8b, v16.8h // row 4
++ add v0.8b, v0.8b, v4.8b // row 0 += 128: maps [-128,127] onto [0,255]
++ sqxtn v6.8b, v17.8h // row 5
++ add v1.8b, v1.8b, v4.8b // row 1 += 128
++ sqxtn v7.8b, v18.8h // row 6
++ add v2.8b, v2.8b, v4.8b // row 2 += 128
++ sqxtn v16.8b, v19.8h // row 7
++ add v3.8b, v3.8b, v4.8b // row 3 += 128
++ st1 {v0.8b}, [x1], x2 // store row 0; x1 += stride
++ add v0.8b, v5.8b, v4.8b // row 4 += 128 (result placed in the now-free v0)
++ st1 {v1.8b}, [x1], x2 // store row 1
++ add v1.8b, v6.8b, v4.8b // row 5 += 128 (into v1)
++ st1 {v2.8b}, [x1], x2 // store row 2
++ add v2.8b, v7.8b, v4.8b // row 6 += 128 (into v2)
++ st1 {v3.8b}, [x1], x2 // store row 3
++ add v3.8b, v16.8b, v4.8b // row 7 += 128 (into v3)
++ st1 {v0.8b}, [x1], x2 // store row 4
++ st1 {v1.8b}, [x1], x2 // store row 5
++ st1 {v2.8b}, [x1], x2 // store row 6
++ st1 {v3.8b}, [x1] // store row 7
++ ret
++endfunc
++
++// Add 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++// x0 -> array of 64x 16-bit coefficients
++// x1 -> 8-bit input and results
++// x2 = row stride for 8-bit input and results, bytes
++function ff_add_pixels_clamped_neon, export=1
++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 // coefficient rows 0-3; x0 += 64
++ mov x3, x1 // x3 = write pointer; x1 is advanced by the pixel loads below
++ ld1 {v4.8b}, [x1], x2 // pixel row 0
++ ld1 {v5.8b}, [x1], x2 // pixel row 1
++ ld1 {v6.8b}, [x1], x2 // pixel row 2
++ ld1 {v7.8b}, [x1], x2 // pixel row 3
++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] // coefficient rows 4-7
++ uaddw v0.8h, v0.8h, v4.8b // row 0: coefficients + zero-extended pixels
++ uaddw v1.8h, v1.8h, v5.8b // row 1
++ uaddw v2.8h, v2.8h, v6.8b // row 2
++ ld1 {v4.8b}, [x1], x2 // pixel row 4
++ uaddw v3.8h, v3.8h, v7.8b // row 3
++ ld1 {v5.8b}, [x1], x2 // pixel row 5
++ sqxtun v0.8b, v0.8h // row 0: saturate sum to unsigned 8-bit
++ ld1 {v6.8b}, [x1], x2 // pixel row 6
++ sqxtun v1.8b, v1.8h // row 1
++ ld1 {v7.8b}, [x1] // pixel row 7
++ sqxtun v2.8b, v2.8h // row 2
++ sqxtun v3.8b, v3.8h // row 3
++ uaddw v4.8h, v16.8h, v4.8b // row 4 sum
++ st1 {v0.8b}, [x3], x2 // write back row 0; x3 += stride
++ uaddw v0.8h, v17.8h, v5.8b // row 5 sum (into the now-free v0)
++ st1 {v1.8b}, [x3], x2 // write back row 1
++ uaddw v1.8h, v18.8h, v6.8b // row 6 sum (into v1)
++ st1 {v2.8b}, [x3], x2 // write back row 2
++ uaddw v2.8h, v19.8h, v7.8b // row 7 sum (into v2)
++ sqxtun v4.8b, v4.8h // row 4
++ sqxtun v0.8b, v0.8h // row 5
++ st1 {v3.8b}, [x3], x2 // write back row 3
++ sqxtun v1.8b, v1.8h // row 6
++ sqxtun v2.8b, v2.8h // row 7
++ st1 {v4.8b}, [x3], x2 // write back row 4
++ st1 {v0.8b}, [x3], x2 // write back row 5
++ st1 {v1.8b}, [x3], x2 // write back row 6
++ st1 {v2.8b}, [x3] // write back row 7
++ ret
++endfunc
+--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -21,10 +21,28 @@
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+
+ #include "config.h"
+
++void ff_vc1_inv_trans_8x8_neon(int16_t *block);
++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++ /* Dealing with starting and stopping, and removing escape bytes, are
++ * comparatively less time-sensitive, so are more clearly expressed using
++ * a C wrapper around the assembly inner loop. Note that we assume a
++ * little-endian machine that supports unaligned loads. */
++ int dsize = 0; /* bytes written to dst so far; this is the return value */
++ while (size >= 4)
++ {
++ int found = 0;
++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise until dst is 8-byte aligned */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* src[0..3] == 00 00 03 xx with xx <= 03 */
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ if (!found)
++ {
++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper's return is used as the byte count still left to process */
++ dst += skip;
++ src += skip;
++ size -= skip;
++ dsize += skip;
++ while (!found && size >= 4) /* byte-wise over what the helper left, up to the next escape */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same 00 00 03 0x test as above */
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ }
++ if (found)
++ {
++ *dst++ = *src++; /* keep the first 00 */
++ *dst++ = *src++; /* keep the second 00 */
++ ++src; /* drop the 03 escape byte */
++ size -= 3; /* three input bytes consumed ... */
++ dsize += 2; /* ... two output bytes produced */
++ }
++ }
++ while (size > 0) /* fewer than 4 bytes remain: copy them unchanged */
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ return dsize;
++}
++
+ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+ {
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
++
++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+ }
+--- /dev/null
++++ b/libavcodec/aarch64/vc1dsp_neon.S
+@@ -0,0 +1,1546 @@
++/*
++ * VC1 AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// VC-1 8x8 inverse transform
++// On entry:
++// x0 -> array of 16-bit inverse transform coefficients, in column-major order
++// On exit:
++// array at x0 updated to hold transformed block; also now held in row-major order
++function ff_vc1_inv_trans_8x8_neon, export=1
++ ld1 {v1.16b, v2.16b}, [x0], #32
++ ld1 {v3.16b, v4.16b}, [x0], #32
++ ld1 {v5.16b, v6.16b}, [x0], #32
++ shl v1.8h, v1.8h, #2 // 8/2 * src[0]
++ sub x1, x0, #3*32
++ ld1 {v16.16b, v17.16b}, [x0]
++ shl v7.8h, v2.8h, #4 // 16 * src[8]
++ shl v18.8h, v2.8h, #2 // 4 * src[8]
++ shl v19.8h, v4.8h, #4 // 16 * src[24]
++ ldr d0, .Lcoeffs_it8
++ shl v5.8h, v5.8h, #2 // 8/2 * src[32]
++ shl v20.8h, v6.8h, #4 // 16 * src[40]
++ shl v21.8h, v6.8h, #2 // 4 * src[40]
++ shl v22.8h, v17.8h, #4 // 16 * src[56]
++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40]
++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40]
++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56]
++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56]
++ shl v3.8h, v3.8h, #3 // 16/2 * src[16]
++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0]
++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32]
++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ shl v21.8h, v16.8h, #3 // 16/2 * src[48]
++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
++ neg v3.8h, v7.8h // -t1
++ neg v4.8h, v20.8h // +t2
++ neg v6.8h, v19.8h // +t3
++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1
++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1
++ neg v7.8h, v18.8h // +t4
++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1
++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1
++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1
++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1
++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3
++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3
++ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3
++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3
++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3
++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
++ trn2 v17.8h, v3.8h, v4.8h
++ trn2 v18.8h, v5.8h, v6.8h
++ trn2 v19.8h, v2.8h, v1.8h
++ trn2 v20.8h, v7.8h, v16.8h
++ trn1 v21.4s, v17.4s, v18.4s
++ trn2 v17.4s, v17.4s, v18.4s
++ trn1 v18.4s, v19.4s, v20.4s
++ trn2 v19.4s, v19.4s, v20.4s
++ trn1 v3.8h, v3.8h, v4.8h
++ trn2 v4.2d, v21.2d, v18.2d
++ trn1 v20.2d, v17.2d, v19.2d
++ trn1 v5.8h, v5.8h, v6.8h
++ trn1 v1.8h, v2.8h, v1.8h
++ trn1 v2.8h, v7.8h, v16.8h
++ trn1 v6.2d, v21.2d, v18.2d
++ trn2 v7.2d, v17.2d, v19.2d
++ shl v16.8h, v20.8h, #4 // 16 * src[24]
++ shl v17.8h, v4.8h, #4 // 16 * src[40]
++ trn1 v18.4s, v3.4s, v5.4s
++ trn1 v19.4s, v1.4s, v2.4s
++ shl v21.8h, v7.8h, #4 // 16 * src[56]
++ shl v22.8h, v6.8h, #2 // 4 * src[8]
++ shl v23.8h, v4.8h, #2 // 4 * src[40]
++ trn2 v3.4s, v3.4s, v5.4s
++ trn2 v1.4s, v1.4s, v2.4s
++ shl v2.8h, v6.8h, #4 // 16 * src[8]
++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
++ trn1 v22.2d, v18.2d, v19.2d
++ trn2 v18.2d, v18.2d, v19.2d
++ trn1 v19.2d, v3.2d, v1.2d
++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56]
++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ shl v21.8h, v22.8h, #2 // 8/2 * src[0]
++ shl v18.8h, v18.8h, #2 // 8/2 * src[32]
++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ shl v6.8h, v19.8h, #3 // 16/2 * src[16]
++ trn2 v1.2d, v3.2d, v1.2d
++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0]
++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32]
++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
++ shl v19.8h, v1.8h, #3 // 16/2 * src[48]
++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
++ neg v21.8h, v17.8h // +t2
++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
++ neg v4.8h, v5.8h // +t3
++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
++ neg v24.8h, v16.8h // +t4
++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1
++ neg v3.8h, v2.8h // -t1
++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1
++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1
++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1
++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1
++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1
++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7
++ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7
++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7
++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7
++ st1 {v2.16b, v3.16b}, [x1], #32
++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7
++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7
++ st1 {v4.16b, v5.16b}, [x1], #32
++ st1 {v16.16b, v17.16b}, [x1], #32
++ st1 {v0.16b, v1.16b}, [x1]
++ ret
++endfunc
++
++// VC-1 8x4 inverse transform
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_neon, export=1
++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
++ mov x3, x0
++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
++ ld1 {v5.8b}, [x0], x1
++ trn2 v6.4h, v1.4h, v3.4h
++ trn2 v7.4h, v2.4h, v4.4h
++ trn1 v1.4h, v1.4h, v3.4h
++ trn1 v2.4h, v2.4h, v4.4h
++ trn2 v3.4h, v16.4h, v18.4h
++ trn2 v4.4h, v17.4h, v19.4h
++ trn1 v16.4h, v16.4h, v18.4h
++ trn1 v17.4h, v17.4h, v19.4h
++ ld1 {v18.8b}, [x0], x1
++ trn1 v19.2s, v6.2s, v3.2s
++ trn2 v3.2s, v6.2s, v3.2s
++ trn1 v6.2s, v7.2s, v4.2s
++ trn2 v4.2s, v7.2s, v4.2s
++ trn1 v7.2s, v1.2s, v16.2s
++ trn1 v20.2s, v2.2s, v17.2s
++ shl v21.4h, v19.4h, #4 // 16 * src[1]
++ trn2 v1.2s, v1.2s, v16.2s
++ shl v16.4h, v3.4h, #4 // 16 * src[3]
++ trn2 v2.2s, v2.2s, v17.2s
++ shl v17.4h, v6.4h, #4 // 16 * src[5]
++ ld1 {v22.8b}, [x0], x1
++ shl v23.4h, v4.4h, #4 // 16 * src[7]
++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2]
++ ld1 {v25.8b}, [x0]
++ shl v26.4h, v19.4h, #2 // 4 * src[1]
++ shl v27.4h, v6.4h, #2 // 4 * src[5]
++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7]
++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7]
++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
++ shl v7.4h, v7.4h, #2 // 8/2 * src[0]
++ shl v20.4h, v20.4h, #2 // 8/2 * src[4]
++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
++ shl v1.4h, v1.4h, #3 // 16/2 * src[2]
++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5]
++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0]
++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4]
++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
++ shl v3.4h, v2.4h, #3 // 16/2 * src[6]
++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6]
++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4]
++ neg v6.4h, v21.4h // -t1
++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
++ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
++ neg v3.4h, v17.4h // +t2
++ neg v4.4h, v16.4h // +t3
++ neg v28.4h, v23.4h // +t4
++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1
++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1
++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1
++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1
++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1
++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1
++ trn1 v1.2d, v7.2d, v1.2d
++ trn1 v2.2d, v20.2d, v2.2d
++ trn1 v3.2d, v24.2d, v27.2d
++ trn1 v4.2d, v19.2d, v26.2d
++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
++ trn2 v6.8h, v1.8h, v2.8h
++ trn1 v1.8h, v1.8h, v2.8h
++ trn2 v2.8h, v3.8h, v4.8h
++ trn1 v3.8h, v3.8h, v4.8h
++ trn2 v4.4s, v6.4s, v2.4s
++ trn1 v7.4s, v1.4s, v3.4s
++ trn2 v1.4s, v1.4s, v3.4s
++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
++ trn1 v2.4s, v6.4s, v2.4s
++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24]
++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0]
++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16]
++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16]
++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16]
++ neg v2.8h, v3.8h // -t4/2
++ neg v6.8h, v4.8h // -t3/2
++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1
++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1
++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1
++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7
++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7
++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
++ uaddw v0.8h, v0.8h, v5.8b
++ uaddw v1.8h, v1.8h, v18.8b
++ uaddw v2.8h, v2.8h, v22.8b
++ uaddw v3.8h, v3.8h, v25.8b
++ sqxtun v0.8b, v0.8h
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.8b}, [x3], x1
++ st1 {v1.8b}, [x3], x1
++ st1 {v2.8b}, [x3], x1
++ st1 {v3.8b}, [x3]
++ ret
++endfunc
++
++// VC-1 4x8 inverse transform
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_neon, export=1
++ mov x3, #16
++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
++ mov x4, x0
++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33
++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43
++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53
++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
++ ld1 {v4.d}[1], [x2] // 70 71 72 73
++ ld1 {v5.s}[0], [x0], x1
++ ld1 {v6.s}[0], [x0], x1
++ ld1 {v7.s}[0], [x0], x1
++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52
++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
++ ld1 {v4.s}[0], [x0], x1
++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
++ ld1 {v5.s}[1], [x0], x1
++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3]
++ ld1 {v6.s}[1], [x0], x1
++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
++ ld1 {v7.s}[1], [x0], x1
++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2]
++ ld1 {v4.s}[1], [x0]
++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
++ neg v3.8h, v16.8h // -t3/2
++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
++ neg v18.8h, v17.8h // -t4/2
++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1
++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1
++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3
++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3
++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3
++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3
++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43
++ mov d18, v3.d[1] // 50 51 52 53
++ shl v19.4h, v3.4h, #4 // 16 * src[8]
++ mov d20, v16.d[1] // 70 71 72 73
++ shl v21.4h, v16.4h, #4 // 16 * src[24]
++ mov d22, v17.d[1] // 40 41 42 43
++ shl v23.4h, v3.4h, #2 // 4 * src[8]
++ shl v24.4h, v18.4h, #4 // 16 * src[40]
++ shl v25.4h, v20.4h, #4 // 16 * src[56]
++ shl v26.4h, v18.4h, #2 // 4 * src[40]
++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63
++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40]
++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56]
++ shl v17.4h, v17.4h, #2 // 8/2 * src[0]
++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40]
++ shl v22.4h, v22.4h, #2 // 8/2 * src[32]
++ mov d23, v1.d[1] // 60 61 62 63
++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56]
++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16]
++ shl v1.4h, v1.4h, #3 // 16/2 * src[16]
++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0]
++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32]
++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
++ shl v3.4h, v23.4h, #3 // 16/2 * src[48]
++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
++ neg v23.4h, v24.4h // +t2
++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
++ neg v17.4h, v21.4h // +t3
++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
++ neg v16.4h, v19.4h // -t1
++ neg v27.4h, v2.4h // +t4
++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1
++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1
++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1
++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1
++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1
++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1
++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
++ trn1 v0.2d, v20.2d, v0.2d
++ trn1 v2.2d, v18.2d, v22.2d
++ trn1 v3.2d, v25.2d, v3.2d
++ trn1 v1.2d, v26.2d, v1.2d
++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
++ uaddw v0.8h, v0.8h, v5.8b
++ uaddw v2.8h, v2.8h, v6.8b
++ uaddw v3.8h, v3.8h, v7.8b
++ uaddw v1.8h, v1.8h, v4.8b
++ sqxtun v0.8b, v0.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ sqxtun v1.8b, v1.8h
++ st1 {v0.s}[0], [x4], x1
++ st1 {v2.s}[0], [x4], x1
++ st1 {v3.s}[0], [x4], x1
++ st1 {v1.s}[0], [x4], x1
++ st1 {v0.s}[1], [x4], x1
++ st1 {v2.s}[1], [x4], x1
++ st1 {v3.s}[1], [x4], x1
++ st1 {v1.s}[1], [x4]
++ ret
++endfunc
++
++// VC-1 4x4 inverse transform: a 4-point pass along each row, then a 4-point pass along each column, then the result is added to the samples
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block. x0, x2-x4, v0-v7 and v16 are overwritten
++function ff_vc1_inv_trans_4x4_neon, export=1
++ mov x3, #16 // coefficient row stride in bytes (8 coefficients of 16 bits)
++ ldr d0, .Lcoeffs_it4 // multipliers, used below as h[0] = 10/2, h[1] = 22/2, h[2] = 17
++ mov x4, x0 // keep the start of the sample array for the stores at the end
++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
++ ld1 {v4.d}[0], [x2] // 30 31 32 33
++ ld1 {v5.s}[0], [x0], x1 // sample row 0
++ ld1 {v5.s}[1], [x0], x1 // sample row 1
++ ld1 {v6.s}[0], [x0], x1 // sample row 2
++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13
++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12
++ ld1 {v6.s}[1], [x0] // sample row 3
++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33
++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31
++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3]
++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2]
++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2]
++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2]
++ neg v7.4h, v3.4h // -t3/2
++ neg v16.4h, v4.4h // -t4/2
++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1
++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1
++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3 (rounding shift by 2 applied to the value already shifted by 1)
++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3
++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3
++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23
++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23
++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24]
++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24]
++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16]
++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] (the multipliers in v0 are no longer needed)
++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16]
++ neg v3.4h, v2.4h // -t4/2
++ neg v7.4h, v4.4h // -t3/2
++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1
++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1
++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1
++ trn1 v0.2d, v4.2d, v3.2d // output rows 0 and 1 packed into one vector
++ trn1 v1.2d, v2.2d, v7.2d // output rows 2 and 3 packed into one vector
++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
++ uaddw v0.8h, v0.8h, v5.8b // add sample rows 0 and 1 (zero-extended to 16 bits)
++ uaddw v1.8h, v1.8h, v6.8b // add sample rows 2 and 3 (zero-extended to 16 bits)
++ sqxtun v0.8b, v0.8h // narrow to 8 bits with saturation to 0..255
++ sqxtun v1.8b, v1.8h // narrow to 8 bits with saturation to 0..255
++ st1 {v0.s}[0], [x4], x1 // row 0
++ st1 {v0.s}[1], [x4], x1 // row 1
++ st1 {v1.s}[0], [x4], x1 // row 2
++ st1 {v1.s}[1], [x4] // row 3
++ ret
++endfunc
++
++// VC-1 8x8 inverse transform, DC case: a single scaled DC value is added to all 8 rows of 8 samples
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block. x0, x2, x3, v0-v7 and v16 are overwritten
++function ff_vc1_inv_trans_8x8_dc_neon, export=1
++ ldrsh w2, [x2] // dc, sign-extended from 16 bits
++ mov x3, x0 // separate pointer for the stores, x0 is advanced by the loads
++ ld1 {v0.8b}, [x0], x1 // row 0
++ ld1 {v1.8b}, [x0], x1 // row 1
++ ld1 {v2.8b}, [x0], x1 // row 2
++ add w2, w2, w2, lsl #1 // 3 * dc
++ ld1 {v3.8b}, [x0], x1 // row 3
++ ld1 {v4.8b}, [x0], x1 // row 4
++ add w2, w2, #1 // 3 * dc + 1
++ ld1 {v5.8b}, [x0], x1 // row 5
++ asr w2, w2, #1 // dc1 = (3 * dc + 1) >> 1
++ ld1 {v6.8b}, [x0], x1 // row 6
++ add w2, w2, w2, lsl #1 // 3 * dc1
++ ld1 {v7.8b}, [x0] // row 7
++ add w0, w2, #16 // 3 * dc1 + 16 (x0 is free now that all rows are loaded)
++ asr w0, w0, #5 // dc2 = (3 * dc1 + 16) >> 5
++ dup v16.8h, w0 // dc2 in every 16-bit lane
++ uaddw v0.8h, v16.8h, v0.8b // row 0 + dc2 (samples zero-extended to 16 bits)
++ uaddw v1.8h, v16.8h, v1.8b // row 1 + dc2
++ uaddw v2.8h, v16.8h, v2.8b // row 2 + dc2
++ uaddw v3.8h, v16.8h, v3.8b // row 3 + dc2
++ uaddw v4.8h, v16.8h, v4.8b // row 4 + dc2
++ uaddw v5.8h, v16.8h, v5.8b // row 5 + dc2
++ sqxtun v0.8b, v0.8h // row 0 narrowed with saturation to 0..255
++ uaddw v6.8h, v16.8h, v6.8b // row 6 + dc2
++ sqxtun v1.8b, v1.8h // row 1 narrowed
++ uaddw v7.8h, v16.8h, v7.8b // row 7 + dc2
++ sqxtun v2.8b, v2.8h // row 2 narrowed
++ sqxtun v3.8b, v3.8h // row 3 narrowed
++ sqxtun v4.8b, v4.8h // row 4 narrowed
++ st1 {v0.8b}, [x3], x1 // store row 0
++ sqxtun v0.8b, v5.8h // row 5 narrowed, reusing v0 now that row 0 has been stored
++ st1 {v1.8b}, [x3], x1 // store row 1
++ sqxtun v1.8b, v6.8h // row 6 narrowed, reusing v1 now that row 1 has been stored
++ st1 {v2.8b}, [x3], x1 // store row 2
++ sqxtun v2.8b, v7.8h // row 7 narrowed, reusing v2 now that row 2 has been stored
++ st1 {v3.8b}, [x3], x1 // store row 3
++ st1 {v4.8b}, [x3], x1 // store row 4
++ st1 {v0.8b}, [x3], x1 // store row 5
++ st1 {v1.8b}, [x3], x1 // store row 6
++ st1 {v2.8b}, [x3] // store row 7
++ ret
++endfunc
++
++// VC-1 8x4 inverse transform, DC case: a single scaled DC value is added to 4 rows of 8 samples
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block. x0, x2, x3 and v0-v4 are overwritten
++function ff_vc1_inv_trans_8x4_dc_neon, export=1
++ ldrsh w2, [x2] // dc, sign-extended from 16 bits
++ mov x3, x0 // separate pointer for the stores, x0 is advanced by the loads
++ ld1 {v0.8b}, [x0], x1 // row 0
++ ld1 {v1.8b}, [x0], x1 // row 1
++ ld1 {v2.8b}, [x0], x1 // row 2
++ add w2, w2, w2, lsl #1 // 3 * dc
++ ld1 {v3.8b}, [x0] // row 3
++ add w0, w2, #1 // 3 * dc + 1 (x0 is free now that all rows are loaded)
++ asr w0, w0, #1 // dc1 = (3 * dc + 1) >> 1
++ add w0, w0, w0, lsl #4 // 17 * dc1
++ add w0, w0, #64 // 17 * dc1 + 64
++ asr w0, w0, #7 // dc2 = (17 * dc1 + 64) >> 7
++ dup v4.8h, w0 // dc2 in every 16-bit lane
++ uaddw v0.8h, v4.8h, v0.8b // row 0 + dc2 (samples zero-extended to 16 bits)
++ uaddw v1.8h, v4.8h, v1.8b // row 1 + dc2
++ uaddw v2.8h, v4.8h, v2.8b // row 2 + dc2
++ uaddw v3.8h, v4.8h, v3.8b // row 3 + dc2
++ sqxtun v0.8b, v0.8h // narrow each row with saturation to 0..255
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.8b}, [x3], x1 // store row 0
++ st1 {v1.8b}, [x3], x1 // store row 1
++ st1 {v2.8b}, [x3], x1 // store row 2
++ st1 {v3.8b}, [x3] // store row 3
++ ret
++endfunc
++
++// VC-1 4x8 inverse transform, DC case: a single scaled DC value is added to 8 rows of 4 samples
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block. x0, x2, x3 and v0-v4 are overwritten
++function ff_vc1_inv_trans_4x8_dc_neon, export=1
++ ldrsh w2, [x2] // dc, sign-extended from 16 bits
++ mov x3, x0 // separate pointer for the stores, x0 is advanced by the loads
++ ld1 {v0.s}[0], [x0], x1 // row 0
++ ld1 {v1.s}[0], [x0], x1 // row 1
++ ld1 {v2.s}[0], [x0], x1 // row 2
++ add w2, w2, w2, lsl #4 // 17 * dc
++ ld1 {v3.s}[0], [x0], x1 // row 3
++ add w2, w2, #4 // 17 * dc + 4
++ asr w2, w2, #3 // dc1 = (17 * dc + 4) >> 3
++ add w2, w2, w2, lsl #1 // 3 * dc1
++ ld1 {v0.s}[1], [x0], x1 // row 4, packed next to row 0
++ add w2, w2, #16 // 3 * dc1 + 16
++ asr w2, w2, #5 // dc2 = (3 * dc1 + 16) >> 5
++ dup v4.8h, w2 // dc2 in every 16-bit lane
++ ld1 {v1.s}[1], [x0], x1 // row 5, packed next to row 1
++ ld1 {v2.s}[1], [x0], x1 // row 6, packed next to row 2
++ ld1 {v3.s}[1], [x0] // row 7, packed next to row 3
++ uaddw v0.8h, v4.8h, v0.8b // rows 0 and 4 + dc2 (samples zero-extended to 16 bits)
++ uaddw v1.8h, v4.8h, v1.8b // rows 1 and 5 + dc2
++ uaddw v2.8h, v4.8h, v2.8b // rows 2 and 6 + dc2
++ uaddw v3.8h, v4.8h, v3.8b // rows 3 and 7 + dc2
++ sqxtun v0.8b, v0.8h // narrow each pair of rows with saturation to 0..255
++ sqxtun v1.8b, v1.8h
++ sqxtun v2.8b, v2.8h
++ sqxtun v3.8b, v3.8h
++ st1 {v0.s}[0], [x3], x1 // store row 0
++ st1 {v1.s}[0], [x3], x1 // store row 1
++ st1 {v2.s}[0], [x3], x1 // store row 2
++ st1 {v3.s}[0], [x3], x1 // store row 3
++ st1 {v0.s}[1], [x3], x1 // store row 4
++ st1 {v1.s}[1], [x3], x1 // store row 5
++ st1 {v2.s}[1], [x3], x1 // store row 6
++ st1 {v3.s}[1], [x3] // store row 7
++ ret
++endfunc
++
++// VC-1 4x4 inverse transform, DC case: a single scaled DC value is added to 4 rows of 4 samples
++// On entry:
++// x0 -> array of 8-bit samples, in row-major order
++// x1 = row stride for 8-bit sample array
++// x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++// array at x0 updated by saturated addition of (narrowed) transformed block. x0, x2, x3 and v0-v2 are overwritten
++function ff_vc1_inv_trans_4x4_dc_neon, export=1
++ ldrsh w2, [x2] // dc, sign-extended from 16 bits
++ mov x3, x0 // separate pointer for the stores, x0 is advanced by the loads
++ ld1 {v0.s}[0], [x0], x1 // row 0
++ ld1 {v1.s}[0], [x0], x1 // row 1
++ ld1 {v0.s}[1], [x0], x1 // row 2, packed next to row 0
++ add w2, w2, w2, lsl #4 // 17 * dc
++ ld1 {v1.s}[1], [x0] // row 3, packed next to row 1
++ add w0, w2, #4 // 17 * dc + 4 (x0 is free now that all rows are loaded)
++ asr w0, w0, #3 // dc1 = (17 * dc + 4) >> 3
++ add w0, w0, w0, lsl #4 // 17 * dc1
++ add w0, w0, #64 // 17 * dc1 + 64
++ asr w0, w0, #7 // dc2 = (17 * dc1 + 64) >> 7
++ dup v2.8h, w0 // dc2 in every 16-bit lane
++ uaddw v0.8h, v2.8h, v0.8b // rows 0 and 2 + dc2 (samples zero-extended to 16 bits)
++ uaddw v1.8h, v2.8h, v1.8b // rows 1 and 3 + dc2
++ sqxtun v0.8b, v0.8h // narrow each pair of rows with saturation to 0..255
++ sqxtun v1.8b, v1.8h
++ st1 {v0.s}[0], [x3], x1 // store row 0
++ st1 {v1.s}[0], [x3], x1 // store row 1
++ st1 {v0.s}[1], [x3], x1 // store row 2
++ st1 {v1.s}[1], [x3] // store row 3
++ ret
++endfunc
++
++.align 5 // 2^5 = 32-byte alignment for the constant tables below
++.Lcoeffs_it8: // 16-bit multipliers, lowest halfword first: h[0] = 3, h[1] = 9, h[2] = 15
++.quad 0x000F00090003
++.Lcoeffs_it4: // h[0] = 5, h[1] = 11, h[2] = 17. Kept directly after .Lcoeffs_it8, presumably so that a 128-bit load of that label sees these as h[4]..h[6] (TODO confirm against the routines that index h[4]..h[6])
++.quad 0x0011000B0005
++.Lcoeffs: // loop filter multipliers: h[0] = 2, h[1] = 5
++.quad 0x00050002
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks (P1..P4 are the 4 rows above the boundary, P5..P8 the 4 rows below, only P4 and P5 are rewritten)
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> P1 row, 4 rows above x0 (only the low 32 bits of the stride are used here, sign-extended)
++ ldr d0, .Lcoeffs // multipliers, used below as h[0] = 2, h[1] = 5
++ ld1 {v1.s}[0], [x0], x1 // P5
++ ld1 {v2.s}[0], [x3], x1 // P1
++ ld1 {v3.s}[0], [x3], x1 // P2
++ ld1 {v4.s}[0], [x0], x1 // P6
++ ld1 {v5.s}[0], [x3], x1 // P3
++ ld1 {v6.s}[0], [x0], x1 // P7
++ ld1 {v7.s}[0], [x3] // P4 (no post-increment, so x3 stays on the P4 row for the stores)
++ ld1 {v16.s}[0], [x0] // P8
++ ushll v17.8h, v1.8b, #1 // 2*P5
++ dup v18.8h, w2 // pq
++ ushll v2.8h, v2.8b, #1 // 2*P1
++ uxtl v3.8h, v3.8b // P2
++ uxtl v4.8h, v4.8b // P6
++ uxtl v19.8h, v5.8b // P3
++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2
++ uxtl v3.8h, v6.8b // P7
++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6
++ ushll v5.8h, v5.8b, #1 // 2*P3
++ uxtl v6.8h, v7.8b // P4
++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v3.8h, v16.8b // P8
++ mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3
++ uxtl v1.8h, v1.8b // P5
++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4
++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ sub v3.4h, v6.4h, v1.4h // P4-P5
++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5
++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ abs v4.4h, v3.4h // abs(P4-P5)
++ srshr v7.4h, v17.4h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ srshr v2.4h, v2.4h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ sshr v4.4h, v4.4h, #1 // clip = abs(P4-P5)>>1
++ srshr v5.4h, v5.4h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ abs v7.4h, v7.4h // a2
++ sshr v3.4h, v3.4h, #8 // clip_sign (0 or -1)
++ abs v2.4h, v2.4h // a1
++ cmeq v16.4h, v4.4h, #0 // test clip == 0
++ abs v17.4h, v5.4h // a0
++ sshr v5.4h, v5.4h, #8 // a0_sign (0 or -1)
++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2
++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq
++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign
++ bsl v19.8b, v7.8b, v2.8b // a3 = min(a1, a2)
++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq
++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w0, v5.s[1] // move 16-bit lanes 2 and 3 of the test result to a gp reg
++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ cmhs v5.4h, v0.4h, v4.4h // test d >= clip
++ tbnz w0, #0, 1f // bit 0 comes from lane 2: none of the 4 pixel pairs should be updated if pixel pair 2 (counting from 0) is not filtered
++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip)
++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v6.8h // P4 narrowed with saturation to 0..255
++ sqxtun v1.8b, v1.8h // P5 narrowed with saturation to 0..255
++ st1 {v0.s}[0], [x3], x1 // write back the P4 row
++ st1 {v1.s}[0], [x3] // write back the P5 row
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks (P1..P4 are the 4 columns left of the boundary, P5..P8 the 4 columns right of it, only P4 and P5 are rewritten)
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // multipliers, used below as h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1 // P1[1], P2[1]...
++ ld1 {v3.8b}, [x3], x1 // P1[2], P2[2]...
++ ld1 {v4.8b}, [x3] // P1[3], P2[3]...
++ dup v5.8h, w2 // pq
++ trn1 v6.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ trn1 v4.4h, v6.4h, v2.4h // P1, P5
++ trn1 v7.4h, v1.4h, v3.4h // P2, P6
++ trn2 v2.4h, v6.4h, v2.4h // P3, P7
++ trn2 v1.4h, v1.4h, v3.4h // P4, P8
++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5
++ uxtl v6.8h, v7.8b // P2, P6
++ uxtl v7.8h, v2.8b // P3, P7
++ uxtl v1.8h, v1.8b // P4, P8
++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6
++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7
++ uxtl v4.8h, v4.8b // P1, P5
++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++ mov d6, v6.d[1] // P6
++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++ mov d4, v4.d[1] // P5
++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4
++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5
++ sub v7.4h, v1.4h, v4.4h // P4-P5
++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3, (2*P5-5*P6+5*P7-2*P8+4)>>3
++ abs v6.4h, v7.4h // abs(P4-P5)
++ sshr v7.4h, v7.4h, #8 // clip_sign (0 or -1)
++ srshr v2.4h, v2.4h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ abs v3.8h, v3.8h // a1, a2
++ sshr v6.4h, v6.4h, #1 // clip = abs(P4-P5)>>1
++ mov d16, v3.d[1] // a2
++ abs v17.4h, v2.4h // a0
++ cmeq v18.4h, v6.4h, #0 // test clip == 0
++ sshr v2.4h, v2.4h, #8 // a0_sign (0 or -1)
++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2
++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq
++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign
++ bsl v19.8b, v16.8b, v3.8b // a3 = min(a1, a2)
++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq
++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0
++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w2, v5.s[1] // move 16-bit lanes 2 and 3 of the test result to a gp reg
++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ cmhs v5.4h, v0.4h, v6.4h // test d >= clip
++ tbnz w2, #0, 1f // bit 0 comes from lane 2: none of the 4 pixel pairs should be updated if pixel pair 2 (counting from 0) is not filtered
++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip)
++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ sqxtun v3.8b, v4.8h // P5 narrowed with saturation to 0..255
++ sqxtun v2.8b, v1.8h // P4 narrowed with saturation to 0..255
++ st2 {v2.b, v3.b}[0], [x0], x1 // row 0: write P4 and P5 as adjacent bytes
++ st2 {v2.b, v3.b}[1], [x0], x1 // row 1
++ st2 {v2.b, v3.b}[2], [x0], x1 // row 2
++ st2 {v2.b, v3.b}[3], [x0] // row 3
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks (P1..P4 are the 4 rows above the boundary, P5..P8 the 4 rows below, only P4 and P5 are rewritten)
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> P1 row, 4 rows above x0 (only the low 32 bits of the stride are used here, sign-extended)
++ ldr d0, .Lcoeffs // multipliers, used below as h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x0], x1 // P5
++ movi v2.2d, #0x0000ffff00000000 // mask selecting 16-bit lane 2 within each group of 4 lanes
++ ld1 {v3.8b}, [x3], x1 // P1
++ ld1 {v4.8b}, [x3], x1 // P2
++ ld1 {v5.8b}, [x0], x1 // P6
++ ld1 {v6.8b}, [x3], x1 // P3
++ ld1 {v7.8b}, [x0], x1 // P7
++ ushll v16.8h, v1.8b, #1 // 2*P5
++ ushll v3.8h, v3.8b, #1 // 2*P1
++ ld1 {v17.8b}, [x3] // P4
++ uxtl v4.8h, v4.8b // P2
++ ld1 {v18.8b}, [x0] // P8
++ uxtl v5.8h, v5.8b // P6
++ dup v19.8h, w2 // pq
++ uxtl v20.8h, v6.8b // P3
++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2
++ uxtl v4.8h, v7.8b // P7
++ ushll v6.8h, v6.8b, #1 // 2*P3
++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6
++ uxtl v7.8h, v17.8b // P4
++ uxtl v17.8h, v18.8b // P8
++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v1.8h, v1.8b // P5
++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3
++ sub v4.8h, v7.8h, v1.8h // P4-P5
++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4
++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ abs v17.8h, v4.8h // abs(P4-P5)
++ sshr v4.8h, v4.8h, #8 // clip_sign
++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ sshr v17.8h, v17.8h, #1 // clip
++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5
++ srshr v16.8h, v16.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ cmeq v5.8h, v17.8h, #0 // test clip == 0
++ srshr v3.8h, v3.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ abs v16.8h, v16.8h // a2
++ abs v3.8h, v3.8h // a1
++ srshr v6.8h, v6.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2
++ abs v20.8h, v6.8h // a0
++ sshr v6.8h, v6.8h, #8 // a0_sign
++ bsl v18.16b, v16.16b, v3.16b // a3
++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq
++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign
++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0
++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq
++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
++ cmtst v2.2d, v5.2d, v2.2d // if pixel pair 2 (counting from 0) of each group of 4 is not filtered, then none of the others in the group should be either
++ mov w0, v5.s[1] // move 16-bit lanes 2 and 3 of the test result to a gp reg
++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ mov w2, v5.s[3] // move 16-bit lanes 6 and 7 of the test result to a gp reg
++ orr v2.16b, v3.16b, v2.16b // test clip == 0 || a0 >= pq || whole group of 4 is not filtered
++ cmhs v3.8h, v0.8h, v17.8h // test d >= clip
++ and w0, w0, w2 // bit 0 is set only if pixel pairs 2 and 6 are both not filtered
++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case
++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered
++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v7.8h // P4 narrowed with saturation to 0..255
++ sqxtun v1.8b, v1.8h // P5 narrowed with saturation to 0..255
++ st1 {v0.8b}, [x3], x1 // write back the P4 row (x3 was left pointing at it by the loads)
++ st1 {v1.8b}, [x3] // write back the P5 row
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks (P1..P4 are the 4 columns left of the boundary, P5..P8 the 4 columns right of it, only P4 and P5 are rewritten)
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // multipliers, used below as h[0] = 2, h[1] = 5
++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1 // P1[1], P2[1]...
++ add x4, x0, x1, lsl #2 // where to start writing rows 4-7
++ ld1 {v3.8b}, [x3], x1 // P1[2], P2[2]...
++ ld1 {v4.8b}, [x3], x1 // P1[3], P2[3]...
++ ld1 {v5.8b}, [x3], x1 // P1[4], P2[4]...
++ ld1 {v6.8b}, [x3], x1 // P1[5], P2[5]...
++ ld1 {v7.8b}, [x3], x1 // P1[6], P2[6]...
++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ ld1 {v17.8b}, [x3] // P1[7], P2[7]...
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ dup v4.8h, w2 // pq
++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
++ trn1 v7.2s, v6.2s, v3.2s // P1
++ trn1 v18.2s, v19.2s, v16.2s // P2
++ trn2 v3.2s, v6.2s, v3.2s // P5
++ trn2 v6.2s, v19.2s, v16.2s // P6
++ trn1 v16.2s, v2.2s, v17.2s // P3
++ trn2 v2.2s, v2.2s, v17.2s // P7
++ ushll v7.8h, v7.8b, #1 // 2*P1
++ trn1 v17.2s, v1.2s, v5.2s // P4
++ ushll v19.8h, v3.8b, #1 // 2*P5
++ trn2 v1.2s, v1.2s, v5.2s // P8
++ uxtl v5.8h, v18.8b // P2
++ uxtl v6.8h, v6.8b // P6
++ uxtl v18.8h, v16.8b // P3
++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2
++ uxtl v2.8h, v2.8b // P7
++ ushll v5.8h, v16.8b, #1 // 2*P3
++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6
++ uxtl v16.8h, v17.8b // P4
++ uxtl v1.8h, v1.8b // P8
++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7
++ uxtl v2.8h, v3.8b // P5
++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3
++ sub v3.8h, v16.8h, v2.8h // P4-P5
++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4
++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++ abs v1.8h, v3.8h // abs(P4-P5)
++ sshr v3.8h, v3.8h, #8 // clip_sign
++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
++ sshr v1.8h, v1.8h, #1 // clip
++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5
++ srshr v17.8h, v19.8h, #3 // (2*P5-5*P6+5*P7-2*P8+4)>>3
++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6
++ cmeq v6.8h, v1.8h, #0 // test clip == 0
++ srshr v7.8h, v7.8h, #3 // (2*P1-5*P2+5*P3-2*P4+4)>>3
++ abs v17.8h, v17.8h // a2
++ abs v7.8h, v7.8h // a1
++ srshr v5.8h, v5.8h, #3 // (2*P3-5*P4+5*P5-2*P6+4)>>3
++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2
++ abs v19.8h, v5.8h // a0
++ sshr v5.8h, v5.8h, #8 // a0_sign
++ bsl v18.16b, v17.16b, v7.16b // a3
++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq
++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign
++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0
++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq
++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0
++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0
++ mov w2, v5.s[1] // move 16-bit lanes 2 and 3 of the test result to a gp reg
++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ mov w3, v5.s[3] // move 16-bit lanes 6 and 7 of the test result to a gp reg (x3 is no longer needed as the read pointer)
++ cmhs v5.8h, v0.8h, v1.8h // test d >= clip
++ and w5, w2, w3 // bit 0 is set only if pixel pairs 2 and 6 are both not filtered
++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip)
++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case
++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ sqxtun v1.8b, v2.8h // P5 narrowed with saturation to 0..255
++ sqxtun v0.8b, v16.8h // P4 narrowed with saturation to 0..255
++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so
++ st2 {v0.b, v1.b}[0], [x0], x1 // row 0: write P4 and P5 as adjacent bytes
++ st2 {v0.b, v1.b}[1], [x0], x1 // row 1
++ st2 {v0.b, v1.b}[2], [x0], x1 // row 2
++ st2 {v0.b, v1.b}[3], [x0] // row 3
++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so
++ st2 {v0.b, v1.b}[4], [x4], x1 // row 4
++ st2 {v0.b, v1.b}[5], [x4], x1 // row 5
++ st2 {v0.b, v1.b}[6], [x4], x1 // row 6
++ st2 {v0.b, v1.b}[7], [x4] // row 7
++2: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks (P1..P4 are the 4 rows above the boundary, P5..P8 the 4 rows below, only P4 and P5 are rewritten)
++// On entry:
++// x0 -> top-left pel of lower block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++ sub x3, x0, w1, sxtw #2 // x3 -> P1 row, 4 rows above x0 (only the low 32 bits of the stride are used here, sign-extended)
++ ldr d0, .Lcoeffs // multipliers, used below as h[0] = 2, h[1] = 5
++ ld1 {v1.16b}, [x0], x1 // P5
++ movi v2.2d, #0x0000ffff00000000 // mask selecting 16-bit lane 2 within each group of 4 lanes
++ ld1 {v3.16b}, [x3], x1 // P1
++ ld1 {v4.16b}, [x3], x1 // P2
++ ld1 {v5.16b}, [x0], x1 // P6
++ ld1 {v6.16b}, [x3], x1 // P3
++ ld1 {v7.16b}, [x0], x1 // P7
++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7]
++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7]
++ ld1 {v18.16b}, [x3] // P4
++ uxtl v19.8h, v4.8b // P2[0..7]
++ ld1 {v20.16b}, [x0] // P8
++ uxtl v21.8h, v5.8b // P6[0..7]
++ dup v22.8h, w2 // pq
++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15]
++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15]
++ uxtl2 v4.8h, v4.16b // P2[8..15]
++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++ uxtl2 v5.8h, v5.16b // P6[8..15]
++ uxtl v23.8h, v6.8b // P3[0..7]
++ uxtl v24.8h, v7.8b // P7[0..7]
++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7]
++ uxtl v25.8h, v18.8b // P4[0..7]
++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++ uxtl2 v26.8h, v6.16b // P3[8..15]
++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ uxtl2 v7.8h, v7.16b // P7[8..15]
++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15]
++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ uxtl2 v18.8h, v18.16b // P4[8..15]
++ uxtl v23.8h, v20.8b // P8[0..7]
++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
++ uxtl v24.8h, v1.8b // P5[0..7]
++ uxtl2 v20.8h, v20.16b // P8[8..15]
++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ uxtl2 v1.8h, v1.16b // P5[8..15]
++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7]
++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15]
++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
++ abs v27.8h, v26.8h // abs(P4[0..7]-P5[0..7])
++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7]
++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ abs v28.8h, v7.8h // abs(P4[8..15]-P5[8..15])
++ sshr v27.8h, v27.8h, #1 // clip[0..7]
++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ sshr v7.8h, v7.8h, #8 // clip_sign[8..15]
++ sshr v23.8h, v28.8h, #1 // clip[8..15]
++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0
++ srshr v17.8h, v17.8h, #3 // (2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]+4)>>3
++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0
++ srshr v16.8h, v16.8h, #3 // (2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]+4)>>3
++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ abs v17.8h, v17.8h // a1[0..7]
++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ srshr v3.8h, v3.8h, #3 // (2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]+4)>>3
++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ abs v16.8h, v16.8h // a2[0..7]
++ srshr v19.8h, v19.8h, #3 // (2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]+4)>>3
++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7]
++ abs v3.8h, v3.8h // a1[8..15]
++ srshr v4.8h, v4.8h, #3 // (2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]+4)>>3
++ abs v19.8h, v19.8h // a2[8..15]
++ bsl v5.16b, v16.16b, v17.16b // a3[0..7]
++ srshr v6.8h, v6.8h, #3 // (2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]+4)>>3
++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15]
++ abs v17.8h, v4.8h // a0[0..7]
++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7]
++ bsl v16.16b, v19.16b, v3.16b // a3[8..15]
++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ abs v19.8h, v6.8h // a0[8..15]
++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq
++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7]
++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7]
++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15]
++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq
++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15]
++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15]
++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++ cmtst v17.2d, v5.2d, v2.2d // if pixel pair 2 (counting from 0) of each group of 4 is not filtered, then none of the others in the group should be either
++ mov w0, v5.s[1] // move 16-bit lanes 2 and 3 of the [0..7] test result to a gp reg
++ cmhs v19.8h, v3.8h, v27.8h // test d[0..7] >= clip[0..7]
++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ mov w2, v5.s[3] // move 16-bit lanes 6 and 7 of the [0..7] test result to a gp reg
++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ orr v16.16b, v20.16b, v17.16b // test clip[0..7] == 0 || a0[0..7] >= pq || whole group of 4 is not filtered
++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
++ cmtst v2.2d, v5.2d, v2.2d // same group-of-4 test as above, for pixel pairs 8..15
++ cmhs v3.8h, v0.8h, v23.8h // test d[8..15] >= clip[8..15]
++ mov w4, v5.s[1] // move 16-bit lanes 2 and 3 of the [8..15] test result to a gp reg
++ mov w5, v5.s[3] // move 16-bit lanes 6 and 7 of the [8..15] test result to a gp reg
++ and w0, w0, w2 // bit 0 is set only if pixel pairs 2 and 6 are both not filtered
++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || whole group of 4 is not filtered
++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++ and w2, w4, w5 // bit 0 is set only if pixel pairs 10 and 14 are both not filtered
++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++ and w0, w0, w2 // bit 0 is set only if all four groups of 4 are not filtered
++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++ sqxtun v2.8b, v25.8h // P4[0..7] narrowed with saturation to 0..255
++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case
++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++ sqxtun v0.8b, v24.8h // P5[0..7] narrowed with saturation to 0..255
++ sqxtun2 v2.16b, v18.8h // P4[8..15] narrowed into the upper half
++ sqxtun2 v0.16b, v1.8h // P5[8..15] narrowed into the upper half
++ st1 {v2.16b}, [x3], x1 // write back the P4 row (x3 was left pointing at it by the loads)
++ st1 {v0.16b}, [x3] // write back the P5 row
++1: ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++// x0 -> top-left pel of right block
++// x1 = row stride, bytes
++// w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++ sub x3, x0, #4 // where to start reading
++ ldr d0, .Lcoeffs // constants: going by the comments below, v0.h[0] = 2 and v0.h[1] = 5 (.Lcoeffs is defined outside this function)
++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]...
++ sub x0, x0, #1 // where to start writing
++ ld1 {v2.8b}, [x3], x1
++ add x4, x0, x1, lsl #3 // write pointer for rows 8..11
++ ld1 {v3.8b}, [x3], x1
++ add x5, x0, x1, lsl #2 // write pointer for rows 4..7
++ ld1 {v4.8b}, [x3], x1
++ add x6, x4, x1, lsl #2 // write pointer for rows 12..15
++ ld1 {v5.8b}, [x3], x1
++ ld1 {v6.8b}, [x3], x1
++ ld1 {v7.8b}, [x3], x1
++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
++ ld1 {v17.8b}, [x3], x1
++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]...
++ ld1 {v2.8b}, [x3], x1
++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
++ ld1 {v19.8b}, [x3], x1
++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
++ ld1 {v4.8b}, [x3], x1
++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]...
++ ld1 {v21.8b}, [x3], x1
++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]...
++ ld1 {v6.8b}, [x3], x1
++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]...
++ ld1 {v23.8b}, [x3], x1
++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]...
++ ld1 {v17.8b}, [x3], x1
++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]...
++ ld1 {v25.8b}, [x3]
++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]...
++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]...
++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]...
++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]...
++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]...
++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]...
++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]...
++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]...
++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]...
++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7]
++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7]
++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7]
++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7]
++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]...
++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]...
++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]...
++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]...
++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15]
++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15]
++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15]
++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15]
++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]...
++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]...
++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]...
++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7]
++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7]
++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7]
++ uxtl v17.8h, v27.8b // P2[0..7]
++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7]
++ uxtl v20.8h, v21.8b // P6[0..7]
++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15]
++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15]
++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15]
++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15]
++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7]
++ uxtl v26.8h, v26.8b // P2[8..15]
++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++ uxtl v17.8h, v18.8b // P6[8..15]
++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15]
++ uxtl v28.8h, v7.8b // P3[0..7]
++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++ uxtl v16.8h, v16.8b // P7[0..7]
++ uxtl v26.8h, v21.8b // P3[8..15]
++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++ uxtl v22.8h, v22.8b // P7[8..15]
++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7]
++ uxtl v27.8h, v27.8b // P4[0..7]
++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7]
++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15]
++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15]
++ uxtl v4.8h, v18.8b // P4[8..15]
++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ uxtl v1.8h, v1.8b // P8[0..7]
++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ uxtl v2.8h, v2.8b // P8[8..15]
++ uxtl v16.8h, v19.8b // P5[0..7]
++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ uxtl v18.8h, v23.8b // P5[8..15]
++ dup v19.8h, w2 // pq
++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7]
++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15]
++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]
++ abs v23.8h, v21.8h // |P4[0..7]-P5[0..7]|
++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
++ abs v26.8h, v22.8h // |P4[8..15]-P5[8..15]|
++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7]
++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ sshr v23.8h, v23.8h, #1 // clip[0..7]
++ sshr v26.8h, v26.8h, #1 // clip[8..15]
++ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15]
++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0
++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0
++ srshr v5.8h, v5.8h, #3 // rounding shift, i.e. (x+4)>>3
++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ srshr v2.8h, v6.8h, #3 // rounding shift, i.e. (x+4)>>3
++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ srshr v6.8h, v24.8h, #3 // rounding shift, i.e. (x+4)>>3
++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ abs v5.8h, v5.8h // a1[0..7]
++ srshr v24.8h, v25.8h, #3 // rounding shift, i.e. (x+4)>>3
++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ abs v2.8h, v2.8h // a2[0..7]
++ abs v6.8h, v6.8h // a1[8..15]
++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ abs v17.8h, v24.8h // a2[8..15]
++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7]
++ srshr v3.8h, v3.8h, #3 // rounding shift, i.e. (x+4)>>3
++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8..15]
++ srshr v7.8h, v7.8h, #3 // rounding shift, i.e. (x+4)>>3
++ bsl v20.16b, v2.16b, v5.16b // a3[0..7]
++ abs v2.8h, v3.8h // a0[8..15]
++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15]
++ bsl v24.16b, v17.16b, v6.16b // a3[8..15]
++ abs v5.8h, v7.8h // a0[0..7]
++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7]
++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq
++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15]
++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15]
++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq
++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7]
++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7]
++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ mov w7, v2.s[1] // move to gp reg (bit 0 = "do not filter" flag of pair 10)
++ mov w8, v2.s[3] // move to gp reg (bit 0 = "do not filter" flag of pair 14)
++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ mov w2, v5.s[1] // move to gp reg
++ cmhs v2.8h, v3.8h, v26.8h // test d[8..15] >= clip[8..15]
++ mov w3, v5.s[3] // move to gp reg (bit 0 = "do not filter" flag of pair 6)
++ cmhs v5.8h, v0.8h, v23.8h // test d[0..7] >= clip[0..7]
++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
++ and w9, w7, w8 // combine the flags of pairs 10 and 14
++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
++ and w10, w2, w3 // combine the flags of pairs 2 and 6
++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ and w9, w10, w9 // bit 0 set only if all four tested pairs are flagged as not filtered
++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case
++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++ sqxtun v2.8b, v4.8h // narrow P4[8..15] to bytes with unsigned saturation
++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++ sqxtun v0.8b, v27.8h // narrow P4[0..7] to bytes with unsigned saturation
++ sqxtun v1.8b, v16.8h // narrow P5[0..7] to bytes with unsigned saturation
++ sqxtun v3.8b, v18.8h // narrow P5[8..15] to bytes with unsigned saturation
++ tbnz w2, #0, 1f // skip rows 0..3 if pair 2 is flagged as not filtered
++ st2 {v0.b, v1.b}[0], [x0], x1 // store the P4, P5 byte pair of row 0, then step down one row
++ st2 {v0.b, v1.b}[1], [x0], x1
++ st2 {v0.b, v1.b}[2], [x0], x1
++ st2 {v0.b, v1.b}[3], [x0]
++1: tbnz w3, #0, 2f // skip rows 4..7 if pair 6 is flagged as not filtered
++ st2 {v0.b, v1.b}[4], [x5], x1
++ st2 {v0.b, v1.b}[5], [x5], x1
++ st2 {v0.b, v1.b}[6], [x5], x1
++ st2 {v0.b, v1.b}[7], [x5]
++2: tbnz w7, #0, 3f // skip rows 8..11 if pair 10 is flagged as not filtered
++ st2 {v2.b, v3.b}[0], [x4], x1
++ st2 {v2.b, v3.b}[1], [x4], x1
++ st2 {v2.b, v3.b}[2], [x4], x1
++ st2 {v2.b, v3.b}[3], [x4]
++3: tbnz w8, #0, 4f // skip rows 12..15 if pair 14 is flagged as not filtered
++ st2 {v2.b, v3.b}[4], [x6], x1
++ st2 {v2.b, v3.b}[5], [x6], x1
++ st2 {v2.b, v3.b}[6], [x6], x1
++ st2 {v2.b, v3.b}[7], [x6]
++4: ret
++endfunc
++
++// Copy at most the specified number of bytes from source to destination buffer,
++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
++// On entry:
++// x0 -> source buffer
++// w1 = max number of bytes to copy
++// x2 -> destination buffer, optimally 8-byte aligned
++// On exit:
++// w0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++ // Offset by 80 to screen out cases that are too short for us to handle,
++ // and also make it easy to test for loop termination, or to determine
++ // whether we need an odd number of half-iterations of the loop.
++ subs w1, w1, #80
++ b.mi 90f // too short: return with nothing copied
++
++ // Set up useful constants
++ movi v20.4s, #3, lsl #24 // 0x03000000 in each word: the low two bits of byte 3
++ movi v21.4s, #3, lsl #16 // 0x00030000 in each word: byte 2 == 3, all other bytes 0
++
++ tst w1, #32 // bit 5 of the offset count selects which half of the loop is entered first
++ b.ne 1f
++
++ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
++ ext v25.16b, v0.16b, v1.16b, #1 // the same data starting 1 byte later
++ ext v26.16b, v0.16b, v1.16b, #2 // ... 2 bytes later
++ ext v27.16b, v0.16b, v1.16b, #3 // ... 3 bytes later
++ ext v29.16b, v1.16b, v2.16b, #1
++ ext v30.16b, v1.16b, v2.16b, #2
++ ext v31.16b, v1.16b, v2.16b, #3
++ bic v24.16b, v0.16b, v20.16b // clear the low two bits of byte 3 of each word
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v1.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ eor v24.16b, v24.16b, v21.16b // word is now 0 only if its bytes were 00 00 03 0x with x in 0..3
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0 // all-ones in each word that matched
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ add w1, w1, #32 // compensates for the 32 subtracted at 91: since this path has not stored anything yet
++ b 3f // enter at the second half of the loop
++
++1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
++ ext v25.16b, v3.16b, v4.16b, #1
++ ext v26.16b, v3.16b, v4.16b, #2
++ ext v27.16b, v3.16b, v4.16b, #3
++ ext v29.16b, v4.16b, v5.16b, #1
++ ext v30.16b, v4.16b, v5.16b, #2
++ ext v31.16b, v4.16b, v5.16b, #3
++ bic v24.16b, v3.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v4.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ // Drop through...
++2: mov v0.16b, v5.16b // first half: finish testing v3/v4 while starting the test of the next 32 bytes (v0/v1)
++ ld1 {v1.16b, v2.16b}, [x0], #32
++ cmeq v28.4s, v28.4s, #0
++ cmeq v29.4s, v29.4s, #0
++ cmeq v30.4s, v30.4s, #0
++ cmeq v31.4s, v31.4s, #0
++ orr v24.16b, v24.16b, v25.16b
++ orr v26.16b, v26.16b, v27.16b
++ orr v28.16b, v28.16b, v29.16b
++ orr v30.16b, v30.16b, v31.16b
++ ext v25.16b, v0.16b, v1.16b, #1
++ orr v22.16b, v24.16b, v26.16b
++ ext v26.16b, v0.16b, v1.16b, #2
++ ext v27.16b, v0.16b, v1.16b, #3
++ ext v29.16b, v1.16b, v2.16b, #1
++ orr v23.16b, v28.16b, v30.16b
++ ext v30.16b, v1.16b, v2.16b, #2
++ ext v31.16b, v1.16b, v2.16b, #3
++ bic v24.16b, v0.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ orr v22.16b, v22.16b, v23.16b // all eight match masks for v3/v4 combined
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v1.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ addv s22, v22.4s // sum across the vector: non-zero if any word matched
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ mov w3, v22.s[0] // move to gp reg
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ cbnz w3, 90f // a match starts within the 32 bytes in v3/v4: stop without storing them
++ st1 {v3.16b, v4.16b}, [x2], #32
++3: mov v3.16b, v2.16b // second half: finish testing v0/v1 while starting the test of the next 32 bytes (v3/v4)
++ ld1 {v4.16b, v5.16b}, [x0], #32
++ cmeq v28.4s, v28.4s, #0
++ cmeq v29.4s, v29.4s, #0
++ cmeq v30.4s, v30.4s, #0
++ cmeq v31.4s, v31.4s, #0
++ orr v24.16b, v24.16b, v25.16b
++ orr v26.16b, v26.16b, v27.16b
++ orr v28.16b, v28.16b, v29.16b
++ orr v30.16b, v30.16b, v31.16b
++ ext v25.16b, v3.16b, v4.16b, #1
++ orr v22.16b, v24.16b, v26.16b
++ ext v26.16b, v3.16b, v4.16b, #2
++ ext v27.16b, v3.16b, v4.16b, #3
++ ext v29.16b, v4.16b, v5.16b, #1
++ orr v23.16b, v28.16b, v30.16b
++ ext v30.16b, v4.16b, v5.16b, #2
++ ext v31.16b, v4.16b, v5.16b, #3
++ bic v24.16b, v3.16b, v20.16b
++ bic v25.16b, v25.16b, v20.16b
++ bic v26.16b, v26.16b, v20.16b
++ orr v22.16b, v22.16b, v23.16b // all eight match masks for v0/v1 combined
++ bic v27.16b, v27.16b, v20.16b
++ bic v28.16b, v4.16b, v20.16b
++ bic v29.16b, v29.16b, v20.16b
++ bic v30.16b, v30.16b, v20.16b
++ bic v31.16b, v31.16b, v20.16b
++ addv s22, v22.4s // sum across the vector: non-zero if any word matched
++ eor v24.16b, v24.16b, v21.16b
++ eor v25.16b, v25.16b, v21.16b
++ eor v26.16b, v26.16b, v21.16b
++ eor v27.16b, v27.16b, v21.16b
++ eor v28.16b, v28.16b, v21.16b
++ mov w3, v22.s[0] // move to gp reg
++ eor v29.16b, v29.16b, v21.16b
++ eor v30.16b, v30.16b, v21.16b
++ eor v31.16b, v31.16b, v21.16b
++ cmeq v24.4s, v24.4s, #0
++ cmeq v25.4s, v25.4s, #0
++ cmeq v26.4s, v26.4s, #0
++ cmeq v27.4s, v27.4s, #0
++ cbnz w3, 91f // a match starts within the 32 bytes in v0/v1: stop without storing them
++ st1 {v0.16b, v1.16b}, [x2], #32
++ subs w1, w1, #64 // 64 bytes handled per full iteration
++ b.pl 2b
++
++90: add w0, w1, #80 // undo the entry offset to give the number of bytes not copied
++ ret
++
++91: sub w1, w1, #32 // account for the 32 bytes stored by the first half of this iteration
++ b 90b
++endfunc
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder;
+ extern AVCodec ff_hevc_decoder;
+ extern AVCodec ff_hevc_qsv_decoder;
+ extern AVCodec ff_hevc_rkmpp_decoder;
++extern AVCodec ff_hevc_rpi_decoder;
+ extern AVCodec ff_hevc_v4l2m2m_decoder;
+ extern AVCodec ff_hnm4_video_decoder;
+ extern AVCodec ff_hq_hqa_decoder;
+@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_c
+ }
+ }
+
++// Report whether codec p lists fmt among its pixel formats.
++// A codec that publishes no pix_fmts list is assumed to accept anything.
++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
++{
++    const enum AVPixelFormat *cur;
++
++    if (!p->pix_fmts)
++        return 1;
++    if (fmt == AV_PIX_FMT_NONE)
++        return 0;
++
++    cur = p->pix_fmts;
++    while (*cur != AV_PIX_FMT_NONE && *cur != fmt)
++        cur++;
++    return *cur != AV_PIX_FMT_NONE;
++}
++
++// Find a decoder for id that supports fmt. The first experimental match is
++// held back and returned only if no later match turns up.
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
++{
++    const AVCodec *p, *experimental = NULL;
++    void *i = 0; // iteration state owned by av_codec_iterate()
++
++    id = remap_deprecated_codec_id(id);
++    while ((p = av_codec_iterate(&i))) { // advances by itself; no manual stepping
++        if (!av_codec_is_decoder(p) || p->id != id || !codec_supports_format(p, fmt))
++            continue;
++        if (!(p->capabilities & AV_CODEC_CAP_EXPERIMENTAL) || experimental)
++            return (AVCodec *)p;
++        experimental = p;
++    }
++    return (AVCodec *)experimental;
++}
++
+ static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
+ {
+ const AVCodec *p, *experimental = NULL;
+--- a/libavcodec/arm/Makefile
++++ b/libavcodec/arm/Makefile
+@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) +
+ arm/sbrdsp_init_arm.o
+ OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
+ OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
++ arm/rpi_hevcpred_init_arm.o
+ OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
+ OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
+ OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
+@@ -140,10 +142,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) +
+ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
+ NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
+ NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
++ arm/hevcdsp_idct_neon.o \
+ arm/hevcdsp_deblock_neon.o \
+ arm/hevcdsp_idct_neon.o \
+ arm/hevcdsp_qpel_neon.o \
+ arm/hevcdsp_sao_neon.o
++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \
++ arm/rpi_hevc_misc_neon.o \
++ arm/rpi_hevcdsp_deblock_neon.o \
++ arm/rpi_hevcdsp_idct_neon.o \
++ arm/rpi_hevcdsp_res8_neon.o \
++ arm/rpi_hevcdsp_res16_neon.o \
++ arm/rpi_hevcdsp_sao_neon.o \
++ arm/rpi_hevcpred_init_neon.o \
++ arm/rpi_hevcpred_intra_angular_neon.o \
++ arm/rpi_hevcpred_intra_dc_neon.o \
++ arm/rpi_hevcpred_intra_filter_neon.o \
++ arm/rpi_hevcpred_intra_hv_neon.o \
++ arm/rpi_hevcpred_intra_planar_neon.o
+ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
+ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
+ arm/rv40dsp_neon.o
+--- a/libavcodec/arm/cabac.h
++++ b/libavcodec/arm/cabac.h
+@@ -26,83 +26,209 @@
+ #include "libavutil/internal.h"
+ #include "libavcodec/cabac.h"
+
++
+ #define get_cabac_inline get_cabac_inline_arm
+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
+- uint8_t *const state)
++ uint8_t *state)
+ {
+- int bit;
+- void *reg_b, *reg_c, *tmp;
++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
++ int bit, ptr, low, tmp1, tmp2;
++ __asm__ volatile (
++ "ldr %[bit], [%[c], %[range_off]] \n\t"
++ "ldrb %[ptr], [%[state]] \n\t"
++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t"
++ "and %[tmp2], %[bit], #0xc0 \n\t"
++ "add %[tmp1], %[tmp1], %[ptr] \n\t"
++ "ldr %[low], [%[c], %[low_off]] \n\t"
++ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t"
++ "sub %[bit], %[bit], %[tmp2] \n\t"
++ "mov %[tmp1], %[bit] \n\t"
++ "cmp %[low], %[bit], lsl #17 \n\t"
++ "itt ge \n\t"
++ "movge %[tmp1], %[tmp2] \n\t"
++ "mvnge %[ptr], %[ptr] \n\t"
++ "clz %[tmp2], %[tmp1] \n\t"
++ "it ge \n\t"
++ "subge %[low], %[low], %[bit], lsl #17 \n\t"
++ "sub %[tmp2], %[tmp2], #23 \n\t"
++ "and %[bit], %[ptr], #1 \n\t"
++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
++ "lsl %[low], %[low], %[tmp2] \n\t"
++ "lsls %[ptr], %[low], #16 \n\t"
++ "bne 1f \n\t"
++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t"
++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t"
++#if UNCHECKED_BITSTREAM_READER
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "rbit %[state], %[low] \n\t"
++ "ldrh %[tmp1], [%[ptr]], #2 \n\t"
++#else
++ "ldr %[tmp1], [%[c], %[end_off]] \n\t"
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "rbit %[state], %[low] \n\t"
++ "cmp %[tmp1], %[ptr] \n\t"
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t"
++#else
++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t"
++#endif
++#endif
++ "clz %[state], %[state] \n\t"
++ "movw %[mlps_tables], #0xffff \n\t"
++ "sub %[state], %[state], #16 \n\t"
++ "str %[tmp2], [%[c], %[range_off]] \n\t"
++ "rev %[tmp1], %[tmp1] \n\t"
++ "str %[ptr], [%[c], %[ptr_off]] \n\t"
++ "lsr %[tmp1], %[tmp1], #15 \n\t"
++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t"
++#if CONFIG_THUMB
++ "lsl %[tmp1], %[tmp1], %[state] \n\t"
++ "add %[low], %[low], %[tmp1] \n\t"
++#else
++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t"
++#endif
++ "str %[low], [%[c], %[low_off]] \n\t"
++ "b 2f \n\t"
++ "1: \n\t"
++ "strb %[mlps_tables], [%[state]] \n\t"
++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t"
++ "str %[low], [%[c], %[low_off]] \n\t"
++ "str %[tmp1], [%[c], %[range_off]] \n\t"
++ "2: \n\t"
++ : // Outputs
++ [state]"+r"(state),
++ [mlps_tables]"+r"(mlps_tables),
++ [bit]"=&r"(bit),
++ [ptr]"=&r"(ptr),
++ [low]"=&r"(low),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end)),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : // Clobbers
++ "cc", "memory"
++ );
++ return bit;
++}
+
+- __asm__ volatile(
+- "ldrb %[bit] , [%[state]] \n\t"
+- "add %[r_b] , %[tables] , %[lps_off] \n\t"
+- "mov %[tmp] , %[range] \n\t"
+- "and %[range] , %[range] , #0xC0 \n\t"
+- "add %[r_b] , %[r_b] , %[bit] \n\t"
+- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
+- "add %[r_b] , %[tables] , %[norm_off] \n\t"
+- "sub %[r_c] , %[tmp] , %[range] \n\t"
+- "lsl %[tmp] , %[r_c] , #17 \n\t"
+- "cmp %[tmp] , %[low] \n\t"
+- "it gt \n\t"
+- "movgt %[range] , %[r_c] \n\t"
+- "itt cc \n\t"
+- "mvncc %[bit] , %[bit] \n\t"
+- "subcc %[low] , %[low] , %[tmp] \n\t"
+- "add %[r_c] , %[tables] , %[mlps_off] \n\t"
+- "ldrb %[tmp] , [%[r_b], %[range]] \n\t"
+- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
+- "lsl %[low] , %[low] , %[tmp] \n\t"
+- "lsl %[range] , %[range] , %[tmp] \n\t"
+- "uxth %[r_c] , %[low] \n\t"
+- "strb %[r_b] , [%[state]] \n\t"
+- "tst %[r_c] , %[r_c] \n\t"
+- "bne 2f \n\t"
+- "ldr %[r_c] , [%[c], %[byte]] \n\t"
++#define get_cabac_bypass get_cabac_bypass_arm
++static inline int get_cabac_bypass_arm(CABACContext * const c)
++{
++ uint32_t low = c->low, range, ptr, tmp;
++ int rv;
++ __asm volatile (
++ "ldr %[range] , [%[c], %[range_off]] \n\t"
++ "mov %[rv] , #0 \n\t"
++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
++ "lsl %[low] , #1 \n\t"
++#if !UNCHECKED_BITSTREAM_READER
++ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
++#endif
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "itt cs \n\t"
++ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
++ "movcs %[rv] , #1 \n\t"
+ #if UNCHECKED_BITSTREAM_READER
+- "ldrh %[tmp] , [%[r_c]] \n\t"
+- "add %[r_c] , %[r_c] , #2 \n\t"
+- "str %[r_c] , [%[c], %[byte]] \n\t"
+-#else
+- "ldr %[r_b] , [%[c], %[end]] \n\t"
+- "ldrh %[tmp] , [%[r_c]] \n\t"
+- "cmp %[r_c] , %[r_b] \n\t"
+- "itt lt \n\t"
+- "addlt %[r_c] , %[r_c] , #2 \n\t"
+- "strlt %[r_c] , [%[c], %[byte]] \n\t"
+-#endif
+- "sub %[r_c] , %[low] , #1 \n\t"
+- "add %[r_b] , %[tables] , %[norm_off] \n\t"
+- "eor %[r_c] , %[low] , %[r_c] \n\t"
+- "rev %[tmp] , %[tmp] \n\t"
+- "lsr %[r_c] , %[r_c] , #15 \n\t"
+- "lsr %[tmp] , %[tmp] , #15 \n\t"
+- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
+- "movw %[r_b] , #0xFFFF \n\t"
+- "sub %[tmp] , %[tmp] , %[r_b] \n\t"
+- "rsb %[r_c] , %[r_c] , #7 \n\t"
+- "lsl %[tmp] , %[tmp] , %[r_c] \n\t"
+- "add %[low] , %[low] , %[tmp] \n\t"
+- "2: \n\t"
+- : [bit]"=&r"(bit),
+- [low]"+&r"(c->low),
+- [range]"+&r"(c->range),
+- [r_b]"=&r"(reg_b),
+- [r_c]"=&r"(reg_c),
+- [tmp]"=&r"(tmp)
+- : [c]"r"(c),
+- [state]"r"(state),
+- [tables]"r"(ff_h264_cabac_tables),
+- [byte]"M"(offsetof(CABACContext, bytestream)),
+- [end]"M"(offsetof(CABACContext, bytestream_end)),
+- [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
+- [lps_off]"I"(H264_LPS_RANGE_OFFSET),
+- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
+- : "memory", "cc"
+- );
++ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
++#else
++ "cmp %[tmp] , %[ptr] \n\t"
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
++#else
++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
++#endif
++#endif
++ "lsls %[range] , %[low], #16 \n\t"
++ "bne 1f \n\t"
+
+- return bit & 1;
++ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
++ "rev %[tmp] , %[tmp] \n\t"
++ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
++ "movw %[tmp] , 0xFFFF \n\t"
++ "sub %[low] , %[tmp] \n\t"
++ "1: \n\t"
++ "str %[low] , [%[c], %[low_off]] \n\t"
++ : // Outputs
++ [rv]"=&r"(rv),
++ [low]"+r"(low),
++ [range]"=&r"(range),
++ [ptr]"=&r"(ptr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end))
++ : // Clobbers
++ "memory", "cc"
++ );
++ return rv;
+ }
++
++// Doubles c->low and compares it with c->range << 17: returns rv (after subtracting) if low was >=, else -rv; refills low from the bytestream when its low 16 bits are zero
++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
++{
++ uint32_t low = c->low, range, ptr, tmp;
++ __asm volatile (
++ "ldr %[range] , [%[c], %[range_off]] \n\t" // range = c->range
++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" // ptr = c->bytestream
++ "lsl %[low] , #1 \n\t" // low *= 2
++#if !UNCHECKED_BITSTREAM_READER
++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" // tmp = c->bytestream_end
++#endif
++ "cmp %[low] , %[range], lsl #17 \n\t" // unsigned compare of low with range << 17
++ "it cs \n\t"
++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" // low >= range << 17: subtract it, rv is kept
++ "it cc \n\t"
++ "rsbcc %[rv] , %[rv], #0 \n\t" // low < range << 17: rv = -rv
++#if UNCHECKED_BITSTREAM_READER
++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" // fetch the next 16 bits ahead of the refill test, ptr += 2
++#else
++ "cmp %[tmp] , %[ptr] \n\t" // only fetch while bytestream_end >= ptr
++#if CONFIG_THUMB
++ "it cs \n\t"
++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
++#else
++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
++#endif
++#endif
++ "lsls %[range] , %[low], #16 \n\t" // Z set if the low 16 bits of low are all zero
++ "bne 1f \n\t" // bits still available: skip the refill
++
++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" // commit the advanced bytestream pointer
++ "rev %[tmp] , %[tmp] \n\t" // byte-reverse: the fetched halfword moves to the top 16 bits
++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" // low += reversed value >> 15
++ "movw %[tmp] , 0xFFFF \n\t"
++ "sub %[low] , %[tmp] \n\t" // low -= 0xFFFF
++ "1: \n\t"
++ "str %[low] , [%[c], %[low_off]] \n\t" // c->low = low
++ : // Outputs
++ [rv]"+r"(rv),
++ [low]"+r"(low),
++ [range]"=&r"(range),
++ [ptr]"=&r"(ptr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [c]"r"(c),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end))
++ : // Clobbers
++ "memory", "cc"
++ );
++ return rv;
++}
++
+ #endif /* HAVE_ARMV6T2_INLINE */
+
+ #endif /* AVCODEC_ARM_CABAC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_cabac.h
+@@ -0,0 +1,607 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVC_CABAC_H
++#define AVCODEC_ARM_HEVC_CABAC_H
++
++#include "config.h"
++#if HAVE_ARMV6T2_INLINE
++
++#define hevc_mem_bits32 hevc_mem_bits32_arm
++// Fetch the 32 bits stored at byte offset (bits >> 3) from p, byte-swap them
++// with REV and shift the result left by the remaining (bits & 7) bits.
++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
++{
++    const uint32_t raw = *(const uint32_t *)((const uint8_t *)p + (bits >> 3));
++    unsigned int swapped;
++    __asm__ (
++        "rev %[out], %[in] \n\t"
++        : [out]"=r"(swapped) : [in]"r"(raw) : );
++    return swapped << (bits & 7);
++}
++
++
++// ---------------------------------------------------------------------------
++//
++// Helper fns - little bits of code where ARM has an instruction that the
++// compiler doesn't know about / use
++
++#define trans_scale_sat trans_scale_sat_arm
++// Multiply level by scale * scale_m, shift right by shift, then halve with
++// rounding ((x + 1) >> 1) and saturate to signed 16 bits in a single SSAT.
++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    const int factor = (int)(scale * scale_m);
++    const int biased = ((level * factor) >> shift) + 1;
++    int clipped;
++
++    __asm__ (
++        "ssat %[dst], #16, %[src], ASR #1 \n\t"
++        : [dst]"=r"(clipped) : [src]"r"(biased) : );
++    return clipped;
++}
++// Adjusts *stat_coeff from t = (last_coeff_abs_level_remaining << 1) >> c_rice_param: -1 if t == 0, +1 if t >= 6, result clamped to 0..255
++#define update_rice update_rice_arm
++static inline void update_rice_arm(uint8_t * const stat_coeff,
++ const unsigned int last_coeff_abs_level_remaining,
++ const unsigned int c_rice_param)
++{
++ int t = last_coeff_abs_level_remaining << 1;
++ __asm__ (
++ "lsrs %[t], %[t], %[shift] \n\t" // t >>= c_rice_param, Z set if the result is 0
++
++ "it eq \n\t"
++ "subeq %[stat], %[stat], #1 \n\t" // t == 0: stat -= 1
++ "cmp %[t], #6 \n\t" // carry set if t >= 6 (unsigned)
++ "adc %[stat], %[stat], #0 \n\t" // stat += carry
++ "usat %[stat], #8, %[stat] \n\t" // clamp stat to 0..255
++ : [stat]"+r"(*stat_coeff),
++ [t]"+r"(t)
++ : [shift]"r"(c_rice_param)
++ : "cc"
++ );
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC get loops
++//
++// Where the loop is simple enough we can normally do 10-30% better than the
++// compiler
++
++// Get the residual greater than 1 bits
++
++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
++ uint8_t * const state0)
++{
++ unsigned int i, reg_b, st, tmp, bit, rv;
++ __asm__ (
++ "mov %[i] , #0 \n\t" // i = number of flags decoded so far
++ "mov %[rv] , #0 \n\t" // rv = decoded bits; the first flag ends up most significant
++ "1: \n\t"
++ "add %[i] , %[i] , #1 \n\t"
++ "cmp %[rv] , #0 \n\t"
++ "ite eq \n\t"
++ "usateq %[st] , #2 , %[i] \n\t" // no 1 decoded yet: st = min(i, 3)
++ "movne %[st] , #0 \n\t" // otherwise st = 0
++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" // r_b = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET
++ "and %[tmp] , %[range] , #0xC0 \n\t"
++
++ "ldrb %[bit] , [%[state0], %[st]] \n\t" // bit = state0[st]
++ "add %[r_b] , %[r_b] , %[bit] \n\t"
++ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" // tmp = table entry for this state and (range & 0xC0)
++ "sub %[range] , %[range] , %[tmp] \n\t" // range -= tmp
++
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "ittt ge \n\t"
++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" // low >= range << 17: low -= range << 17,
++ "movge %[range] , %[tmp] \n\t" // range = table entry,
++ "mvnge %[bit] , %[bit] \n\t" // and the state value is inverted
++
++ "clz %[tmp] , %[range] \n\t"
++ "sub %[tmp] , #23 \n\t" // shift = clz(range) - 23
++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" // next state from the table at mlps_tables
++ "and %[bit] , %[bit] , #1 \n\t" // decoded bit
++ "strb %[r_b] , [%[state0], %[st]] \n\t" // state0[st] = next state
++ "lsl %[low] , %[low] , %[tmp] \n\t"
++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" // rv = (rv << 1) | bit
++ "lsl %[range] , %[range] , %[tmp] \n\t"
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++ "lsls %[tmp] , %[low] , #16 \n\t" // Z set if the low 16 bits of low are zero (reload needed)
++ "it ne \n\t"
++ "cmpne %[n] , %[i] \n\t"
++ "bne 1b \n\t"
++
++// If reload is not required then we must have run out of flags to decode
++ "tst %[tmp] , %[tmp] \n\t"
++ "bne 2f \n\t"
++
++// Do reload
++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" // fetch 16 bits, bptr += 2 (no end-of-buffer test in this sequence)
++ "rbit %[bit] , %[low] \n\t"
++ "movw %[r_b] , #0xFFFF \n\t"
++ "clz %[bit] , %[bit] \n\t" // bit = number of trailing zero bits in low
++ "rev %[tmp] , %[tmp] \n\t" // byte-reverse: the fetched halfword moves to the top 16 bits
++ "sub %[bit] , %[bit] , #16 \n\t"
++ "cmp %[n] , %[i] \n\t" // sets the flags tested by the final bne
++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" // tmp = (tmp >> 15) - 0xFFFF
++
++#if CONFIG_THUMB
++ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
++ "add %[low] , %[low] , %[tmp] \n\t"
++#else
++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
++#endif
++
++ "bne 1b \n\t" // loop again while i != n
++ "2: \n\t"
++ : [bit]"=&r"(bit),
++ [low]"+r"(c->low),
++ [range]"+r"(c->range),
++ [r_b]"=&r"(reg_b),
++ [bptr]"+r"(c->bytestream),
++ [i]"=&r"(i),
++ [tmp]"=&r"(tmp),
++ [st]"=&r"(st),
++ [rv]"=&r"(rv)
++ : [state0]"r"(state0),
++ [n]"r"(n),
++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : "memory", "cc"
++ );
++ return rv;
++}
++// Decode one significance bin per position n, n-1, ..., 1 using context index
++// ctx_map[position]; each position whose bin is 1 is stored as a byte at *p++.
++// n must be > 0 on entry
++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
++ unsigned int n,
++ const uint8_t * ctx_map,
++ uint8_t * p)
++{
++ unsigned int reg_b, tmp, st, bit;
++ __asm__ (
++// Get bin from map
++#if CONFIG_THUMB
++ "add %[ctx_map] , %[n] \n\t"
++ "ldrb %[st] , [%[ctx_map]] \n\t"
++#else
++ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" // ctx_map += n; st = *ctx_map
++#endif
++ "1: \n\t"
++
++// Load state & ranges
++ "ldrb %[bit] , [%[state0], %[st]] \n\t"
++ "and %[tmp] , %[range] , #0xC0 \n\t"
++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
++ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
++ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
++ "sub %[range] , %[range] , %[tmp] \n\t"
++
++ "cmp %[low] , %[range], lsl #17 \n\t"
++ "ittt ge \n\t"
++ "mvnge %[bit] , %[bit] \n\t"
++ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
++ "movge %[range] , %[tmp] \n\t"
++
++// Renorm
++ "clz %[tmp] , %[range] \n\t"
++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
++ "sub %[tmp] , #23 \n\t"
++ "strb %[r_b] , [%[state0], %[st]] \n\t" // write back the updated state
++ "tst %[bit] , #1 \n\t" // Z clear when the decoded bin is 1
++ "ldrb %[st] , [%[ctx_map], #-1]! \n\t" // step ctx_map back and fetch the next context index
++ "lsl %[low] , %[low] , %[tmp] \n\t"
++// GCC asm seems to need strbne written differently for thumb and arm
++#if CONFIG_THUMB
++ "it ne \n\t"
++ "strbne %[n] , [%[idx]] , #1 \n\t"
++#else
++ "strneb %[n] , [%[idx]] , #1 \n\t" // bin == 1: *p++ = (uint8_t)n
++#endif
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++ "subs %[n] , %[n] , #1 \n\t"
++ "lsl %[range] , %[range] , %[tmp] \n\t"
++#if CONFIG_THUMB
++ "itt ne \n\t"
++ "lslsne %[tmp] , %[low] , #16 \n\t"
++#else
++ "lslnes %[tmp] , %[low] , #16 \n\t" // only when n != 0: Z set if a reload is needed
++#endif
++ "bne 1b \n\t"
++
++// If we have bits left then n must be 0 so give up now
++ "lsls %[tmp] , %[low] , #16 \n\t"
++ "bne 2f \n\t"
++
++// Do reload
++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" // fetch 2 bytes and advance c->bytestream
++ "rbit %[bit] , %[low] \n\t"
++ "movw %[r_b] , #0xFFFF \n\t"
++ "clz %[bit] , %[bit] \n\t"
++ "cmp %[n] , #0 \n\t" // flags for the final bne; nothing below sets flags
++ "rev %[tmp] , %[tmp] \n\t"
++ "sub %[bit] , %[bit] , #16 \n\t"
++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
++
++#if CONFIG_THUMB
++ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
++ "add %[low] , %[low] , %[tmp] \n\t"
++#else
++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
++#endif
++
++// Check to see if we still have more to do
++ "bne 1b \n\t"
++ "2: \n\t"
++ : [bit]"=&r"(bit),
++ [low]"+r"(c->low),
++ [range]"+r"(c->range),
++ [r_b]"=&r"(reg_b),
++ [bptr]"+r"(c->bytestream),
++ [idx]"+r"(p),
++ [n]"+r"(n),
++ [tmp]"=&r"(tmp),
++ [st]"=&r"(st),
++ [ctx_map]"+r"(ctx_map)
++ : [state0]"r"(state0),
++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++ : "memory", "cc"
++ );
++
++ return p;
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC_BY22 functions
++
++// Set up the by22 fields of c (by22.bits, by22.range) and rewrite c->low / c->range for by22 decoding
++#define get_cabac_by22_start get_cabac_by22_start_arm
++static inline void get_cabac_by22_start_arm(CABACContext * const c)
++{
++ const uint8_t *ptr = c->bytestream;
++ register uint32_t low __asm__("r1"), range __asm__("r2"); // presumably pinned so the ldmia register list is ascending
++ uint32_t m, range8, bits;
++#if !USE_BY22_DIV
++ uintptr_t inv;
++#endif
++
++ av_assert2(offsetof (CABACContext, low) == 0);
++ av_assert2(offsetof (CABACContext, range) == 4);
++ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2);
++ __asm__ volatile (
++ "ldmia %[c], {%[low], %[range]} \n\t" // low = c->low, range = c->range (layout asserted above)
++ : // Outputs
++ [low]"=r"(low),
++ [range]"=r"(range)
++ : // Inputs
++ [c]"r"(c)
++ : // Clobbers
++ );
++#if !USE_BY22_DIV
++ inv = (uintptr_t)cabac_by22_inv_range;
++#endif
++ __asm__ volatile (
++ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" // m = 32 bits at ptr; then ptr -= CABAC_BITS/8
++#if !USE_BY22_DIV
++ "uxtb %[range8], %[range] \n\t" // range8 = range & 0xff
++#endif
++ "rbit %[bits], %[low] \n\t"
++ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
++ "clz %[bits], %[bits] \n\t" // bits = number of trailing zero bits in the original low
++ "str %[ptr], [%[c], %[ptr_off]] \n\t" // c->bytestream = ptr
++ "rev %[m], %[m] \n\t"
++ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" // ptr register reused as a shift count from here on
++ "eor %[m], %[m], #0x80000000 \n\t"
++#if !USE_BY22_DIV
++ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" // inv = cabac_by22_inv_range[range8] (4-byte entries)
++ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" // range = (bits & 0xffff) | (range << 16)
++ "str %[range], [%[c], %[bits_off]] \n\t" // one store writes by22.bits and by22.range (adjacent, asserted above)
++#else
++ "strh %[bits], [%[c], %[bits_off]] \n\t"
++#endif
++#if CONFIG_THUMB
++ "lsr %[m], %[ptr] \n\t"
++ "eor %[range], %[low], %[m] \n\t"
++#else
++ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" // range register now holds the new low value
++#endif
++ : // Outputs
++ [ptr]"+&r"(ptr),
++ [low]"+&r"(low),
++ [range]"+&r"(range),
++#if !USE_BY22_DIV
++ [inv]"+&r"(inv),
++#endif
++ [m]"=&r"(m),
++ [range8]"=&r"(range8),
++ [bits]"=&r"(bits)
++ : // Inputs
++ [c]"r"(c),
++ [bits_off]"J"(offsetof (CABACContext, by22.bits)),
++ [ptr_off]"J"(offsetof (CABACContext, bytestream))
++ : // Clobbers
++ "memory"
++ );
++ c->low = range;
++#if !USE_BY22_DIV
++ c->range = inv; // in by22 mode c->range holds the looked-up inverse, not the range
++#endif
++}
++// Returns x << 1 where x = c->low & ~1, replaced by the high 32 bits of x * c->range when c->range != 0
++#define get_cabac_by22_peek get_cabac_by22_peek_arm
++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
++{
++ uint32_t rv = c->low &~ 1, tmp;
++ __asm__ (
++ "cmp %[inv] , #0 \n\t"
++ "it ne \n\t"
++ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" // rv = (inv * rv) >> 32; the low half in tmp is discarded
++ : // Outputs
++ [rv]"+r"(rv),
++ [tmp]"=r"(tmp)
++ : // Inputs
++ [inv]"r"(c->range)
++ : // Clobbers
++ "cc"
++ );
++ return rv << 1;
++}
++// Advance by22.bits by n and recompute c->low; val is presumably the value from get_cabac_by22_peek (TODO confirm)
++#define get_cabac_by22_flush get_cabac_by22_flush_arm
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
++{
++ uint32_t bits, ptr, tmp1, tmp2;
++ __asm__ volatile (
++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t"
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "rsb %[tmp1], %[n], #32 \n\t"
++ "add %[bits], %[bits], %[n] \n\t" // bits += n
++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" // tmp2 = by22.range
++ "lsr %[tmp1], %[val], %[tmp1] \n\t" // tmp1 = top n bits of val
++ "ldr %[val], [%[cc], %[low_off]] \n\t" // val register reused to hold c->low
++#if CONFIG_THUMB
++ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t"
++ "ldr %[ptr], [%[ptr]] \n\t"
++#else
++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" // 32 bits from bytestream + bits / 8
++#endif
++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t"
++ "and %[tmp2], %[bits], #7 \n\t"
++ "strh %[bits], [%[cc], %[bits_off]] \n\t" // store the advanced bit position
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[tmp1], %[tmp1], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[val], %[n] \n\t"
++ "sub %[val], %[tmp1] \n\t"
++#else
++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" // low = (low << n) - ((top bits * by22.range) << 23)
++#endif
++ "lsl %[ptr], %[ptr], %[tmp2] \n\t"
++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" // OR in the newly fetched stream bits
++ "str %[val], [%[cc], %[low_off]] \n\t"
++ : // Outputs
++ [val]"+r"(val),
++ [bits]"=&r"(bits),
++ [ptr]"=&r"(ptr),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [cc]"r"(c),
++ [n]"r"(n),
++ [bits_off]"J"(offsetof(CABACContext, by22.bits)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++ [range_off]"J"(offsetof(CABACContext, by22.range)),
++ [low_off]"J"(offsetof(CABACContext, low))
++ : // Clobbers
++ "memory"
++ );
++}
++// Decode one value from by22 bypass bits using rice_param; updates by22.bits and c->low. Three paths, chosen by prefix length.
++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
++{
++ uint32_t last_coeff_abs_level_remaining;
++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
++ __asm__ volatile (
++ "ldr %[remain], [%[cc], %[low_off]] \n\t"
++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" // c->range (multiplier, skipped when zero)
++ "bic %[remain], %[remain], #1 \n\t"
++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "cmp %[prefix], #0 \n\t"
++ "it ne \n\t"
++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" // remain = high 32 bits of the product
++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t"
++ "lsl %[remain], %[remain], #1 \n\t"
++ "mvn %[prefix], %[remain] \n\t"
++ "clz %[prefix], %[prefix] \n\t" // prefix = number of leading 1 bits in remain
++ "rsbs %[n1], %[prefix], #2 \n\t" // C set iff prefix <= 2
++ "bcc 1f \n\t" // prefix > 2: longer code, handled at 1:
++ "adc %[n1], %[rice], %[prefix] \n\t" // n1 = rice + prefix + 1 (C is set on this path)
++ "add %[tmp2], %[tmp2], %[n1] \n\t"
++ "rsb %[n2], %[n1], #32 \n\t"
++ "and %[tmp1], %[tmp2], #7 \n\t"
++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" // by22.bits += n1
++ "lsr %[tmp2], %[tmp2], #3 \n\t"
++ "lsr %[n2], %[remain], %[n2] \n\t"
++ "mul %[n2], %[range], %[n2] \n\t"
++ "ldr %[range], [%[cc], %[low_off]] \n\t"
++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t"
++ "rsb %[tmp2], %[rice], #31 \n\t"
++ "lsl %[remain], %[remain], %[prefix] \n\t"
++ "lsl %[n2], %[n2], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[range], %[n1] \n\t"
++ "sub %[range], %[n2] \n\t"
++#else
++ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t"
++#endif
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[n2], %[prefix], %[rice] \n\t" // n2 = prefix << rice
++#if CONFIG_THUMB
++ "lsr %[remain], %[tmp2] \n\t"
++ "add %[remain], %[n2] \n\t"
++#else
++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t"
++#endif
++ "b 3f \n\t"
++ "1: \n\t"
++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" // n2 = rice + 2 * prefix
++ "cmp %[n2], %[peek_bits_plus_2] \n\t"
++ "bhi 2f \n\t" // n2 > CABAC_BY22_PEEK_BITS + 2: two-stage path at 2:
++ "sub %[n1], %[n2], #2 \n\t"
++ "add %[tmp2], %[tmp2], %[n1] \n\t"
++ "rsb %[n2], %[n1], #32 \n\t"
++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" // by22.bits += n1
++ "lsr %[tmp1], %[tmp2], #3 \n\t"
++ "lsr %[n2], %[remain], %[n2] \n\t"
++ "mul %[n2], %[range], %[n2] \n\t"
++ "rsb %[range], %[rice], #34 \n\t"
++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t"
++ "and %[tmp1], %[tmp2], #7 \n\t"
++ "lsl %[remain], %[remain], %[prefix] \n\t"
++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t"
++ "rsb %[prefix], %[prefix], %[range] \n\t"
++ "orr %[remain], %[remain], #0x80000000 \n\t"
++ "rev %[ptr], %[ptr] \n\t"
++ "lsl %[n2], %[n2], #23 \n\t"
++ "mov %[range], #2 \n\t"
++#if CONFIG_THUMB
++ "lsl %[tmp2], %[n1] \n\t"
++ "sub %[tmp2], %[n2] \n\t"
++#else
++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t"
++#endif
++ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
++ "lsl %[rice], %[range], %[rice] \n\t" // rice = 2 << rice
++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" // range = new low value, stored at 4:
++#if CONFIG_THUMB
++ "lsr %[remain], %[prefix] \n\t"
++ "add %[remain], %[rice] \n\t"
++#else
++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
++#endif
++ "b 4f \n\t"
++ "2: \n\t"
++ "add %[n1], %[tmp2], %[prefix] \n\t" // n1 = by22.bits + prefix
++#if CONFIG_THUMB
++ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t"
++ "ldr %[tmp2], [%[tmp2]] \n\t"
++#else
++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t"
++#endif
++ "rsb %[tmp1], %[prefix], #32 \n\t"
++ "push {%[rice]} \n\t" // rice is used as scratch below; restored by the pop
++ "and %[rice], %[n1], #7 \n\t"
++ "lsr %[tmp1], %[remain], %[tmp1] \n\t"
++ "ldr %[ptr], [%[cc], %[low_off]] \n\t"
++ "mul %[remain], %[range], %[tmp1] \n\t"
++ "rev %[tmp2], %[tmp2] \n\t"
++ "rsb %[n2], %[prefix], %[n2] \n\t"
++ "ldr %[tmp1], [%[cc], %[range_off]] \n\t"
++ "lsl %[rice], %[tmp2], %[rice] \n\t"
++ "sub %[tmp2], %[n2], #2 \n\t"
++ "lsl %[remain], %[remain], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[ptr], %[prefix] \n\t"
++ "rsb %[remain], %[ptr] \n\t"
++#else
++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t"
++#endif
++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" // remain = low after consuming the prefix bits
++ "add %[prefix], %[n1], %[tmp2] \n\t"
++ "bic %[n1], %[remain], #1 \n\t"
++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
++ "cmp %[tmp1], #0 \n\t"
++ "rsb %[rice], %[tmp2], #32 \n\t"
++ "it ne \n\t"
++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" // second multiply by c->range, as at the top
++ "and %[tmp1], %[prefix], #7 \n\t"
++#if CONFIG_THUMB
++ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t"
++ "ldr %[ptr], [%[ptr]] \n\t"
++#else
++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t"
++#endif
++ "lsl %[n1], %[n1], #1 \n\t"
++ "lsr %[rice], %[n1], %[rice] \n\t"
++ "rsb %[n2], %[n2], #34 \n\t"
++ "mul %[range], %[range], %[rice] \n\t"
++ "pop {%[rice]} \n\t" // restore the caller's rice value
++ "rev %[ptr], %[ptr] \n\t"
++ "orr %[n1], %[n1], #0x80000000 \n\t"
++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" // store the final by22.bits
++ "mov %[prefix], #2 \n\t"
++ "lsl %[range], %[range], #23 \n\t"
++#if CONFIG_THUMB
++ "lsl %[remain], %[tmp2] \n\t"
++ "rsb %[range], %[remain] \n\t"
++#else
++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t"
++#endif
++ "lsl %[remain], %[prefix], %[rice] \n\t" // remain = 2 << rice
++#if CONFIG_THUMB
++ "lsr %[n1], %[n2] \n\t"
++ "add %[remain], %[n1] \n\t"
++#else
++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t"
++#endif
++ "3: \n\t"
++ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" // OR in the newly fetched stream bits
++ "4: \n\t"
++ "str %[range], [%[cc], %[low_off]] \n\t" // c->low = range register
++ : // Outputs
++ [remain]"=&r"(last_coeff_abs_level_remaining),
++ [rice]"+r"(rice_param),
++ [prefix]"=&r"(prefix),
++ [n1]"=&r"(n1),
++ [range]"=&r"(range),
++ [n2]"=&r"(n2),
++ [ptr]"=&r"(ptr),
++ [tmp1]"=&r"(tmp1),
++ [tmp2]"=&r"(tmp2)
++ : // Inputs
++ [cc]"r"(c),
++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
++ [low_off]"J"(offsetof(CABACContext, low)),
++ [range_off]"J"(offsetof(CABACContext, range)),
++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
++ [ptr_off]"J"(offsetof(CABACContext, bytestream))
++ : // Clobbers
++ "cc", "memory"
++ );
++ return last_coeff_abs_level_remaining;
++}
++
++#endif /* HAVE_ARMV6T2_INLINE */
++
++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
+@@ -0,0 +1,183 @@
++/*
++ * ARM NEON optimised IDCT functions for HEVC decoding
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++@ Included multiple times from hevc_idct_neon.S
++@ Macros defined there
++
++#define DC_SHIFT (15 - BIT_DEPTH)
++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
++#define TRN_SHIFT (20 - BIT_DEPTH)
++@ DC-only 4x4: r0 = coeffs; all 16 become (coeffs[0] + DC_ADD) >> DC_SHIFT
++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
++ ldrsh r1, [r0] @ r1 = signed 16-bit DC coefficient
++ add r1, #DC_ADD
++ asr r1, #DC_SHIFT
++ vdup.16 q0, r1
++ vdup.16 q1, r1
++ vst1.16 {q0, q1}, [r0] @ 32 bytes = 16 coefficients
++ bx lr
++endfunc
++@ DC-only 8x8: r0 = coeffs; all 64 become (coeffs[0] + DC_ADD) >> DC_SHIFT
++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
++ ldrsh r1, [r0]
++ add r2, r0, #32 @ r2 = second pointer, 32 bytes ahead of r0
++ mov r3, #64 @ both pointers step by 64 bytes
++ add r1, #DC_ADD
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1
++ vst1.16 {q8, q9}, [r0], r3
++ vst1.16 {q8, q9}, [r2], r3
++ vst1.16 {q8, q9}, [r0]
++ vst1.16 {q8, q9}, [r2] @ 4 stores x 32 bytes = 128 bytes in total
++ bx lr
++endfunc
++@ DC-only 16x16: r0 = coeffs; all 256 become (coeffs[0] + DC_ADD) >> DC_SHIFT
++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
++ ldrsh r1, [r0]
++ add r2, r0, #32
++ mov r3, #64
++ add r1, #DC_ADD
++ mov ip, #16*16 @ ip = coefficients still to write
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1
++1: vst1.16 {q8, q9}, [r0], r3
++ subs ip, ip, #32 @ each iteration writes 2 x 32 bytes = 32 coefficients
++ vst1.16 {q8, q9}, [r2], r3
++ bhi 1b
++ bx lr
++endfunc
++@ DC-only 32x32: r0 = coeffs; all 1024 become (coeffs[0] + DC_ADD) >> DC_SHIFT
++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
++ ldrsh r1, [r0]
++ add r2, r0, #32
++ mov r3, #64
++ add r1, #DC_ADD
++ mov ip, #32*32 @ ip = coefficients still to write
++ asr r1, #DC_SHIFT
++ vdup.16 q8, r1
++ vdup.16 q9, r1
++1: vst1.16 {q8, q9}, [r0], r3
++ subs ip, ip, #32 @ each iteration writes 2 x 32 bytes = 32 coefficients
++ vst1.16 {q8, q9}, [r2], r3
++ bhi 1b
++ bx lr
++endfunc
++@ 4x4 transform in place: r0 = 16 coeffs, 32-byte aligned (:256).
++@ Two tr4_shift passes (macro defined by the including file) with a vzip stage between.
++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
++ vldr.i32 s0, =0x00240053 // 36 and 83
++ vld1.16 {q14, q15}, [r0 :256] // coeffs
++
++ tr4_shift #7
++
++ vzip.16 d28, d29
++ vzip.16 d30, d31
++ vzip.32 q14, q15
++
++ tr4_shift #TRN_SHIFT
++
++ vst4.16 {q14, q15}, [r0 :256]
++ bx lr
++
++ .ltorg
++endfunc
++@ Luma 4x4 transform in place: r0 = 16 coeffs, 32-byte aligned (:256).
++@ Two tr4_luma_shift passes (macro defined by the including file) with a vzip
++@ stage between; constants 74, 29 and 55 are preloaded in d0-d2.
++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
++ vmov.i32 d0, #0x4a // 74
++ vld1.16 {q14, q15}, [r0 :256] // coeffs
++ vmov.i32 d1, #0x1d // 29
++ vmov.i32 d2, #0x37 // 55
++
++ tr4_luma_shift #7
++
++ vzip.16 d28, d29
++ vzip.16 d30, d31
++ vzip.32 q14, q15
++
++ tr4_luma_shift #TRN_SHIFT
++
++ vst4.16 {q14, q15}, [r0 :256]
++ bx lr
++endfunc
++@ 8x8 transform: r0 = coeffs; r1 is compared with 4 to skip the second vertical pass (presumably a column limit - TODO confirm)
++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
++ add r2, r0, #16
++ adr r3, tr4f @ tr4f: constant table defined outside this file
++ vpush {d8-d15} @ saved here, restored by the vpop before return
++ vld1.16 {d0, d1}, [r3]
++ mov r3, #32
++
++ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \
++ "sub r0, r0, #128-8", \
++ "sub r2, r2, #128-8", \
++ "cmp r1, #4"
++ ble 2f @ r1 <= 4: zero q10/q11/q14/q15 at 2: instead of the second pass
++
++ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
++ "sub r0, r0, #128+8", \
++ "sub r2, r2, #128+8+16-32", \
++ "mov r3, #64"
++
++ vzip.16 d16, d17
++ vzip.16 d18, d19
++
++ vzip.16 d20, d21
++ vzip.16 d22, d23
++ vzip.16 d28, d29
++ vzip.16 d30, d31
++ vzip.32 q10, q11
++ vzip.32 q14, q15
++1:
++ vzip.16 d24, d25
++ vzip.16 d26, d27
++ vzip.32 q8, q9
++ vzip.32 q12, q13
++
++ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT
++ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
++
++ vpop {d8-d15}
++ bx lr
++
++2: vmov.i64 q10, #0
++ sub r0, r0, #8
++ vmov.i64 q11, #0
++ sub r2, r2, #8+16-32
++ vmov.i64 q14, #0
++ mov r3, #64
++ vmov.i64 q15, #0
++
++ vzip.16 d16, d17
++ vzip.16 d18, d19
++
++ b 1b @ rejoin the common path with the second half zeroed
++
++endfunc
++
++#undef DC_SHIFT
++#undef DC_ADD
++#undef TRN_SHIFT
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.S
+@@ -0,0 +1,267 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ rpi_zap_coeff_vals_neon(
++@ uint16_t * buf, [r0]
++@ unsigned int log_n_m2) [r1]
++@ Zeroes 32 << (2 * log_n_m2) bytes at buf; buf must be 32-byte aligned (:256)
++function rpi_zap_coeff_vals_neon, export=1
++ mov ip, #1
++ vmov.i64 q0, #0
++ teq r1, #0
++ vmov.i64 q1, #0
++ beq 2f @ log_n_m2 == 0: a single 32-byte store at 2:
++
++ lsl ip, r1 @ 2, 4 or 8
++ add r2, r0, #32
++ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero
++ mov r3, #64
++1: vst1.8 {q0,q1}, [r0:256], r3
++ subs ip, #2 @ two 32-byte blocks are written per iteration
++ vst1.8 {q0,q1}, [r2:256], r3
++ bne 1b
++ bx lr
++
++2: vst1.8 {q0,q1}, [r0:256]
++ bx lr
++endfunc
++
++@ PIC jump tables are more expensive than absolute for A32 code
++.set jent_pic, CONFIG_PIC || CONFIG_THUMB
++
++@ Jump table entry - if in Thumb mode the bottom bit of an absolute address must be set
++@ ? There is probably a real asm instruction to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++T .short ((0 + \lab) - (0 + 98b)) / 2 @ halfword offset from table label 98 (consumed by tbh)
++A .short (0 + \lab) - (4 + 98b) @ byte offset from 98, less 4
++.else
++T .word 1 + \lab @ absolute address with bit 0 set
++A .word \lab @ absolute address
++.endif
++.endm
++
++.set expected_next, 0
++@ Emits label \val: copies width \val as a \p1-wide block (bl 100\p1) followed by the \p2-wide code (branch, or fall through if drop_thru)
++.macro cpy_compound val, p1, p2, drop_thru=0
++.if \p1 + \p2 != \val
++.error "Bad addition! \p1 + \p2 != \val"
++.endif
++.if expected_next != 0 && expected_next != \val
++.error "Drop thru failure"
++.endif
++\val\():
++ push {r0-r3}
++ bl 100\p1\()b
++ pop {r0-r3}
++ add r0, #\p1
++ add r2, #\p1
++.if \drop_thru == 0
++ b \p2\()b
++.set expected_next, 0
++.else
++.set expected_next, \p2
++.endif
++.endm
++
++@ ff_hevc_rpi_cpy_blks8x4_neon(
++@ dst [r0]
++@ dst_stride [r1]
++@ src [r2]
++@ src_stride [r3]
++@ width [sp, #0] (bytes)
++@ height) [sp, #4]
++@
++@ Power of 2 widths are directly coded, all others are done in stripes
++@ We expect the vast majority of calls to be power of 2
++@
++@ Currently has min width of 8, but we could make that 4 without issue
++@ Min height is 4
++@ Entry: dispatch on width / 8 through the jump table at 98 to a width-specific copy loop
++function ff_hevc_rpi_cpy_blks8x4_neon, export=1
++ ldr r12, [sp, #0] @ r12 = width (bytes)
++ push {r11, lr}
++.if jent_pic
++A adr lr, 98f - 2
++.else
++A adr lr, 98f - 4
++.endif
++ lsr r12, #3 @ r12 = width / 8 = jump table index (1..16)
++ ldr r11, [sp, #(8 + 4)] @ r11 = height (the push moved sp down by 8)
++.if jent_pic
++A lsl r12, #1
++A ldrsh lr, [lr, r12]
++A add pc, lr
++T tbh [pc, r12, lsl #1]
++.else
++ @ A32 only, Thumb is always PIC
++ ldr pc, [lr, r12, lsl #2]
++.endif
++
++98:
++T .short 0 @ unused
++ jent 8f
++ jent 16f
++ jent 24f
++ jent 32f
++ jent 40f
++ jent 48f
++ jent 56f
++ jent 64f
++ jent 72f
++ jent 80f
++ jent 88f
++ jent 96f
++ jent 104f
++ jent 112f
++ jent 120f
++ jent 128f
++
++1008: @ 100NN labels: entry used by cpy_compound's bl
++ push {r11, lr}
++8:
++ add lr, r2, r3 @ lr = src row 1; r2/lr then step by two rows
++ lsl r3, #1
++ add r12, r0, r1 @ r12 = dst row 1; r0/r12 then step by two rows
++ lsl r1, #1
++1:
++ vld1.32 {d0 }, [r2], r3
++ vld1.32 {d1 }, [lr], r3
++ vld1.32 {d2 }, [r2], r3
++ vld1.32 {d3 }, [lr], r3
++ subs r11, #4 @ four rows per iteration
++ vst1.32 {d0 }, [r0], r1
++ vst1.32 {d1 }, [r12], r1
++ vst1.32 {d2 }, [r0], r1
++ vst1.32 {d3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10016:
++ push {r11, lr}
++16:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q0 }, [r2], r3
++ vld1.32 {q1 }, [lr], r3
++ vld1.32 {q2 }, [r2], r3
++ vld1.32 {q3 }, [lr], r3
++ subs r11, #4 @ four rows per iteration
++ vst1.32 {q0 }, [r0], r1
++ vst1.32 {q1 }, [r12], r1
++ vst1.32 {q2 }, [r0], r1
++ vst1.32 {q3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10032:
++ push {r11, lr}
++32:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #4 @ four rows per iteration
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10064:
++ push {r11, lr}
++64:
++ add lr, r2, #32 @ lr = right half of the same src row
++ add r12, r0, #32 @ r12 = right half of the same dst row
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #2 @ two rows per iteration
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++128:
++ push {r4, r5}
++ @ We could do this with fewer registers if we jump around but I
++ @ have a primitive urge to load sequentially
++ mov r4, #64
++ add lr, r2, #32
++ add r12, r0, #32
++ sub r3, r4 @ strides less the 64 bytes already stepped within a row
++ sub r1, r4
++1:
++ vld1.32 {q8, q9 }, [r2], r4
++ vld1.32 {q10, q11}, [lr], r4
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #1 @ one row per iteration
++ vst1.32 {q8, q9 }, [r0], r4
++ vst1.32 {q10, q11}, [r12], r4
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r4, r5, r11, pc} @ also pops the r11/lr pushed at function entry
++
++@ Use drop_thru where we can
++cpy_compound 104, 64, 40, 1
++cpy_compound 40, 32, 8
++
++cpy_compound 112, 64, 48, 1
++cpy_compound 48, 32, 16
++
++cpy_compound 120, 64, 56, 1
++cpy_compound 56, 32, 24, 1
++cpy_compound 24, 16, 8
++
++cpy_compound 72, 64, 8
++cpy_compound 80, 64, 16
++cpy_compound 88, 64, 24
++cpy_compound 96, 64, 32
++
++
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.h
+@@ -0,0 +1,438 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
++#define AVCODEC_ARM_RPI_HEVC_MISC_H
++
++#include "config.h"
++#if HAVE_NEON_INLINE && !CONFIG_THUMB
++// Gathers one pixel from each of `height` rows of src (stride_src apart) into a contiguous run at dst
++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_src)
++{
++ const uint8_t *src2 = src + stride_src; // src walks even rows, src2 odd rows
++ stride_src <<= 1; // so each pointer steps two rows at a time
++ switch (pixel_shift) // pixel size: 2 -> 32 bits, 1 -> 16 bits, anything else -> 8 bits
++ {
++ // NOTE(review): all three asm blocks write d0-d3 but list only "cc" and "memory" as clobbers - confirm this is safe
++ case 2: // height must be a non-zero multiple of 4, or the beq/bne exits are never taken
++ __asm__ volatile (
++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t" // loop alternates q0/q1: one is stored while the other is loaded
++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {q0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {q1}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vst1.32 {q0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vst1.32 {q1}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ case 1: // height must be a non-zero multiple of 4
++ __asm__ volatile (
++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #4 \n\t"
++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.16 d0, d1 \n\t" // interleave even rows (d0) with odd rows (d1) into row order
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.16 d2, d3 \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d2}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vzip.16 d0, d1 \n\t"
++ "vst1.16 {d0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vzip.16 d2, d3 \n\t"
++ "vst1.16 {d2}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ default: // height must be a non-zero multiple of 8
++ __asm__ volatile (
++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
++ "subs %[height], #8 \n\t"
++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.8 d0, d1 \n\t" // interleave even rows (d0) with odd rows (d1) into row order
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d0}, [%[dst]]! \n\t"
++ "beq 3f \n\t"
++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
++ "vzip.8 d2, d3 \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d2}, [%[dst]]! \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vzip.8 d0, d1 \n\t"
++ "vst1.8 {d0}, [%[dst]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vzip.8 d2, d3 \n\t"
++ "vst1.8 {d2}, [%[dst]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [src]"+r"(src),
++ [src2]"+r"(src2),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_dst)
++{
++ uint8_t *dst2 = dst + stride_dst;
++ stride_dst <<= 1;
++ switch (pixel_shift)
++ {
++ case 2:
++ __asm__ volatile (
++ "subs %[height], #4 \n\t"
++ "vld1.32 {q0}, [%[src]]! \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.32 {q1}, [%[src]]! \n\t"
++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t"
++ "vld1.32 {q0}, [%[src]]! \n\t"
++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d1[0]}, [%[dst]] \n\t"
++ "vst1.32 {d1[1]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.32 {d3[0]}, [%[dst]] \n\t"
++ "vst1.32 {d3[1]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ case 1:
++ __asm__ volatile (
++ "subs %[height], #4 \n\t"
++ "vld1.16 {d0}, [%[src]]! \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.16 {d2}, [%[src]]! \n\t"
++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t"
++ "vld1.16 {d0}, [%[src]]! \n\t"
++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #4 \n\t"
++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d0[2]}, [%[dst]] \n\t"
++ "vst1.16 {d0[3]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.16 {d2[2]}, [%[dst]] \n\t"
++ "vst1.16 {d2[3]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ default:
++ __asm__ volatile (
++ "subs %[height], #8 \n\t"
++ "vld1.8 {d0}, [%[src]]! \n\t"
++ "beq 2f \n\t"
++ "1: \n\t"
++ "vld1.8 {d2}, [%[src]]! \n\t"
++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
++ "beq 3f \n\t"
++ "vld1.8 {d0}, [%[src]]! \n\t"
++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t"
++ "subs %[height], #8 \n\t"
++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "2: \n\t"
++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d0[6]}, [%[dst]] \n\t"
++ "vst1.8 {d0[7]}, [%[dst2]] \n\t"
++ "b 4f \n\t"
++ "3: \n\t"
++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++ "vst1.8 {d2[6]}, [%[dst]] \n\t"
++ "vst1.8 {d2[7]}, [%[dst2]] \n\t"
++ "4: \n\t"
++ : // Outputs
++ [dst]"+r"(dst),
++ [dst2]"+r"(dst2),
++ [src]"+r"(src),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{ // Copies height elements of (1 << pixel_shift) bytes each from a strided src to a strided dst
++ int x, y; // scratch registers holding the two elements in flight
++ switch (pixel_shift)
++ {
++ case 2: // 32-bit elements
++ __asm__ volatile (
++ "ldr %[x], [%[src]], %[stride_src] \n\t" // prologue: load elements 0 and 1
++ "ldr %[y], [%[src]], %[stride_src] \n\t"
++ "str %[x], [%[dst]], %[stride_dst] \n\t" // store element 0
++ "sub %[height], #2 \n\t" // account for the two elements already loaded
++ "1: \n\t" // loop body handles two elements per pass, loads running ahead of stores
++ "ldr %[x], [%[src]], %[stride_src] \n\t"
++ "str %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldr %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t" // NOTE(review): loop exits only when the count hits exactly 0, so height looks required to be even and >= 4 - confirm with callers
++ "str %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "str %[y], [%[dst]] \n\t" // epilogue: store the final element
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ case 1: // 16-bit elements, same schedule as above with halfword accesses
++ __asm__ volatile (
++ "ldrh %[x], [%[src]], %[stride_src] \n\t"
++ "ldrh %[y], [%[src]], %[stride_src] \n\t"
++ "strh %[x], [%[dst]], %[stride_dst] \n\t"
++ "sub %[height], #2 \n\t"
++ "1: \n\t"
++ "ldrh %[x], [%[src]], %[stride_src] \n\t"
++ "strh %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldrh %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t"
++ "strh %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "strh %[y], [%[dst]] \n\t"
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ default: // any other pixel_shift is handled as 8-bit elements, same schedule with byte accesses
++ __asm__ volatile (
++ "ldrb %[x], [%[src]], %[stride_src] \n\t"
++ "ldrb %[y], [%[src]], %[stride_src] \n\t"
++ "strb %[x], [%[dst]], %[stride_dst] \n\t"
++ "sub %[height], #2 \n\t"
++ "1: \n\t"
++ "ldrb %[x], [%[src]], %[stride_src] \n\t"
++ "strb %[y], [%[dst]], %[stride_dst] \n\t"
++ "ldrb %[y], [%[src]], %[stride_src] \n\t"
++ "subs %[height], #2 \n\t"
++ "strb %[x], [%[dst]], %[stride_dst] \n\t"
++ "bne 1b \n\t"
++ "strb %[y], [%[dst]] \n\t"
++ : // Outputs
++ [x]"=&r"(x),
++ [y]"=&r"(y),
++ [src]"+r"(src),
++ [dst]"+r"(dst),
++ [height]"+r"(height)
++ : // Inputs
++ [stride_src]"r"(stride_src),
++ [stride_dst]"r"(stride_dst)
++ : // Clobbers
++ "cc", "memory"
++ );
++ break;
++ }
++}
++
++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
++ int pixel_shift, int height,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{ // Picks a copy routine by checking which side has a stride equal to one element, i.e. (1 << pixel_shift) bytes
++ if (stride_dst == 1 << pixel_shift) // dst elements are consecutive: strided src -> contiguous dst
++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
++ else if (stride_src == 1 << pixel_shift) // src elements are consecutive: contiguous src -> strided dst
++ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
++ else // neither side is consecutive: strided src -> strided dst
++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
++}
++
++#endif /* HAVE_NEON_INLINE */
++
++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_mv_arm.h
+@@ -0,0 +1,93 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
++#define AVCODEC_ARM_RPI_HEVC_MV_H
++
++#if HAVE_ARMV6T2_INLINE
++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) // returns a + b computed separately on each 16-bit half; presumably MvXY packs x and y into one 32-bit word - confirm against its typedef
++{
++ MvXY r;
++ __asm__ (
++ "sadd16 %[r], %[a], %[b] \n\t" // two independent signed 16-bit additions, no carry between the halves
++ : [r]"=r"(r)
++ : [a]"r"(a),
++ [b]"r"(b)
++ :
++ );
++ return r;
++}
++#define mvxy_add mvxy_add_arm
++#endif
++
++#if HAVE_ARMV6T2_INLINE
++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) // scales both signed 16-bit halves of xy by a factor derived from tb and td, returns them repacked
++{
++ int t;
++ __asm__ (
++ "ssat %[td], #8, %[td] \n\t" // td = clip(td, -128, 127)
++ "ssat %[tb], #8, %[tb] \n\t" // tb = clip(tb, -128, 127)
++ "eor %[t], %[td], %[td], asr #31 \n\t" // t = td ^ (td >> 31)
++ "adds %[t], %[t], %[td], lsr #31 \n\t" // t = abs(td); Z flag is set exactly when td == 0
++ "asr %[t], #1 \n\t" // t = abs(td) >> 1 (flags left as set by adds)
++ "add %[t], #0x4000 \n\t" // t = 0x4000 + (abs(td) >> 1) (flags still from adds)
++ "it ne \n\t" // the divide below is skipped when td == 0
++ "sdivne %[t], %[t], %[td] \n\t" // t = (0x4000 + (abs(td) >> 1)) / td
++ "mov %[td], #32 \n\t" // rounding term for the >> 6 below
++ "smlabb %[td], %[t], %[tb], %[td] \n\t" // td = low16(t) * low16(tb) + 32
++ "ssat %[td], #13, %[td], asr #6 \n\t" // td = clip(td >> 6, -4096, 4095): the scale factor
++ "mov %[tb], #127 \n\t" // rounding term for the >> 8 below
++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" // t = high16(xy) * scale + 127
++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" // tb = low16(xy) * scale + 127
++// This takes the sign of x & y for rounding at the "wrong" point
++// (i.e. after adding 127) but for the range of values (-1,-127)
++// where it does the wrong thing you get the right answer (0) anyway
++ "add %[t], %[t], %[t], lsr #31 \n\t" // add 1 when the high-half product is negative
++ "add %[xy], %[tb], %[tb], lsr #31 \n\t" // same for the low-half product, result moved into xy
++ "ssat %[t], #16, %[t], asr #8 \n\t" // high result = clip(t >> 8) to signed 16 bits
++ "ssat %[xy], #16, %[xy], asr #8 \n\t" // low result = clip(xy >> 8) to signed 16 bits
++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" // repack: low result in bits 0-15, high result in bits 16-31
++ :
++ [t]"=&r"(t),
++ [xy]"+r"(xy),
++ [td]"+r"(td),
++ [tb]"+r"(tb)
++ :
++ :
++ "cc"
++ );
++ return xy;
++}
++#define mv_scale_xy mv_scale_xy_arm
++#endif
++#endif
++
++#endif // AVCODEC_ARM_RPI_HEVC_MV_H
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_arm.h
+@@ -0,0 +1,26 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVCDSP_ARM_H /* guard named after this file (rpi_hevcdsp_arm.h) so it cannot clash with another header using the generic HEVCDSP_ARM name */
++#define AVCODEC_ARM_RPI_HEVCDSP_ARM_H
++
++#include "libavcodec/rpi_hevcdsp.h"
++
++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); /* defined outside this header; presumably fills *c with the NEON routines for bit_depth - confirm at the definition */
++
++#endif /* AVCODEC_ARM_RPI_HEVCDSP_ARM_H */
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
+@@ -0,0 +1,1634 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
++ vsubl.u8 q0, \Q0a, \P0a @ q0 - p0, widened to 16 bits
++ vsubl.u8 q1, \P1a, \Q1a @ p1 - q1, widened to 16 bits
++ vdup.16 d4, r2 @ low 16 bits of r2 (two tc bytes) copied to every lane
++ \I1
++ vshl.i16 q0, #2 @ (q0 - p0) * 4
++ \I2
++ vadd.i16 q0, q1 @ ((q0 - p0) * 4) + p1 - q1
++ \I3
++ vmovl.u8 q2, d4 @ tc bytes widened to 16 bits
++ \I4
++ vneg.s16 q1, q2 @ -tc
++ \I5
++ vrshr.s16 q0, #3 @ (((q0 - p0) * 4) + p1 - q1 + 4) >> 3
++ \I6
++ \I7
++ \I8
++ vmin.s16 q0, q2 @ upper clamp at tc
++ vmovl.u8 q2, \Q0a @ q0 widened to 16 bits
++ vmax.s16 q0, q1 @ lower clamp at -tc, giving delta
++ vaddw.u8 q1, q0, \P0a @ p0 + delta
++ vsub.i16 q0, q2, q0 @ q0 - delta
++ vqmovun.s16 \P0a, q1 @ new p0, saturated to unsigned 8 bits
++ vqmovun.s16 \Q0a, q0 @ new q0, saturated to unsigned 8 bits
++.endm
++
++
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vmovl.u8 q2, d4 @ tc0a, tc0b
++ \I3
++ vmovl.u8 q3, d6 @ tc1a, tc1b
++ \I4
++ vmin.s16 q0, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
++ vmin.s16 q1, q3
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vmovl.u8 q2, \Q0a
++ vmax.s16 q1, q3 @ delta0b
++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
++ vsub.i16 q0, q2, q0 @ q0a - delta0a
++ vmovl.u8 q2, \Q0b
++ vsub.i16 q2, q1 @ q0b - delta0b
++ vaddw.u8 q1, \P0b @ p0b + delta0b
++ vqmovun.s16 \Q0a, q0
++ vqmovun.s16 \P0a, q3
++ vqmovun.s16 \Q0b, q2
++ vqmovun.s16 \P0b, q1
++.endm
++
++
++@ Preserves r12
++@ Clobbers r2
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@ [0..7] tc U a
++@ [8..15] tc V a
++
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
++ vsub.i16 q0, \Q0a, \P0a
++ vsub.i16 q1, \P1a, \Q1a
++ vdup.16 d4, r2
++ \I1
++ vshl.i16 q0, #2
++ \I2
++ vadd.i16 q0, q1
++ \I3
++ vshll.u8 q2, d4, #\bit_depth - 8
++ \I4
++ vneg.s16 q1, q2
++ \I5
++ vrshr.s16 q0, #3
++ \I6
++ \I7
++ \I8
++ vmin.s16 q0, q2
++ vmov.i16 q2, #0
++ vmax.s16 q0, q1
++ vadd.i16 \P0a, q0
++ vsub.i16 \Q0a, q0
++ vmov.i16 q1, #(1 << \bit_depth) - 1
++ vmax.s16 \P0a, q2
++ vmax.s16 \Q0a, q2
++ vmin.s16 \P0a, q1
++ vmin.s16 \Q0a, q1
++.endm
++
++@ Clobbers r2, r12
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@ [0..7] tc U a
++@ [8..15] tc V a
++@ [16..23] tc U b
++@ [24..31] tc V b
++
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
++ lsr r12, r2, #16
++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
++ vshl.i16 q0, #2 @ (q0a - p0a) * 4
++ vshl.i16 q1, #2 @ (q0b - p0b) * 4
++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
++ vdup.16 d4, r2 @ tc0a, tc0b
++ vdup.16 d6, r12 @ tc1a, tc1b
++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++ \I1
++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++ \I2
++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
++ \I3
++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
++ \I4
++ vmin.s16 q0, q2
++ \I5
++ vneg.s16 q2, q2 @ -tc0a, -tc0b
++ \I6
++ vmin.s16 q1, q3
++ \I7
++ vneg.s16 q3, q3 @ -tc1a, -tc1b
++ vmax.s16 q0, q2 @ delta0a
++ vadd.i16 \P0a, q0 @ p0a + delta0a
++ vsub.i16 \Q0a, q0 @ q0a - delta0a
++ vmax.s16 q1, q3 @ delta0b
++ vadd.i16 \P0b, q1 @ p0b + delta0b
++ vsub.i16 \Q0b, q1 @ q0b - delta0b
++ vmov.i16 q2, #0
++ vmov.i16 q3, #(1 << \bit_depth) - 1
++ vmax.s16 \P0a, q2
++ vmax.s16 \Q0a, q2
++ vmax.s16 \P0b, q2
++ vmax.s16 \Q0b, q2
++ vmin.s16 \P0a, q3
++ vmin.s16 \Q0a, q3
++ vmin.s16 \P0b, q3
++ vmin.s16 \Q0b, q3
++.endm
++
++
++
++@ uint8_t *_no_p, [sp+0]
++@ uint8_t *_no_q) [sp+4]
++
++.macro hevc_loop_filter_luma_start
++ ldr r12, [r3] @ r12 = tc[0]
++ ldr r3, [r3, #4] @ r3 = tc[1]
++ orrs r3, r12, r3, lsl #16 @ r3 = tc[0] | (tc[1] << 16), Z set when the combined word is zero
++ it eq
++ bxeq lr @ combined tc word is zero: return before anything is pushed
++ push {r4-r10,lr} @ 32 bytes
++ ldrd r4, r5, [sp, #32] @ &_no_p
++ ldrb r4, [r4] @ r4 = byte at _no_p
++ ldrb r5, [r5] @ r5 = byte at _no_q
++ movs r10, r4
++ it ne
++ movne r10, #1 @ r10 bit 0 = (no_p byte != 0)
++ cmp r5, #0
++ it ne
++ orrne r10, #2 @ r10 bit 1 = (no_q byte != 0)
++.endm
++
++@ Input:
++@ r2 beta (raw: needs shift for bitdepth > 8)
++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
++@
++@ Input & output
++@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
++@ 16-bit: q8-q15
++@
++@ r1 -r1
++@ r10 b1->C, b0->N (r10 junk)
++@
++@ Junks:
++@ r5, r6, r7, r8, r9
++
++.macro m_filter_luma bit_depth, Q11, Q15
++.if \bit_depth == 8
++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
++.endif
++ vadd.i16 q0, q9, \Q11 @ P2 + P0
++.if \bit_depth > 8
++ lsl r3, r3, #(\bit_depth - 8)
++.endif
++ vadd.i16 q1, q14, q12 @ Q2 + Q0
++.if \bit_depth > 8
++ lsl r2, r2, #(\bit_depth - 8)
++.endif
++ vsub.i16 q0, q10 @ P2 - P1 + P0
++ lsr r5, r3, #16
++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
++.if \bit_depth == 8
++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
++.endif
++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
++ vmov.i64 q2, #0xffffffff0000
++ vbic q0, q2 @ only dp0(') and dp3(')
++ vbic q1, q2 @ only dq0(') and dq3(')
++ vsra.u64 q0, #16
++ vsra.u64 q1, #16
++ vdup.16 q3, r2 @ beta
++ vdup.16 d14, r3 @ tC[0]
++ vdup.16 d15, r5 @ tC[1]
++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
++ vshl.s16 q6, q7, #2 @ tC[] * 4
++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
++ cmp r7, #0
++ beq .Lbypasswrite
++
++ vcgt.s16 q5, q6, q5 @ if < tc25
++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
++ vand q4, q5
++ vbic d8, d4
++ vbic d9, d4
++ vshr.s16 q3, #2 @ beta_2 = beta >> 2
++ vsra.u64 q4, #16
++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
++ vshl.i16 q7, #1 @ tc2 = tC[] << 1
++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
++ vand d6, d8 @ && beta_2 tests, prime in ms half
++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
++ vneg.s16 q6, q7 @ -tc2
++ vmovn.i32 d8, q3
++ vshrn.i32 d6, q3, #16
++ vand d6, d8
++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
++ vadd.i16 q0, \Q11, q12 @ p0 + q0
++ ands r9, r7, r8
++ beq 1f
++
++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
++ lsr r3, r9, #16
++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
++ vadd.i16 q0, q8, q9 @ p3 + p2
++ vadd.i16 q5, \Q15, q14 @ q2 + q3
++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
++ vrshr.s16 q0, #3 @ scale, with rounding
++ vrshr.s16 q5, #3
++ vrshr.s16 q1, #2
++ vrshr.s16 q4, #2
++ vrshr.s16 q2, #3
++ vrshr.s16 q3, #3
++ vsub.i16 q0, q9 @ find difference
++ vsub.i16 q5, q14
++ vsub.i16 q1, q10
++ vsub.i16 q4, q13
++ vsub.i16 q2, \Q11
++ vsub.i16 q3, q12
++ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
++ vmax.s16 q5, q6
++ vmax.s16 q1, q6
++ vmax.s16 q4, q6
++ vmax.s16 q2, q6
++ vmax.s16 q3, q6
++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
++ vdup.16 d13, r3
++ vmin.s16 q0, q7
++ vmin.s16 q5, q7
++ vmin.s16 q1, q7
++ vmin.s16 q4, q7
++ vmin.s16 q2, q7
++ vmin.s16 q3, q7
++ vadd.i16 q0, q9 @ apply difference
++ vadd.i16 q5, q14
++ vadd.i16 q1, q10
++ vadd.i16 q4, q13
++ vadd.i16 q2, \Q11
++ vadd.i16 q3, q12
++ vbit q9, q0, q6 @ apply filtered values according to mask
++ vbit q14, q5, q6
++ vbit q10, q1, q6
++ vbit q13, q4, q6
++ vbit \Q11, q2, q6
++ vbit q12, q3, q6
++ vneg.s16 q6, q7 @ restore -tc2
++
++1:
++ bics r9, r7, r8
++ beq 2f
++
++ vsub.i16 q0, q12, \Q11 @ q0 - p0
++ vsub.i16 q1, q13, q10 @ q1 - p1
++ lsr r3, r9, #16
++ vshl.i16 q2, q0, #3
++ lsr r7, r5, #16
++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
++ lsr r8, r6, #16
++ vshl.i16 q2, q1, #1
++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
++ vsub.i16 q5, q3, q4
++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
++ vmax.s16 q6, q5 @ max(-tc, delta0)
++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
++ vdup.16 q0, r2 @ beta
++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
++ vshr.s16 q4, #1 @ tc_2 = tc >> 1
++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
++ vshr.s16 q2, q0, #1 @ beta >> 1
++ vadd.i16 q2, q0 @ beta + (beta >> 1)
++ vneg.s16 q0, q4 @ -tc_2
++ vabs.s16 q5, q5 @ abs(original delta0)
++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
++ vmax.s16 q1, q0
++ vmax.s16 q3, q0
++ vshl.s16 q0, q7, #2 @ 8 * tc
++ vadd.i16 q7, q0 @ 10 * tc
++ vdup.16 d0, r9
++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
++ vdup.16 d8, r5 @ dp0 + dp3
++ vdup.16 d9, r7 @ dp0' + dp3'
++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
++ vdup.16 d10, r6 @ dq0 + dq3
++ vdup.16 d11, r8 @ dq0' + dq3'
++ vand q7, q0 @ AND block and line masks
++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
++ vadd.i16 q0, q1, q10 @ p1 + deltap1
++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
++ vadd.i16 q3, q3, q13 @ q1 + deltaq1
++ vadd.i16 q1, \Q11, q6 @ p0 + delta0
++ vsub.i16 q2, q12, q6 @ q0 - delta0
++ vand q4, q7 @ AND nd_p test with block/line masks
++ vand q5, q7 @ AND nd_q test with block/line masks
++ vbit q10, q0, q4
++ vbit \Q11, q1, q7
++ vbit q12, q2, q7
++ vbit q13, q3, q5
++
++2:
++.if \bit_depth == 8
++ vmovn.i16 d16, q8
++ vmovn.i16 d23, \Q15
++ neg r1, r1
++ vqmovun.s16 d17, q9
++ vqmovun.s16 d18, q10
++ vqmovun.s16 d19, \Q11
++ lsls r10, #31
++ vqmovun.s16 d20, q12
++ vqmovun.s16 d21, q13
++ vqmovun.s16 d22, q14
++.else
++ vmov.i16 q0, #0
++ vmov.i16 q1, #(1 << \bit_depth) - 1 @ max sample value, parenthesised so it does not depend on assembler operator precedence
++ @ q8 & q15 should be unaltered and so don't require clipping
++ neg r1, r1
++ vmax.s16 q9, q0
++ vmax.s16 q10, q0
++ vmax.s16 q11, q0
++ vmax.s16 q12, q0
++ vmax.s16 q13, q0
++ vmax.s16 q14, q0
++ lsls r10, #31
++ vmin.s16 q9, q1
++ vmin.s16 q10, q1
++ vmin.s16 q11, q1
++ vmin.s16 q12, q1
++ vmin.s16 q13, q1
++ vmin.s16 q14, q1
++.endif
++ bx lr
++.endm
++
++function hevc_loop_filter_luma_body
++ m_filter_luma 8, q15, q11
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
++@ uint8_t *_pix, [r0]
++@ ptrdiff_t _stride, [r1]
++@ int _beta, [r2]
++@ int *_tc, [r3]
++@ uint8_t *_no_p, [sp+0]
++@ uint8_t *_no_q) [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
++ hevc_loop_filter_luma_start
++
++ sub r4, r0, #4 @ r4 = pix - 4, the four bytes to the left of pix on each row
++ b .Lv_loop_luma_common @ continue in the body shared with ff_hevc_rpi_v_loop_filter_luma2_neon_8
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
++@ uint8_t * pix_r, [r0]
++@ ptrdiff_t _stride, [r1]
++@ int _beta, [r2]
++@ int tc2, [r3]
++@ int no_f, [sp+0]
++@ uint8_t * pix_l) [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
++ cmp r3, #0
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r4, [sp, #36]
++ ldr r10, [sp, #32]
++
++.Lv_loop_luma_common:
++ vpush {d8-d15}
++
++ @ It's slightly faster to do unlaned loads and transpose in the
++ @ 8-bit case, even though it needs more instructions, because
++ @ VLD4.8 is a really slow way to read from memory.
++ vld1.32 {d16[0]}, [r4:32], r1
++ vld1.32 {d20[0]}, [r0:32], r1
++ vld1.32 {d16[1]}, [r4:32], r1
++ vld1.32 {d20[1]}, [r0:32], r1
++ vld1.32 {d17[0]}, [r4:32], r1
++ vld1.32 {d21[0]}, [r0:32], r1
++ vld1.32 {d17[1]}, [r4:32], r1
++ vld1.32 {d21[1]}, [r0:32], r1
++ vld1.32 {d18[0]}, [r4:32], r1
++ vld1.32 {d22[0]}, [r0:32], r1
++ vld1.32 {d18[1]}, [r4:32], r1
++ vld1.32 {d22[1]}, [r0:32], r1
++ vld1.32 {d19[0]}, [r4:32], r1
++ vld1.32 {d23[0]}, [r0:32], r1
++ vld1.32 {d19[1]}, [r4:32]
++ vld1.32 {d23[1]}, [r0:32]
++ vuzp.16 q8, q9
++ vuzp.16 q10, q11
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vswp d17, d18
++ vswp d21, d22
++
++ bl hevc_loop_filter_luma_body
++
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
++ @ no_p[1]
++ bmi 1f
++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
++
++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
++1:
++ @ no_q[1]
++ bcs 1f
++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
++
++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
++1:
++ pop {r4-r10,pc}
++
++.Lbypasswrite:
++ vpop {d8-d15}
++ pop {r4-r10,pc}
++endfunc
++
++.macro m_filter_v_luma_16 bit_depth
++ vpush {d8-d15}
++
++ @ Uses slightly fewer instructions to do laned loads than unlaned
++ @ and transpose. This also means that we can use the same code for
++ @ both split & unsplit deblock
++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
++
++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++
++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++
++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++
++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++
++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++
++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++
++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
++
++ bl hevc_loop_filter_luma_body_\bit_depth
++
++ add r6, r4, r1
++ add r2, r0, r1
++ lsl r1, #1
++
++ vpop {d8-d15}
++
++ @ p[1]
++ bmi 1f
++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
++1:
++ @ q[1]
++ bcs 1f
++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
++1:
++ pop {r4-r10,pc}
++.endm
++
++
++
++
++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
++@ ptrdiff_t stride, [r1]
++@ int beta, [r2]
++@ int32_t *tc, [r3]
++@ uint8_t *no_p, sp[0]
++@ uint8_t *no_q); sp[4]
++@
++@ Src should always be on an 8 byte boundary & all in the same slice
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
++ hevc_loop_filter_luma_start
++ b .Lh_loop_filter_luma_common_8
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
++ cmp r3, #0
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r10, [sp, #32]
++
++.Lh_loop_filter_luma_common_8:
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
++ vpush {d8-d15}
++
++ vld1.8 {d16}, [r4], r1
++ vld1.8 {d17}, [r0], r1
++ vld1.8 {d18}, [r4], r1
++ vld1.8 {d19}, [r0], r1
++ vld1.8 {d20}, [r4], r1
++ vld1.8 {d21}, [r0], r1
++ vld1.8 {d22}, [r4]
++ vld1.8 {d23}, [r0]
++
++ bl hevc_loop_filter_luma_body
++
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
++ vpop {d8-d15}
++
++ @ P2-P0
++ bcs 1f
++ vst1.8 {d22}, [r4], r1
++ vst1.8 {d21}, [r6]
++ vst1.8 {d20}, [r4]
++1:
++ @ Q0-Q2
++ bmi 1f
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r2]
++ vst1.8 {d17}, [r0]
++1:
++ pop {r4-r10,pc}
++endfunc
++
++
++.macro m_filter_h_luma_16 bit_depth
++ sub r4, r0, r1, lsl #2
++ add r0, r4, r1
++ lsl r1, #1
++ vpush {d8-d15}
++
++ vld1.16 { q8}, [r4], r1
++ vld1.16 { q9}, [r0], r1
++ vld1.16 {q10}, [r4], r1
++ vld1.16 {q11}, [r0], r1
++ vld1.16 {q12}, [r4], r1
++ vld1.16 {q13}, [r0], r1
++ vld1.16 {q14}, [r4]
++ vld1.16 {q15}, [r0]
++
++ bl hevc_loop_filter_luma_body_\bit_depth
++
++ add r0, r0, r1, lsl #1
++ add r2, r4, r1, lsl #1
++ add r6, r4, r1, asr #1
++ vpop {d8-d15}
++
++ @ P2-P0
++ bcs 1f
++ vst1.16 {q14}, [r4], r1
++ vst1.16 {q13}, [r6]
++ vst1.16 {q12}, [r4]
++1:
++ bmi 1f
++ vst1.16 {q11}, [r0], r1
++ vst1.16 {q10}, [r2]
++ vst1.16 { q9}, [r0]
++1:
++ pop {r4-r10,pc}
++.endm
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ unsigned int no_f); // r3
++@
++@ no_f
++@ 0 tl P0
++@ 1 tr P1
++@ 2 bl Q0
++@ 3 br Q1
++@
++@ Probably not worth having the P/Qa only special case in this direction
++@ Given layout we won't save any memory reads or avoid any cache dirtying
++@ We would save a bit of computation but I expect the partials to be less
++@ common in the H direction than V due to how we arrange deblock.
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
++ sub r12, r0, r1 @ r12 = src - stride (P0 row)
++ cmp r2, #0
++ it eq
++ bxeq lr @ tc4 == 0: return without loading anything
++ vld1.8 {d26,d27}, [r0] @ Q0 row (at src)
++ lsl r1, #1 @ r1 = 2 * stride
++ sub r0, r1 @ r0 = src - 2 * stride (P1 row)
++ vld1.8 {d18,d19}, [r12], r1 @ P0 row, then r12 = src + stride
++ vld1.8 {d16,d17}, [r0], r1 @ P1 row, then r0 = src
++ vld1.8 {d28,d29}, [r12] @ Q1 row
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
++ "sub r12, r0, r1, asr #1"
++
++ lsls r3, #29 @ b2 -> N, b3 -> C
++ it pl
++ vstrpl d26, [r0, #0] @ first 8 bytes of Q0 row, written only when no_f bit 2 is clear
++ it cc
++ vstrcc d27, [r0, #8] @ second 8 bytes of Q0 row, written only when no_f bit 3 is clear
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ it pl
++ vstrpl d18, [r12, #0] @ first 8 bytes of P0 row, written only when no_f bit 0 is clear
++ it cc
++ vstrcc d19, [r12, #8] @ second 8 bytes of P0 row, written only when no_f bit 1 is clear
++ bx lr
++
++endfunc
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ unsigned int no_f); // r3
++@
++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
++@
++@ Macro here actual function near bottom
++
++.macro m_filter_h_uv_16 bit_depth
++ sub r12, r0, r1 @ r12 = src - stride (row just above the edge)
++ cmp r2, #0 @ tc4 == 0: nothing to filter, return at once
++ it eq
++ bxeq lr
++ vld1.16 {q12, q13}, [r0] @ 32 bytes of the row at src
++ lsl r1, #1 @ r1 = 2 * stride
++ sub r0, r1 @ r0 = src - 2 * stride
++ vld1.16 {q10, q11}, [r12], r1 @ row at src - stride, then r12 -> src + stride
++ vld1.16 {q8, q9 }, [r0], r1 @ row at src - 2 * stride, then r0 -> src again
++ vld1.16 {q14, q15}, [r12] @ row at src + stride
++
++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
++ "sub r12, r0, r1, asr #1", \
++ "cmp r3, #0"
++
++ bne 1f @ NE presumably from the cmp r3, #0 handed to the body macro: some no_f bit is set
++ vst1.16 {q10, q11}, [r12] @ no_f == 0: write the whole row at r12
++ vst1.16 {q12, q13}, [r0] @ and the whole row at src
++ bx lr
++
++ @ At least one no_f bit is set
++ @ Which means we need to break this apart in an ugly fashion
++1:
++ lsls r3, #29 @ b2 -> N, b3 -> C
++ itt pl
++ vstrpl d24, [r0, #0] @ b2 clear: first 16 bytes of the row at src (q12)
++ vstrpl d25, [r0, #8]
++ itt cc
++ vstrcc d26, [r0, #16] @ b3 clear: second 16 bytes of the row at src (q13)
++ vstrcc d27, [r0, #24]
++ lsls r3, #2 @ b0 -> N, b1 -> C
++ itt pl
++ vstrpl d20, [r12, #0] @ b0 clear: first 16 bytes at r12 (q10)
++ vstrpl d21, [r12, #8]
++ itt cc
++ vstrcc d22, [r12, #16] @ b1 clear: second 16 bytes at r12 (q11)
++ vstrcc d23, [r12, #24]
++ bx lr
++.endm
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ uint8_t * src_l, // r3
++@ unsigned int no_f); // sp[0]
++@
++@ no_f:
++@ 0 tl P0a
++@ 1 tr Q0a
++@ 2 bl P0b
++@ 3 br Q0b
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
++ cmp r2, #0 @ tc4 == 0: nothing to filter, return at once
++ it eq
++ bxeq lr
++ push {lr} @ 4 bytes pushed, so the stacked no_f argument is now at [sp, #4]
++ vld2.16 {d16[0], d18[0]}, [r3], r1 @ src_l side: two 16-bit elements of this row go to lane 0 of d16 and d18
++ vld2.16 {d20[0], d22[0]}, [r0], r1 @ src_r side: likewise into d20 and d22
++
++ cmp r2, #0x10000 @ LO when tc4 < 0x10000; the lane loads below leave the flags alone until blo
++ vld2.16 {d16[1], d18[1]}, [r3], r1
++ vld2.16 {d20[1], d22[1]}, [r0], r1
++
++ vld2.16 {d16[2], d18[2]}, [r3], r1
++ vld2.16 {d20[2], d22[2]}, [r0], r1
++
++ vld2.16 {d16[3], d18[3]}, [r3], r1
++ vld2.16 {d20[3], d22[3]}, [r0], r1
++ blo 10f @ only the first four rows are needed: single lump path
++
++ vld2.16 {d17[0], d19[0]}, [r3], r1 @ second group of four rows goes to d17/d19 and d21/d23
++ vld2.16 {d21[0], d23[0]}, [r0], r1
++
++ sub ip, r0, r3 @ ip = distance between the right and left pointers
++ vld2.16 {d17[1], d19[1]}, [r3], r1
++ vld2.16 {d21[1], d23[1]}, [r0], r1
++
++ cmp ip, #4 @ EQ iff r0 == r3 + 4, consumed by the it eq; cmpeq handed to the body macro
++ vld2.16 {d17[2], d19[2]}, [r3], r1
++ vld2.16 {d21[2], d23[2]}, [r0], r1
++
++ vld2.16 {d17[3], d19[3]}, [r3]
++ vld2.16 {d21[3], d23[3]}, [r0]
++
++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #2", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
++ bne 1f @ NE unless r0 == r3 + 4 and no_f == 0 (assuming the macro ran the instructions handed to it - confirm)
++
++@ Much/most of the time r0 == r3 + 4 and no_f == 0
++@ so it is worth having this special case
++ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
++ vst2.16 {d19[2], d21[2]}, [ip], r1
++ vst2.16 {d19[1], d21[1]}, [r3], r1
++ vst2.16 {d19[0], d21[0]}, [ip], r1
++ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
++ vst2.16 {d18[2], d20[2]}, [ip], r1
++ vst2.16 {d18[1], d20[1]}, [r3]
++ vst2.16 {d18[0], d20[0]}, [ip]
++ pop {pc}
++
++@ Either split or partial
++1:
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1 @ Q0b suppressed: step both Q pointers past its four rows
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
++ @ Q0b
++ vst1.16 {d21[3]}, [r0], r1
++ vst1.16 {d21[2]}, [r2], r1
++ vst1.16 {d21[1]}, [r0], r1
++ vst1.16 {d21[0]}, [r2], r1
++1:
++ ittt mi
++ addmi r3, r3, r1, lsl #1 @ P0b suppressed: step both P pointers past its four rows
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
++ @ P0b
++ vst1.16 {d19[3]}, [r3], r1
++ vst1.16 {d19[2]}, [ip], r1
++ vst1.16 {d19[1]}, [r3], r1
++ vst1.16 {d19[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
++ @ Q0a
++ vst1.16 {d20[3]}, [r0], r1
++ vst1.16 {d20[2]}, [r2], r1
++ vst1.16 {d20[1]}, [r0]
++ vst1.16 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc} @ P0a suppressed too: nothing left to write
++ @ P0a
++ vst1.16 {d18[3]}, [r3], r1
++ vst1.16 {d18[2]}, [ip], r1
++ vst1.16 {d18[1]}, [r3]
++ vst1.16 {d18[0]}, [ip]
++ pop {pc}
++
++@ Single lump (rather than double)
++10:
++ @ As we have post inced r0/r3 in the load the easiest thing to do is
++ @ to subtract and write forwards, rather than backwards (as above)
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
++ "ldr lr, [sp, #4]", \
++ "add r3, #2", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
++
++ bcs 3f @ C = no_f b1 (from the lsls lr, #31 handed to the macro): skip Q0a
++ @ Q0a
++ vst1.16 {d20[0]}, [r0], r1
++ vst1.16 {d20[1]}, [r2], r1
++ vst1.16 {d20[2]}, [r0]
++ vst1.16 {d20[3]}, [r2]
++3:
++ it mi
++ popmi {pc} @ N = no_f b0: skip P0a and return
++ @ P0a
++ vst1.16 {d18[0]}, [r3], r1
++ vst1.16 {d18[1]}, [ip], r1
++ vst1.16 {d18[2]}, [r3]
++ vst1.16 {d18[3]}, [ip]
++ pop {pc}
++
++endfunc
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, // r0
++@ unsigned int stride, // r1
++@ uint32_t tc4, // r2
++@ uint8_t * src_l, // r3
++@ unsigned int no_f); // sp[0]
++@ Macro here, actual function near bottom
++
++@ no_f
++@ 0 tl P0a
++@ 1 tr Q0a
++@ 2 bl P0b
++@ 3 br Q0b
++
++@ P1: q8, q12
++@ P0: q9, q13
++@ Q0: q10, q14
++@ Q1: q11, q15
++
++.macro m_filter_v_uv2_16 bit_depth
++ cmp r2, #0 @ tc4 == 0: nothing to filter, return at once
++ it eq
++ bxeq lr
++ push {lr} @ 4 bytes pushed, so the stacked no_f argument is now at [sp, #4]
++ vld2.32 {d16[0], d18[0]}, [r3], r1 @ src_l side: two 32-bit elements of this row go to lane 0 of d16 and d18
++ vld2.32 {d20[0], d22[0]}, [r0], r1 @ src_r side: likewise into d20 and d22
++
++ cmp r2, #0x10000 @ LO when tc4 < 0x10000; the lane loads below leave the flags alone until blo
++ vld2.32 {d16[1], d18[1]}, [r3], r1
++ vld2.32 {d20[1], d22[1]}, [r0], r1
++
++ vld2.32 {d17[0], d19[0]}, [r3], r1
++ vld2.32 {d21[0], d23[0]}, [r0], r1
++
++ vld2.32 {d17[1], d19[1]}, [r3], r1
++ vld2.32 {d21[1], d23[1]}, [r0], r1
++ blo 10f @ only the first four rows are needed: single lump path
++
++ vld2.32 {d24[0], d26[0]}, [r3], r1 @ second group of four rows goes to q12/q13 and q14/q15
++ vld2.32 {d28[0], d30[0]}, [r0], r1
++
++ sub ip, r0, r3 @ ip = distance between the right and left pointers
++ vld2.32 {d24[1], d26[1]}, [r3], r1
++ vld2.32 {d28[1], d30[1]}, [r0], r1
++
++ cmp ip, #8 @ EQ iff r0 == r3 + 8, consumed by the it eq; cmpeq handed to the body macro
++ vld2.32 {d25[0], d27[0]}, [r3], r1
++ vld2.32 {d29[0], d31[0]}, [r0], r1
++
++ vld2.32 {d25[1], d27[1]}, [r3]
++ vld2.32 {d29[1], d31[1]}, [r0]
++
++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "neg r1, r1", \
++ "it eq; cmpeq lr, #0", \
++ "add r3, #4", \
++ "add ip, r3, r1", \
++ "add r2, r0, r1", \
++ "lsl r1, #1"
++
++ bne 1f @ NE unless r0 == r3 + 8 and no_f == 0 (assuming the macro ran the instructions handed to it - confirm)
++
++@ Much/most of the time r0 == r3 + 8 and no_f == 0
++@ so it is worth having this special case
++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
++ vst2.32 {d27[0], d29[0]}, [ip], r1
++ vst2.32 {d26[1], d28[1]}, [r3], r1
++ vst2.32 {d26[0], d28[0]}, [ip], r1
++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
++ vst2.32 {d19[0], d21[0]}, [ip], r1
++ vst2.32 {d18[1], d20[1]}, [r3]
++ vst2.32 {d18[0], d20[0]}, [ip]
++ pop {pc}
++
++@ Either split or partial
++1:
++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++ ittt cs
++ addcs r0, r0, r1, lsl #1 @ Q0b suppressed: step both Q pointers past its four rows
++ addcs r2, r2, r1, lsl #1
++ bcs 1f
++ @ Q0b
++ vst1.32 {d29[1]}, [r0], r1
++ vst1.32 {d29[0]}, [r2], r1
++ vst1.32 {d28[1]}, [r0], r1
++ vst1.32 {d28[0]}, [r2], r1
++1:
++ ittt mi
++ addmi r3, r3, r1, lsl #1 @ P0b suppressed: step both P pointers past its four rows
++ addmi ip, ip, r1, lsl #1
++ bmi 1f
++ @ P0b
++ vst1.32 {d27[1]}, [r3], r1
++ vst1.32 {d27[0]}, [ip], r1
++ vst1.32 {d26[1]}, [r3], r1
++ vst1.32 {d26[0]}, [ip], r1
++1:
++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++ bcs 1f
++ @ Q0a
++ vst1.32 {d21[1]}, [r0], r1
++ vst1.32 {d21[0]}, [r2], r1
++ vst1.32 {d20[1]}, [r0]
++ vst1.32 {d20[0]}, [r2]
++1:
++ it mi
++ popmi {pc} @ P0a suppressed too: nothing left to write
++ @ P0a
++ vst1.32 {d19[1]}, [r3], r1
++ vst1.32 {d19[0]}, [ip], r1
++ vst1.32 {d18[1]}, [r3]
++ vst1.32 {d18[0]}, [ip]
++ pop {pc}
++
++@ Single lump (rather than double)
++10:
++ @ As we have post inced r0/r3 in the load the easiest thing to do is
++ @ to subtract and write forwards, rather than backwards (as above)
++ @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
++ "ldr lr, [sp, #4]", \
++ "add r3, #4", \
++ "sub r0, r0, r1, lsl #2", \
++ "sub r3, r3, r1, lsl #2", \
++ "lsls lr, #31", \
++ "add r2, r0, r1", \
++ "add ip, r3, r1", \
++ "lsl r1, #1"
++
++ bcs 3f @ C = no_f b1 (from the lsls lr, #31 handed to the macro): skip Q0a
++ @ Q0a
++ vst1.32 {d20[0]}, [r0], r1
++ vst1.32 {d20[1]}, [r2], r1
++ vst1.32 {d21[0]}, [r0]
++ vst1.32 {d21[1]}, [r2]
++3:
++ it mi
++ popmi {pc} @ N = no_f b0: skip P0a and return
++ @ P0a
++ vst1.32 {d18[0]}, [r3], r1
++ vst1.32 {d18[1]}, [ip], r1
++ vst1.32 {d19[0]}, [r3]
++ vst1.32 {d19[1]}, [ip]
++ pop {pc}
++.endm
++
++
++@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
++@ But in real world testing it is ~20% slower, presumably due to code size
++
++#if 0 // NEON version
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc0, int in_inc1)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ mov ip, sp
++ push {a1-a3,v1-v8,lr}
++ ldm ip, {v1-v6}
++ cmp a1, #2
++ bls 2f
++ vpush {d8-d13}
++ sub v5, v5, #10
++ sub v6, v6, #10
++1:
++ vld2.32 {d0[0], d2[0]}, [a3]!
++ vld2.32 {d4[0], d6[0]}, [a4]!
++ vmov.u8 q12, #0
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[0]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[0]}, [ip]
++ vld1.32 {d18[0]}, [v8]
++ vld1.32 {d22[0]}, [lr]
++
++ vld2.32 {d0[1], d2[1]}, [a3]!
++ vld2.32 {d4[1], d6[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d12, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d13, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d27, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[2]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d16[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d20[1]}, [ip]
++ vld1.32 {d18[1]}, [v8]
++ vld1.32 {d22[1]}, [lr]
++
++ vld2.32 {d1[0], d3[0]}, [a3]!
++ vld2.32 {d5[0], d7[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[4]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[4]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[0]}, [ip]
++ vld1.32 {d19[0]}, [v8]
++ vld1.32 {d23[0]}, [lr]
++
++ vld2.32 {d1[1], d3[1]}, [a3]!
++ vld2.32 {d5[1], d7[1]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb v8, [a3], #1
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d24[6]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d25[6]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d17[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d21[1]}, [ip]
++ vld1.32 {d19[1]}, [v8]
++ vld1.32 {d23[1]}, [lr]
++
++ @ So now we have:
++ @ q0.32[i] = curr[i].mv[0]
++ @ q1.32[i] = curr[i].mv[1]
++ @ q2.32[i] = neigh[i].mv[0]
++ @ q3.32[i] = neigh[i].mv[1]
++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d24.16[i] = curr[i].pred_flag
++ @ d25.16[i] = neigh[i].pred_flag
++
++ vtst.16 d28, d24, d12
++ vtst.16 d29, d24, d13
++ vadd.i16 d8, d24, d12
++ vadd.i16 d9, d25, d12
++ vtst.16 d30, d25, d12
++ vtst.16 d31, d25, d13
++ veor d26, d8, d9
++ ldr lr, [sp, 6*8 + 1*4]
++ vmovl.s16 q4, d28
++ vmovl.s16 q5, d29
++ teq lr, #1
++ vmovl.s16 q14, d30
++ it ne
++ lslne v1, lr, #1
++ vmovl.s16 q15, d31
++ it ne
++ rsbne v2, v1, #32
++ vbif q0, q1, q4
++ vbif q2, q3, q14
++ vbif q1, q0, q5
++ vbif q3, q2, q15
++ vabd.s16 q12, q0, q2
++ vabd.s16 q2, q1
++ vabd.s16 q0, q3
++ vabd.s16 q1, q3
++ vbif q8, q9, q4
++ vbif q10, q11, q14
++ vbif q9, q8, q5
++ vbif q11, q10, q15
++ vclt.u16 d6, d24, d27
++ vclt.u16 d8, d2, d27
++ vclt.u16 d7, d25, d27
++ vclt.u16 d9, d3, d27
++ vclt.u16 d2, d0, d27
++ vclt.u16 d0, d4, d27
++ vclt.u16 d3, d1, d27
++ vclt.u16 d1, d5, d27
++ vceq.i32 q12, q10, q8
++ vceq.i32 q10, q9
++ vceq.i32 q8, q11
++ vceq.i32 q9, q11
++ vshrn.i32 d6, q3, #8
++ vshrn.i32 d7, q4, #8
++ vshrn.i32 d8, q1, #8
++ vshrn.i32 d9, q0, #8
++ vmovn.i32 d4, q12
++ vmovn.i32 d2, q10
++ vmovn.i32 d3, q8
++ vmovn.i32 d5, q9
++ vand q2, q3
++ vrev16.8 q3, q3
++ vand q2, q3
++ vand q1, q4
++ vrev16.8 q4, q4
++ vand q1, q4
++ vand d4, d5
++ vand d2, d3
++ vbic d0, d12, d4
++ vshr.u16 d26, #2
++ vbic d0, d2
++ vmov.i16 d1, #0x5555
++ vorr d0, d26
++ bne 10f
++
++ @ Merge results into result word, no duplicates
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ lsl a2, #30
++ lsl v8, #30
++ lsl ip, #30
++ lsl lr, #30
++ orr a2, ip, a2, lsr #2
++ orr v8, lr, v8, lsr #2
++ orr a2, v8, a2, lsr #4
++ subs a1, #4
++ orr v7, a2, v7, lsr #8
++ bhi 1b
++
++ mov a1, #32
++ ldr a3, [sp, #6*8]
++ vpop {d8-d13}
++ sub a1, a1, a3, lsl #1
++ mov a1, v7, lsr a1
++ pop {a2-a4,v1-v8,pc}
++10:
++ @ Merge results into result word, with duplicates
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov v8, s1
++ vmov.u16 ip, d0[1]
++ vmov.u16 lr, d0[3]
++ lsl a2, v2
++ subs a1, #4
++ lsl v8, v2
++ lsl ip, v2
++ lsl lr, v2
++ ldr v2, [sp, #6*8 + 12*4 + 1*4]
++T lsr a2, v1
++T orr a2, ip, a2
++A orr a2, ip, a2, lsr v1
++ lsl ip, v1, #1
++T lsr v8, v1
++T orr v8, lr, v8
++A orr v8, lr, v8, lsr v1
++ lsl lr, v1, #2
++T lsr a2, ip
++T orr a2, v8, a2
++A orr a2, v8, a2, lsr ip
++ ldr v1, [sp, #6*8 + 12*4]
++T lsr v7, lr
++T orr v7, a2, v7
++A orr v7, a2, v7, lsr lr
++ bhi 1b
++
++ mov a1, #32
++ ldrd a3, a4, [sp, #6*8]
++ vpop {d8-d13}
++ mls a1, a3, a4, a1
++ mls a1, a3, a4, a1
++ mov a1, v7, lsr a1
++ pop {a2-a4,v1-v8,pc}
++
++
++2:
++ sub v5, v5, #10
++ sub v6, v6, #10
++ vmov.u8 d16, #0
++ blo 3f
++ vld2.32 {d0[0], d1[0]}, [a3]!
++ vld2.32 {d2[0], d3[0]}, [a4]!
++ ldrb a2, [a3], #1
++ ldrb ip, [a4], #1
++ ldrb lr, [a4], #1
++ ldrb v8, [a3], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[0]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[4]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[0]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[0]}, [ip]
++ vld1.32 {d6[0]}, [v8]
++ vld1.32 {d7[0]}, [lr]
++
++3:
++ vld2.32 {d0[1], d1[1]}, [a3]!
++ vld2.32 {d2[1], d3[1]}, [a4]!
++ ldrb a2, [a3], #1
++ vmov.u16 d17, #1
++ ldrb ip, [a4], #1
++ vmov.u16 d18, #2
++ ldrb v8, [a3], #1
++ vmov.u16 d19, #4
++ ldrb lr, [a4], #1
++ add a2, v1, a2, lsl #2
++ vld1.8 {d16[2]}, [a3], v5
++ add ip, v3, ip, lsl #2
++ vld1.8 {d16[6]}, [a4], v6
++ add v8, v2, v8, lsl #2
++ vld1.32 {d4[1]}, [a2]
++ add lr, v4, lr, lsl #2
++ vld1.32 {d5[1]}, [ip]
++ vld1.32 {d6[1]}, [v8]
++ vld1.32 {d7[1]}, [lr]
++
++ @ So now we have:
++ @ d0.32[i] = curr[i].mv[0]
++ @ d1.32[i] = curr[i].mv[1]
++ @ d2.32[i] = neigh[i].mv[0]
++ @ d3.32[i] = neigh[i].mv[1]
++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++ @ d16.16[i] = curr[i].pred_flag
++ @ d16.16[2+i] = neigh[i].pred_flag
++
++ vtst.16 d20, d16, d17
++ vtst.16 d22, d16, d18
++ vadd.i16 d30, d16, d17
++ vswp d2, d3
++ ldr lr, [sp, #1*4]
++ vmovl.s16 q10, d20
++ teq lr, #1
++ vmovl.s16 q11, d22
++ it ne
++ lslne v1, lr, #1
++ vbif d0, d1, d20
++ vbif d4, d6, d20
++ vbif d3, d2, d21
++ vbif d5, d7, d21
++ vbif d1, d0, d22
++ vbif d6, d4, d22
++ vbif d2, d3, d23
++ vbif d7, d5, d23
++ vshr.u16 d30, #2
++ vabd.s16 d24, d0, d3
++ vabd.s16 d25, d1, d2
++ vabd.s16 q0, q0, q1
++ vceq.i32 d2, d4, d5
++ vceq.i32 d20, d5, d6
++ vceq.i32 d21, d4, d7
++ vceq.i32 d3, d6, d7
++ vclt.u16 d6, d24, d19
++ vclt.u16 d7, d25, d19
++ vclt.u16 d22, d1, d19
++ vclt.u16 d23, d0, d19
++ vshrn.i32 d6, q3, #8
++ vmovn.i32 d2, q1
++ vshrn.i32 d7, q11, #8
++ vmovn.i32 d3, q10
++ vand q0, q3, q1
++ it ne
++ rsbne v2, v1, #32
++ vrev16.8 q3, q3
++ vand q0, q3
++ vsra.u64 d30, #32
++ vshr.u64 q1, q0, #32
++ vand q0, q1
++ vbic d0, d17, d0
++ vand d30, d30, d17
++ vbic d0, d1
++ vmov.i16 d1, #0x5555
++ vorr d0, d30
++ bne 10f
++
++ @ Construct result word, no duplicates
++ cmp a1, #2
++ vmov.u16 a1, d0[1]
++ vmov.u16 a2, d0[0]
++ it eq
++ orreq a1, a2, a1, lsl #2
++ pop {a2-a4,v1-v8,pc}
++10:
++ @ Construct result word, with duplicates
++ cmp a1, #2
++ vmul.i16 d0, d1
++ vmov a2, s0
++ vmov.u16 a1, d0[1]
++ lsl a2, #16
++ pkhbt a1, a1, a1, lsl #16
++ lsr a2, v2
++ lsr a1, v2
++T itt eq
++T lsleq a1, v1
++T orreq a1, a2, a1
++A orreq a1, a2, a1, lsl v1
++ pop {a2-a4,v1-v8,pc}
++endfunc
++
++
++
++#else // non-NEON version
++
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ * int in_inc0, int in_inc1)
++ */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++ add ip, sp, #4*4 @ ip = just past the four rpl pointer arguments on the stack
++ push {a2-a4,v1-v8,lr} @ 12 words; dup (a2) ends up at [sp]
++ mov v6, #32 @ v6 = bits of v7 not yet filled, drops by 2 per result
++1: ldmdb ip, {v1-v4} @ v1-v4 = curr_rpl0, curr_rpl1, neigh_rpl0, neigh_rpl1
++ ldrsb v5, [a3, #8] @ curr->ref_idx
++ ldrsb v8, [a3, #9]
++ ldrsb ip, [a4, #8] @ neigh->ref_idx
++ ldrsb lr, [a4, #9]
++ ldr v1, [v1, v5, lsl #2] @ v1 = curr_rpl0[curr->ref_idx[0]]
++ ldrb v5, [a3, #10] @ curr->pred_flag
++ ldr v2, [v2, v8, lsl #2] @ v2 = curr_rpl1[curr->ref_idx[1]]
++ ldrb v8, [a4, #10] @ neigh->pred_flag
++ ldr v3, [v3, ip, lsl #2] @ v3 = neigh_rpl0[neigh->ref_idx[0]]
++ ldr v4, [v4, lr, lsl #2] @ v4 = neigh_rpl1[neigh->ref_idx[1]]
++ teq v5, #3
++ beq 20f @ curr pred_flag == 3: two-vector comparison at 20
++ teq v8, #3
++ beq 90f @ only neigh has pred_flag == 3: result is 1
++
++ tst v5, #1
++ itee ne
++ ldrne v5, [a3, #0] @ curr->mv[0]
++ moveq v1, v2 @ pred_flag b0 clear: compare the list 1 reference instead
++ ldreq v5, [a3, #4] @ curr->mv[1]
++ tst v8, #1
++ itee ne
++ ldrne v8, [a4, #0] @ neigh->mv[0]
++ moveq v3, v4 @ pred_flag b0 clear: compare the list 1 reference instead
++ ldreq v8, [a4, #4] @ neigh->mv[1]
++ teq v1, v3
++ bne 10f @ different reference entries: result is 1
++ ldr lr, =0xFFFCFFFC @ mask of everything but the low 2 bits of each 16-bit half
++ ssub16 ip, v8, v5 @ ip = neigh - curr per 16-bit half
++ ssub16 v5, v5, v8 @ v5 = curr - neigh per half, sets the GE flags per half
++ sel v5, v5, ip @ per half keep the non-negative difference, i.e. the absolute difference
++ ands v5, v5, lr @ NE iff either absolute difference is 4 or more
++ @ drop through
++10: it ne
++ movne v5, #1<<30 @ NE on arrival: result 1 in the top two bits (otherwise v5 is already 0)
++11:
++ sub v6, v6, #2
++T mov v7, v7, lsr #2
++ subs a2, a2, #1 @ a2 counts the repeats of this result (starts at dup)
++A orr v7, v5, v7, lsr #2
++T orr v7, v5, v7
++ bhi 11b
++
++ ldrd v3, v4, [sp, #16*4] @ v3, v4 = in_inc0, in_inc1 (above the 12 pushed words and 4 rpl pointers)
++ ldr a2, [sp] @ a2 = dup again for the next result
++ add ip, sp, #16*4 @ ip back to just past the rpl pointers for the ldmdb at 1:
++ subs a1, a1, #1 @ one fewer pus to go
++ add a3, a3, v3 @ curr advances by in_inc0 bytes
++ add a4, a4, v4 @ neigh advances by in_inc1 bytes
++ bhi 1b
++ mov a1, v7, lsr v6 @ shift the collected 2-bit results down to bit 0
++ pop {a2-a4,v1-v8,pc}
++
++20: teq v8, #3
++ bne 10b @ neigh pred_flag != 3: result is 1 (NE is still set at 10)
++
++ teq v1, v3
++ it eq
++ teqeq v2, v4
++ bne 40f @ references do not match straight across: try the crossed pairing
++ teq v1, v2
++ bne 30f @ straight match only
++
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ ssub16 ip, v3, v1 @ abs diff of curr mv[0] against neigh mv[0]
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 25f @ straight pairing fails: try the crossed pairing
++ ssub16 ip, v4, v2 @ abs diff of curr mv[1] against neigh mv[1]
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ beq 11b @ both straight differences are small: result 0 (v5 == 0)
++ @ drop through
++25: ssub16 ip, v4, v1 @ abs diff of curr mv[0] against neigh mv[1]
++ ssub16 v5, v1, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 10b
++ ssub16 ip, v3, v2 @ abs diff of curr mv[1] against neigh mv[0]
++ ssub16 v5, v2, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ b 10b @ flags from the ands decide between 0 and 1 at 10
++
++30: ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ ssub16 ip, v3, v1 @ abs diff of curr mv[0] against neigh mv[0]
++ ssub16 v5, v1, v3
++ sel v5, v5, ip
++ ands v5, v5, lr
++ bne 10b
++ ssub16 ip, v4, v2 @ abs diff of curr mv[1] against neigh mv[1]
++ ssub16 v5, v2, v4
++ sel v5, v5, ip
++ ands v5, v5, lr
++ b 10b @ flags from the ands decide between 0 and 1 at 10
++
++40: teq v1, v4
++ ite eq
++ teqeq v2, v3
++ bne 10b @ crossed references differ as well: result is 1
++
++ ldrd v1, v2, [a3] @ curr->mv
++ ldrd v3, v4, [a4] @ neigh->mv
++ ldr lr, =0xFFFCFFFC
++ b 25b @ crossed references match: only the crossed mv comparison applies
++
++90:
++ mov v5, #1<<30 @ result 1 in the top two bits
++ b 11b
++endfunc
++
++
++#endif
++
++
++@ =============================================================================
++@
++@ 10 bit
++
++function hevc_loop_filter_luma_body_10
++ m_filter_luma 10, q11, q15
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
++ hevc_loop_filter_luma_start
++ b .Lh_loop_luma_common_10 @ continue in the shared body inside the luma2 entry point below
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
++ cmp r3, #0 @ fourth argument (tc2) == 0: nothing to filter
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r10, [sp, #32] @ r10 = first stack argument (no_f), now above the 32 bytes pushed
++.Lh_loop_luma_common_10:
++ m_filter_h_luma_16 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
++ hevc_loop_filter_luma_start
++ sub r4, r0, #8 @ r4 = pix - 8 bytes; the luma2 entry point loads r4 from its _pix_l argument instead
++ b .Lv_loop_luma_common_10 @ continue in the shared body inside the luma2 entry point below
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
++ cmp r3, #0 @ fourth argument (tc2) == 0: nothing to filter
++ it eq
++ bxeq lr
++ push {r4-r10,lr} @ 32 bytes
++ ldr r4, [sp, #36] @ r4 = second stack argument (_pix_l)
++ ldr r10, [sp, #32] @ r10 = first stack argument (no_f)
++
++.Lv_loop_luma_common_10:
++ m_filter_v_luma_16 10
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
++ m_filter_h_uv_16 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
++ m_filter_v_uv2_16 10
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
+@@ -0,0 +1,184 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++/* uses registers q8 - q13 for temp values */
++.macro tr4_luma_shift shift
++ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 (d28..d31 hold src0..src3 on entry)
++ vaddl.s16 q9, d30, d31 // c1 = src2 + src3
++ vsubl.s16 q10, d28, d31 // c2 = src0 - src3
++ vaddl.s16 q11, d28, d31 // src0 + src3
++
++ vmul.i32 q12, q8, d1[0] // 29 * c0
++ vmul.i32 q13, q10, d2[0] // 55 * c2
++ vmul.i32 q8, q8, d2[0] // 55 * c0
++ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1
++
++ vsubw.s16 q11, q11, d30 // src0 - src2 + src3
++ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1
++ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1
++ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2
++
++ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
++ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3
++ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3
++ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3
++
++ vqrshrn.s32 d28, q12, \shift // dst0: rounding shift right, saturated to 16 bits
++ vqrshrn.s32 d29, q13, \shift // dst1
++ vqrshrn.s32 d30, q11, \shift // dst2
++ vqrshrn.s32 d31, q8, \shift // dst3
++.endm
++
++/* uses registers q8 - q11 for temp values */
++.macro tr4_shift shift
++ vmull.s16 q9, d29, d0[0] // 83 * src1 (d28..d31 hold src0..src3 on entry)
++ vmull.s16 q8, d29, d0[1] // 36 * src1
++ vshll.s16 q14, d28, #6 // 64 * src0
++ vshll.s16 q10, d30, #6 // 64 * src2
++ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0
++ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1
++ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0
++ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1
++ vadd.s32 q14, q11, q9 // e0 + o0
++ vadd.s32 q15, q10, q8 // e1 + o1
++ vsub.s32 q8, q10, q8 // e1 - o1
++ vsub.s32 q9, q11, q9 // e0 - o0
++
++ vqrshrn.s32 d28, q14, \shift // e0 + o0: rounding shift right, saturated to 16 bits
++ vqrshrn.s32 d29, q15, \shift // e1 + o1
++ vqrshrn.s32 d30, q8, \shift // e1 - o1
++ vqrshrn.s32 d31, q9, \shift // e0 - o0
++.endm
++
++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \
++ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
++ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \
++ shift, I1, I2, I3
++
++ vmull.s16 q4, \d1, d1[1] // 89 * src1
++ \I1
++ vmull.s16 q5, \d1, d1[0] // 75 * src1
++ \I2
++ vmull.s16 q6, \d1, d1[3] // 50 * src1
++ \I3
++ vmull.s16 q7, \d1, d1[2] // 18 * src1
++ vmlal.s16 q4, \d3, d1[0] // 75 * src3
++ vmlsl.s16 q5, \d3, d1[2] //-18 * src3
++ vmlsl.s16 q6, \d3, d1[1] //-89 * src3
++ vmlsl.s16 q7, \d3, d1[3] //-50 * src3
++
++ // tr4
++ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2)
++ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2)
++
++ vmlal.s16 q4, \d5, d1[3] // 50 * src5
++ vmlsl.s16 q5, \d5, d1[1] //-89 * src5
++ vmlal.s16 q6, \d5, d1[2] // 18 * src5
++ vmlal.s16 q7, \d5, d1[0] // 75 * src5
++
++ vshll.s16 q3, \d0, #6 // 64 * src(0*2)
++ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2)
++ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0
++ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1
++ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0
++ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1
++
++ vmlal.s16 q4, \d7, d1[2] // 18 * src7
++ vmlsl.s16 q5, \d7, d1[3] //-50 * src7
++ vmlal.s16 q6, \d7, d1[0] // 75 * src7
++ vmlsl.s16 q7, \d7, d1[1] //-89 * src7
++
++ vsub.i32 q3, \tmp1, q1 // e0 - o0
++ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0
++ vadd.i32 q1, \tmp0, q2 // e1 + o1
++ vsub.i32 q2, \tmp0, q2 // e1 - o1
++
++ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0]
++ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7]
++ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4]
++ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3]
++ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1]
++ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6]
++ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5]
++ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2]
++ vqrshrn.s32 \d0, \tmp0, #\shift // dst[0]: rounding shift right, saturated to 16 bits
++ vqrshrn.s32 \d4, \tmp1, #\shift // dst[4]
++ vqrshrn.s32 \d1, q3, #\shift // dst[1]
++ vqrshrn.s32 \d5, q1, #\shift // dst[5]
++ vqrshrn.s32 \d2, q6, #\shift // dst[2]
++ vqrshrn.s32 \d6, q5, #\shift // dst[6]
++ vqrshrn.s32 \d3, q7, #\shift // dst[3]
++ vqrshrn.s32 \d7, q4, #\shift // dst[7]
++.endm
++
++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
++ vld1.16 {\d0}, [r0 :64], r3 // even-numbered inputs come through r0, stepping by r3
++ vld1.16 {\d1}, [r2 :64], r3 // odd-numbered inputs come through r2, stepping by r3
++ vld1.16 {\d2}, [r0 :64], r3
++ vld1.16 {\d3}, [r2 :64], r3
++ vld1.16 {\d4}, [r0 :64], r3
++ vld1.16 {\d5}, [r2 :64], r3
++ vld1.16 {\d6}, [r0 :64], r3
++ vld1.16 {\d7}, [r2 :64], r3 // all eight inputs loaded; transform below uses a fixed shift of 7
++
++ tr8_process \
++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++ \q01, \q23, 7, "\I1", "\I2", "\I3"
++.endm
++
++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
++ tr8_process \
++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++ \q01, \q23, \shift
++
++ vzip.16 \d0, \d4 // interleave the 16-bit lanes of result k with result k + 4
++ vzip.16 \d1, \d5
++ vzip.16 \d2, \d6
++ vzip.16 \d3, \d7
++ vst4.16 {\d0-\d3}, [r0 :128], r3 // 4-way interleaved store of the first four registers, r0 steps by r3
++ vst4.16 {\d4-\d7}, [r2 :128], r3 // same for the last four through r2
++.endm
++
++#define BIT_DEPTH 8
++#include "rpi_hevc_idct_fn_neon.S"
++
++.text
++
++.align 4
++tr4f:
++.word 0x00240053 // d0[1] = 36, d0[0] = 83 (low halfword first; lanes as read by tr4_shift and tr8_process)
++.word 0x00000000
++tr8f:
++.word 0x0059004b // d1[1] = 89, d1[0] = 75 (lanes as read by tr8_process)
++.word 0x00320012 // d1[3] = 50, d1[2] = 18
++tr16:
++.word 0x005a0057 // 90, d2[0] = 87
++.word 0x00500046 // 80, d2[2] = 70
++.word 0x0039002b // 57, 43 (this comment used to repeat d2[0]; presumably d3[0] - confirm against the code that loads tr16)
++.word 0x00190009 // 25, 9 (this comment used to repeat d2[2]; presumably d3[2] - confirm)
++
++#undef BIT_DEPTH
++#define BIT_DEPTH 10
++#include "rpi_hevc_idct_fn_neon.S"
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
+@@ -0,0 +1,32 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++
++/* Hook up the NEON HEVC DSP routines, but only when the running CPU reports NEON. */
++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
++{
++ if (!have_neon(av_get_cpu_flags()))
++ return;
++ ff_hevcdsp_rpi_init_neon(c, bit_depth);
++}
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
+@@ -0,0 +1,467 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/bit_depth_template.c"
++
++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
++// have been removed from head as we never use them.
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
++ unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
++ unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f);
++
++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
++
++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
++
++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
++ ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
++ ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ int in_inc0, int in_inc1);
++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
++
++
++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter (8-bit build) composed from the 32- and 16-wide NEON kernels; the width argument is not read */
++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first part, fixed width 32 */
++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); /* remainder, starting 32 bytes further along each row */
++}
++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter (10-bit build) composed from the 32- and 16-wide NEON kernels; the width argument is not read */
++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first part, fixed width 32 */
++ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); /* remainder at byte offset 64; presumably 32 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter (8-bit build) composed from the 32- and 16-wide NEON kernels; the width argument is not read */
++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first part, fixed width 32 */
++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* remainder, starting 32 bytes further along each row */
++}
++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter (10-bit build) composed from the 32- and 16-wide NEON kernels; the width argument is not read */
++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first part, fixed width 32 */
++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* remainder at byte offset 64; presumably 32 samples of 2 bytes each - confirm */
++}
++
++#if SAO_FILTER_N == 6
++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter (8-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide kernels, width is not read */
++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); /* remainder, starting 16 bytes further along each row */
++}
++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter (10-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide kernels, width is not read */
++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); /* remainder at byte offset 32; presumably 16 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter (8-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide kernels, width is not read */
++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* remainder, starting 16 bytes further along each row */
++}
++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter (10-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide kernels, width is not read */
++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* remainder at byte offset 32; presumably 16 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter (8-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide chroma kernels, width is not read */
++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* remainder at byte offset 32; presumably 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++ int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter (10-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide chroma kernels, width is not read */
++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* remainder at byte offset 64; presumably 16 interleaved U/V pairs of 2-byte samples - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{ /* 24-wide chroma SAO band filter (8-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide chroma kernels, width is not read */
++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* remainder at byte offset 32; presumably 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{ /* 24-wide chroma SAO band filter (10-bit build), only compiled when SAO_FILTER_N == 6; built from the 16- and 8-wide chroma kernels, width is not read */
++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first part, fixed width 16 */
++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* remainder at byte offset 64; presumably 16 interleaved U/V pairs of 2-byte samples - confirm */
++}
++#endif
++
++
++
++#if RPI_HEVC_SAO_BUF_STRIDE != 160 /* build-time guard: fail the compile if the C-side stride constant stops being 160 */
++#error SAO edge src stride not 160 - value used in .S
++#endif /* RPI_HEVC_SAO_BUF_STRIDE != 160 */
++
++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
++{ /* Fill the function-pointer slots of c with the NEON routines for bit_depth 8 or 10; any other depth only gets the two depth-independent slots set at the end */
++ if (bit_depth == 8) {
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; /* the _c slot gets the same routine as the plain slot */
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; /* the _c slot gets the same routine as the plain slot */
++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8;
++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; /* only idct[0] and idct[1] are assigned in this function */
++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8;
++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8;
++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8;
++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8;
++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; /* indices 0..3 map to 4x4, 8x8, 16x16, 32x32 */
++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8;
++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8;
++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8;
++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; /* the u/v/c/dc_c tables only have 4x4, 8x8 and 16x16 entries */
++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8;
++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8;
++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8;
++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8;
++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8;
++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8;
++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8;
++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8;
++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8;
++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; /* indices 0..4 map to widths 8, 16, 32, 48, 64 */
++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8;
++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8;
++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8;
++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8;
++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8;
++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8;
++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8;
++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8;
++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8;
++#if SAO_FILTER_N == 6 /* optional sixth slot holds the 24-wide variants */
++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8;
++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8;
++#endif
++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8;
++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8;
++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; /* chroma slots 3 and 4 are not assigned here */
++
++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8;
++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8;
++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; /* chroma slots 3 and 4 are not assigned here */
++
++#if SAO_FILTER_N == 6 /* optional sixth chroma slot holds the 24-wide variants */
++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8;
++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8;
++#endif
++ }
++ else if (bit_depth == 10) { /* same table as the 8-bit branch, using the _10 routines */
++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10;
++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; /* the _c slot gets the same routine as the plain slot */
++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10;
++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; /* the _c slot gets the same routine as the plain slot */
++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10;
++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10;
++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; /* only idct[0] and idct[1] are assigned in this function */
++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10;
++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10;
++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10;
++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10;
++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10;
++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10;
++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10;
++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10;
++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10;
++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10;
++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10;
++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10;
++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10;
++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10;
++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10;
++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10;
++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10;
++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10;
++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10;
++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10;
++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10;
++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10;
++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10;
++
++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10;
++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10;
++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10;
++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10;
++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10;
++#if SAO_FILTER_N == 6 /* optional sixth slot holds the 24-wide variants */
++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10;
++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10;
++#endif
++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10;
++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10;
++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; /* chroma slots 3 and 4 are not assigned here */
++
++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10;
++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10;
++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; /* chroma slots 3 and 4 are not assigned here */
++
++#if SAO_FILTER_N == 6 /* optional sixth chroma slot holds the 24-wide variants */
++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10;
++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10;
++#endif
++ }
++
++ assert(offsetof(HEVCRpiMvField, mv) == 0); /* layout checks; presumably these offsets are hard-coded in the NEON boundary-strength routine - confirm against the .S */
++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); /* assert() compiles to nothing when NDEBUG is defined */
++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; /* set for every bit depth */
++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; /* set for every bit depth */
++}
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
+@@ -0,0 +1,620 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++#define BIT_DEPTH 10
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX @ clamp the signed 16-bit lanes of four q registers to the range Q_MIN..Q_MAX, in place
++ vmax.s16 \Q0, \Q_MIN @ raise anything below Q_MIN
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX @ lower anything above Q_MAX
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ add_residual4x4( - adds a 4x4 block of residuals to dst and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1]
++@ ptrdiff_t stride) [r2] - added directly to the dst address, i.e. a byte count
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q10, q11}, [r1] @ all 16 residuals
++ lsl r2, #1 @ r2 = 2 * stride, so r0 steps rows 0, 2 and ip steps rows 1, 3
++ vld1.16 {d0}, [r0 :64], r2 @ row 0
++ vld1.16 {d1}, [ip :64], r2 @ row 1
++ vld1.16 {d2}, [r0 :64] @ row 2
++ vld1.16 {d3}, [ip :64] @ row 3
++ sub r0, r2 @ back to row 0 for the stores
++ vqadd.s16 q0, q10 @ saturating add, rows 0 and 1
++ sub ip, r2 @ back to row 1 for the stores
++ vqadd.s16 q1, q11 @ saturating add, rows 2 and 3
++ vmov.i16 q8, #0 @ lower clip bound
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0 :64], r2 @ rows written back in the order they were loaded
++ vst1.16 {d1}, [ip :64], r2
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [ip :64]
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc( - adds one dc value to every sample of a 4x4 block and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1] - added directly to the dst address, i.e. a byte count
++@ int dc) [r2] - only the low 16 bits are used (vdup.16)
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
++ add ip, r0, r1 @ ip = row 1
++ vdup.16 q15, r2 @ dc in all eight 16-bit lanes
++ lsl r1, #1 @ r1 = 2 * stride, so r0 steps rows 0, 2 and ip steps rows 1, 3
++ vld1.16 {d0}, [r0 :64], r1 @ row 0
++ vld1.16 {d1}, [ip :64], r1 @ row 1
++ vld1.16 {d2}, [r0 :64] @ row 2
++ vld1.16 {d3}, [ip :64] @ row 3
++ sub r0, r1 @ back to row 0 for the stores
++ vqadd.s16 q0, q15 @ saturating add, rows 0 and 1
++ sub ip, r1 @ back to row 1 for the stores
++ vqadd.s16 q1, q15 @ saturating add, rows 2 and 3
++ vmov.i16 q8, #0 @ lower clip bound
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ vmax.s16 q0, q0, q8
++ vmax.s16 q1, q1, q8
++ vmin.s16 q0, q0, q9
++ vmin.s16 q1, q1, q9
++ vst1.16 {d0}, [r0 :64], r1 @ rows written back in the order they were loaded
++ vst1.16 {d1}, [ip :64], r1
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [ip :64]
++ bx lr
++
++endfunc
++
++
++@ add_residual8x8( - adds an 8x8 block of residuals to dst and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1] - read sequentially, 64 bytes per loop pass
++@ ptrdiff_t stride) [r2] - added directly to the dst address, i.e. a byte count
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
++ mov r3, #8 @ rows remaining
++ vmov.i64 q8, #0 @ lower clip bound (all-zero register)
++ add ip, r0, r2 @ ip = row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r2, #1 @ r2 = 2 * stride
++1:
++ vldm r1!, {q10-q13} @ residuals for the next four rows
++ vld1.16 {q0}, [r0 :128], r2 @ row n
++ vld1.16 {q1}, [ip :128], r2 @ row n + 1
++ vld1.16 {q2}, [r0 :128] @ row n + 2
++ vld1.16 {q3}, [ip :128] @ row n + 3
++ sub r0, r2 @ back to row n for the stores
++ vqadd.s16 q0, q10
++ sub ip, r2 @ back to row n + 1 for the stores
++ vqadd.s16 q1, q11
++ subs r3, #4 @ four rows per pass; flags are consumed by the bne below
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0}, [r0 :128], r2
++ vst1.16 {q1}, [ip :128], r2
++ vst1.16 {q2}, [r0 :128], r2 @ post-increment leaves r0 at row n + 4
++ vst1.16 {q3}, [ip :128], r2 @ post-increment leaves ip at row n + 5
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual4x4_dc_c( - entry stub: sets up r3 and q15, then reuses the loop of the 8x8_dc function that follows
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2] - all 32 bits are used; presumably two packed 16-bit dc values (U low, V high) - confirm
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #4 @ row count: the shared loop handles four rows per pass, so it runs once
++ vdup.32 q15, r2 @ 32-bit value in all four word lanes; as 16-bit lanes: low half in even lanes, high half in odd lanes
++ b 9f @ label 9 is inside the next function
++endfunc
++
++@ add_residual8x8_dc( - adds one dc value to every sample of an 8x8 block and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1] - added directly to the dst address, i.e. a byte count
++@ int dc) [r2] - only the low 16 bits are used (vdup.16)
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r2 @ dc in all eight 16-bit lanes
++ mov r3, #8 @ rows remaining
++9: @ shared entry: the 4x4_dc_c stub above branches here with r3 = 4 and q15 already loaded
++ vmov.i16 q8, #0 @ lower clip bound
++ add ip, r0, r1 @ ip = row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r1, #1 @ r1 = 2 * stride
++1:
++ vld1.16 {q0}, [r0 :128], r1 @ row n
++ vld1.16 {q1}, [ip :128], r1 @ row n + 1
++ vld1.16 {q2}, [r0 :128] @ row n + 2
++ vld1.16 {q3}, [ip :128] @ row n + 3
++ sub r0, r1 @ back to row n for the stores
++ vqadd.s16 q0, q15
++ sub ip, r1 @ back to row n + 1 for the stores
++ vqadd.s16 q1, q15
++ subs r3, #4 @ four rows per pass; flags are consumed by the bne below
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0}, [r0 :128], r1
++ vst1.16 {q1}, [ip :128], r1
++ vst1.16 {q2}, [r0 :128], r1 @ post-increment leaves r0 at row n + 4
++ vst1.16 {q3}, [ip :128], r1 @ post-increment leaves ip at row n + 5
++ bne 1b
++ bx lr
++
++endfunc
++
++@ add_residual16x16( - adds a 16x16 block of residuals to dst and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1] - read sequentially, 64 bytes per loop pass
++@ ptrdiff_t stride) [r2] - added directly to the dst address, i.e. a byte count
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
++ add ip, r0, r2 @ ip = row 1
++ vmov.i16 q8, #0 @ lower clip bound
++ lsl r2, #1 @ r2 = 2 * stride: r0 walks even rows, ip odd rows
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ mov r3, #16 @ rows remaining
++1:
++ vldm r1!, {q10-q13} @ residuals for two rows
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0 :128] @ even row (32 bytes)
++ subs r3, #2 @ two rows per pass; flags are consumed by the bne below
++ vld1.16 {q2, q3}, [ip :128] @ odd row (32 bytes)
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q11
++ vqadd.s16 q2, q12
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0 :128], r2
++ vst1.16 {q2, q3}, [ip :128], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual8x8_dc_c( - entry stub: sets up r3 and q15, then reuses the loop of the 16x16_dc function that follows
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2] - all 32 bits are used; presumably two packed 16-bit dc values (U low, V high) - confirm
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #8 @ row count for the shared loop (two rows per pass)
++ vdup.32 q15, r2 @ 32-bit value in all four word lanes; as 16-bit lanes: low half in even lanes, high half in odd lanes
++ b 9f @ label 9 is inside the next function
++endfunc
++
++@ add_residual16x16_dc( - adds one dc value to every sample of a 16x16 block and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1] - added directly to the dst address, i.e. a byte count
++@ int dc) [r2] - only the low 16 bits are used (vdup.16)
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r2 @ dc in all eight 16-bit lanes (canonical size suffix, as in the sibling dc routines)
++ mov r3, #16 @ rows remaining
++9: @ shared entry: the 8x8_dc_c stub above branches here with r3 = 8 and q15 already loaded
++ vmov.i16 q8, #0 @ lower clip bound
++ add ip, r0, r1 @ ip = row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r1, #1 @ r1 = 2 * stride: r0 walks even rows, ip odd rows
++1:
++ @ For RPI Sand we could guarantee :256 but not for general
++ @ non-RPI allocation. :128 is as good as we can claim
++ vld1.16 {q0, q1}, [r0 :128] @ even row (32 bytes)
++ subs r3, #2 @ two rows per pass; flags are consumed by the bne below
++ vqadd.s16 q0, q15
++ vqadd.s16 q1, q15
++ vld1.16 {q2, q3}, [ip :128] @ odd row (32 bytes)
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0, q1}, [r0 :128], r1
++ vst1.16 {q2, q3}, [ip :128], r1
++ bne 1b
++ bx lr
++
++endfunc
++
++
++@ add_residual32x32( - adds a 32x32 block of residuals to dst and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ int16_t *res, [r1] - read sequentially, 64 bytes per loop pass
++@ ptrdiff_t stride) [r2] - added directly to the dst address, i.e. a byte count
++
++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the prefetch pointer
++ mov r3, #32 @ rows remaining
++ vmov.i16 q8, #0 @ lower clip bound
++ add lr, r0, r2 @ lr = next row, used only for pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ add ip, r0, #32 @ ip = second 32 bytes of the current row
++1:
++ vldm r1!, {q10-q13} @ residuals for one row
++ vldm r0, {q0-q3} @ one full dst row (64 bytes)
++ vqadd.s16 q0, q10
++ pldw [lr] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q11
++ add lr, r2 @ advance the prefetch pointer one row
++ vqadd.s16 q2, q12
++ subs r3, #1 @ one row per pass; flags are consumed by the bne below
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0-q1}, [r0], r2 @ first half of the row, then step to the next row
++ vst1.16 {q2-q3}, [ip], r2 @ second half of the row, then step to the next row
++ bne 1b
++ pop {pc} @ return by popping the saved lr
++
++endfunc
++
++@ add_residual16x16_dc_c( - entry stub: sets up r3 and q15, then reuses the loop of the 32x32_dc function that follows
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1]
++@ int dc_uv) [r2] - all 32 bits are used; presumably two packed 16-bit dc values (U low, V high) - confirm
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
++ mov r3, #16 @ row count for the shared loop (one row per pass)
++ vdup.32 q15, r2 @ 32-bit value in all four word lanes; as 16-bit lanes: low half in even lanes, high half in odd lanes
++ b 9f @ label 9 is inside the next function
++endfunc
++
++@ add_residual32x32_dc( - adds one dc value to every sample of a 32x32 block and clips to 0..(1 << BIT_DEPTH) - 1
++@ uint16_t *_dst, [r0]
++@ ptrdiff_t stride, [r1] - used as the post-increment of the dst address, i.e. a byte count
++@ int dc) [r2] - only the low 16 bits are used (vdup.16)
++
++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r2 @ dc in all eight 16-bit lanes
++ mov r3, #32 @ rows remaining
++9: @ shared entry: the 16x16_dc_c stub above branches here with r3 = 16 and q15 already loaded
++ vmov.i16 q8, #0 @ lower clip bound
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ add ip, r0, #32 @ ip = second 32 bytes of the current row
++1:
++ vldm r0, {q0-q3} @ one full dst row (64 bytes)
++ vqadd.s16 q0, q15
++ subs r3, #1 @ one row per pass; flags are consumed by the bne below
++ vqadd.s16 q1, q15
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q15
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst1.16 {q0-q1}, [r0], r1 @ first half of the row, then step to the next row
++ vst1.16 {q2-q3}, [ip], r1 @ second half of the row, then step to the next row
++ bne 1b
++ bx lr
++
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u( - interleaved dst: residuals go to the even 16-bit elements, a constant dc to the odd ones
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1] - 16 residuals, 32 bytes, loaded with a :256 alignment hint
++@ ptrdiff_t stride, [r2] - added directly to the dst address, i.e. a byte count
++@ int dc) [r3] - named dc_v in the C prototype; added to the odd elements (presumably V - confirm)
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc in all eight 16-bit lanes
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q10, q11}, [r1 :256] @ q10 = residual rows 0-1, q11 = residual rows 2-3
++ lsl r2, #1 @ r2 = 2 * stride
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: even elements to d0, odd to d2
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1: even to d1, odd to d3
++ vld2.16 {d4, d6}, [r0 :128] @ row 2: even to d4, odd to d6
++ vld2.16 {d5, d7}, [ip :128] @ row 3: even to d5, odd to d7
++ sub r0, r2 @ back to row 0 for the stores
++ vmov.i16 q8, #0 @ lower clip bound
++ sub ip, r2 @ back to row 1 for the stores
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++
++ vqadd.s16 q0, q10 @ even elements, rows 0-1: add residuals
++ vqadd.s16 q1, q15 @ odd elements, rows 0-1: add dc
++ vqadd.s16 q2, q11 @ even elements, rows 2-3: add residuals
++ vqadd.s16 q3, q15 @ odd elements, rows 2-3: add dc
++ clip16_4 q0, q1, q2, q3, q8, q9
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and write back in load order
++ vst2.16 {d1, d3}, [ip :128], r2
++ vst2.16 {d4, d6}, [r0 :128]
++ vst2.16 {d5, d7}, [ip :128]
++ bx lr
++endfunc
++
++@ add_residual8x8_u( - interleaved dst: residuals go to the even 16-bit elements, a constant dc to the odd ones
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1] - read sequentially, 32 bytes per loop pass
++@ ptrdiff_t stride, [r2] - added directly to the dst address, i.e. a byte count
++@ int dc) [r3] - named dc_v in the C prototype; added to the odd elements (presumably V - confirm)
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
++ vdup.16 q15, r3 @ dc in all eight 16-bit lanes; r3 is then free
++ mov r3, #8 @ rows remaining
++ vmov.i16 q8, #0 @ lower clip bound
++ add ip, r0, r2 @ ip = row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r2, #1 @ r2 = 2 * stride: r0 walks even rows, ip odd rows
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ even row de-interleaved: even elements to q0, odd to q1
++ subs r3, #2 @ two rows per pass; flags are consumed by the bne below
++ vld2.16 {q2, q3}, [ip :256] @ odd row: even elements to q2, odd to q3
++ vld1.16 {q10, q11}, [r1 :256]! @ residuals for the two rows
++ vqadd.s16 q0, q10 @ even elements: add residuals
++ vqadd.s16 q1, q15 @ odd elements: add dc
++ vqadd.s16 q2, q11 @ even elements: add residuals
++ vqadd.s16 q3, q15 @ odd elements: add dc
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave and write back
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual16x16_u( - interleaved dst: residuals go to the even 16-bit elements, a constant dc to the odd ones
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1] - read sequentially, 32 bytes per loop pass
++@ ptrdiff_t stride, [r2] - used as the post-increment of the dst address, i.e. a byte count
++@ int dc) [r3] - named dc_v in the C prototype; added to the odd elements (presumably V - confirm)
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
++ push {lr} @ lr is reused below as the prefetch pointer
++ vdup.16 q15, r3 @ dc in all eight 16-bit lanes; r3 is then free
++ mov r3, #16 @ rows remaining
++ vmov.i16 q8, #0 @ lower clip bound
++ add lr, r0, r2 @ lr = next row, used only for pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ add ip, r0, #32 @ ip = second 32 bytes of the current row
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ first half of the row: even elements to q0, odd to q1
++ vld2.16 {q2, q3}, [ip :256] @ second half of the row: even elements to q2, odd to q3
++ vld1.16 {q10, q11}, [r1 :256]! @ residuals for one row
++ vqadd.s16 q0, q10 @ even elements: add residuals
++ pldw [lr] @ prefetch the next row with intent to write
++ vqadd.s16 q1, q15 @ odd elements: add dc
++ add lr, r2 @ advance the prefetch pointer one row
++ vqadd.s16 q2, q11 @ even elements: add residuals
++ subs r3, #1 @ one row per pass; flags are consumed by the bne below
++ vqadd.s16 q3, q15 @ odd elements: add dc
++ clip16_4 q0, q1, q2, q3, q8, q9
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave and write back, then step to the next row
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc} @ return by popping the saved lr
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 @ 4 rows of interleaved 16-bit chroma: dc added to even (U) samples, res to odd (V) samples, then clipped
++ vdup.16 q15, r3 @ q15 = dc in every 16-bit lane
++ add ip, r0, r2 @ ip = dst + stride (rows 1 and 3)
++ vld1.16 {q10, q11}, [r1 :256] @ all 16 residuals: q10 = rows 0-1, q11 = rows 2-3
++ lsl r2, #1 @ r0 and ip each step two rows
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: even samples to d0, odd to d2
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vld2.16 {d4, d6}, [r0 :128] @ row 2
++ vld2.16 {d5, d7}, [ip :128] @ row 3
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vmov.i16 q8, #0 @ lower clip bound
++ sub ip, r2 @ rewind ip to row 1
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++
++ vqadd.s16 q0, q15 @ even samples rows 0-1 += dc
++ vqadd.s16 q1, q10 @ odd samples rows 0-1 += residual
++ vqadd.s16 q2, q15 @ even samples rows 2-3 += dc
++ vqadd.s16 q3, q11 @ odd samples rows 2-3 += residual
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and store row 0
++ vst2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vst2.16 {d4, d6}, [r0 :128] @ row 2
++ vst2.16 {d5, d7}, [ip :128] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 @ 8 rows, two per pass: dc added to even (U) samples, res to odd (V) samples, then clipped
++ vdup.16 q15, r3 @ q15 = dc in every 16-bit lane
++ mov r3, #8 @ r3 is free now, reuse it as the row counter
++ vmov.i16 q8, #0 @ lower clip bound
++ add ip, r0, r2 @ ip = second row of each pair
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r2, #1 @ both pointers step two rows
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ first row of pair: even samples to q0, odd to q1
++ subs r3, #2 @ sets the flags tested by bne at the end of the loop
++ vld2.16 {q2, q3}, [ip :256] @ second row of pair
++ vld1.16 {q10, q11}, [r1 :256]! @ 8 residuals per row for both rows, res pointer advances
++ vqadd.s16 q0, q15 @ even += dc
++ vqadd.s16 q1, q10 @ odd += residual
++ vqadd.s16 q2, q15
++ vqadd.s16 q3, q11
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave, store, step to next pair
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ bx lr
++endfunc
++
++@ add_residual16x16_v(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc) [r3]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 @ 16 rows, one per pass: dc added to even (U) samples, res to odd (V) samples, then clipped
++ push {lr} @ lr is used below as a prefetch pointer
++ vdup.16 q15, r3 @ q15 = dc in every 16-bit lane
++ mov r3, #16 @ row counter
++ vmov.i16 q8, #0 @ lower clip bound
++ add lr, r0, r2 @ lr = next row, only used by pldw
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ first half of row: even samples to q0, odd to q1
++ vld2.16 {q2, q3}, [ip :256] @ second half of row
++ vld1.16 {q10, q11}, [r1 :256]! @ 16 residuals for this row
++ vqadd.s16 q0, q15 @ even += dc
++ pldw [lr] @ prefetch the next row for writing
++ vqadd.s16 q1, q10 @ odd += residual
++ add lr, r2
++ vqadd.s16 q2, q15
++ subs r3, #1 @ sets the flags tested by bne below
++ vqadd.s16 q3, q11
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave, store, step one row
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc} @ return via the saved lr
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 @ 4 rows of interleaved 16-bit chroma: first 16 residuals added to even (U) samples, next 16 to odd (V) samples, then clipped
++ vmov.i16 q8, #0 @ lower clip bound
++ add ip, r0, r2 @ ip = dst + stride (rows 1 and 3)
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ lsl r2, #1 @ r0 and ip each step two rows
++ vldm r1, {q10-q13} @ q10-q11 = residuals for even samples, q12-q13 = residuals for odd samples
++ vld2.16 {d0, d2}, [r0 :128], r2 @ row 0 de-interleaved: even samples to d0, odd to d2
++ vld2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vld2.16 {d4, d6}, [r0 :128] @ row 2
++ vld2.16 {d5, d7}, [ip :128] @ row 3
++
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vqadd.s16 q0, q10 @ even samples rows 0-1
++ sub ip, r2 @ rewind ip to row 1
++ vqadd.s16 q1, q12 @ odd samples rows 0-1
++ vqadd.s16 q2, q11 @ even samples rows 2-3
++ vqadd.s16 q3, q13 @ odd samples rows 2-3
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++
++ vst2.16 {d0, d2}, [r0 :128], r2 @ re-interleave and store row 0
++ vst2.16 {d1, d3}, [ip :128], r2 @ row 1
++ vst2.16 {d4, d6}, [r0 :128] @ row 2
++ vst2.16 {d5, d7}, [ip :128] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 @ 8 rows, two per pass: U residuals added to even samples, V residuals to odd samples, then clipped
++ push {lr} @ lr is used below as the row counter
++ add ip, r0, r2 @ ip = second row of each pair
++ lsl r2, #1 @ both pointers step two rows
++ vmov.i16 q8, #0 @ lower clip bound
++ add r3, r1, #(8*8*2) @ Offset to V
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ mov lr, #8 @ rows remaining
++1:
++ vld1.16 {q10, q11}, [r1 :256]! @ U residuals for two rows
++ subs lr, #2 @ sets the flags tested by bne below
++ vld2.16 {q0, q1}, [r0 :256] @ first row of pair: even samples to q0, odd to q1
++ vld2.16 {q2, q3}, [ip :256] @ second row of pair
++ vld1.16 {q12, q13}, [r3 :256]! @ V residuals for two rows
++ vqadd.s16 q0, q10 @ even += U residual
++ vqadd.s16 q1, q12 @ odd += V residual
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave, store, step to next pair
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {pc} @ return via the saved lr
++endfunc
++
++@ add_residual16x16_c(
++@ uint16_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 @ 16 rows, one per pass: U residuals added to even samples, V residuals to odd samples, then clipped
++ push {r4, lr} @ r4 = prefetch pointer, lr = row counter below
++ vmov.i16 q8, #0 @ lower clip bound
++ add r3, r1, #(16*16*2) @ Offset to V
++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 @ upper clip bound
++ add ip, r0, #32 @ ip = second 32-byte half of the current row
++ add r4, r0, r2 @ r4 = next row, only used by pldw
++ mov lr, #16 @ rows remaining
++1:
++ vld2.16 {q0, q1}, [r0 :256] @ first half of row: even samples to q0, odd to q1
++ vld2.16 {q2, q3}, [ip :256] @ second half of row
++ vld1.16 {q10, q11}, [r1 :256]! @ 16 U residuals for this row
++ vld1.16 {q12, q13}, [r3 :256]! @ 16 V residuals for this row
++ vqadd.s16 q0, q10 @ even += U residual
++ pldw [r4] @ prefetch the next row for writing
++ vqadd.s16 q1, q12 @ odd += V residual
++ add r4, r2
++ vqadd.s16 q2, q11
++ subs lr, #1 @ sets the flags tested by bne below
++ vqadd.s16 q3, q13
++ clip16_4 q0, q1, q2, q3, q8, q9 @ macro defined outside this excerpt, presumably clamps q0-q3 to the q8..q9 range
++ vst2.16 {q0, q1}, [r0 :256], r2 @ re-interleave, store, step one row
++ vst2.16 {q2, q3}, [ip :256], r2
++ bne 1b
++ pop {r4,pc}
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
+@@ -0,0 +1,741 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++@ General notes:
++@
++@ Residual is generally only guaranteed to be clipped to 16 bits.
++@ This means that we do need to do vmovl, vqadd, vqmovun
++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
++@ with this).
++@
++@ There is an exception for the DC case because its transform is guaranteed
++@ to be small enough that overflow cannot occur during the first add.
++
++@ ============================================================================
++@ Y add
++
++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 @ 4x4 bytes at r0 (row step r2) += 16 int16 residuals at r1, saturated to 8 bits
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q0, q1}, [r1] @ q0 = residual rows 0-1, q1 = rows 2-3
++ lsl r2, #1 @ two-row step
++ vld1.32 d4[0], [r0], r2 @ row 0 pixels, r0 moves to row 2
++ rsb r3, r2, #0 @ r3 = minus the two-row step, used to step back
++ vld1.32 d4[1], [ip], r2 @ row 1 pixels, ip moves to row 3
++ vld1.32 d5[0], [r0], r3 @ row 2 pixels, r0 back at row 0
++ vld1.32 d5[1], [ip], r3 @ row 3 pixels, ip back at row 1
++ vmovl.u8 q8, d4 @ widen pixels to 16 bits
++ vmovl.u8 q9, d5
++ vqadd.s16 q0, q8 @ saturating 16-bit add of residual and pixels
++ vqadd.s16 q1, q9
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation to 0..255
++ vqmovun.s16 d1, q1
++ vst1.32 d0[0], [r0], r2 @ store row 0
++ vst1.32 d0[1], [ip], r2 @ row 1
++ vst1.32 d1[0], [r0] @ row 2
++ vst1.32 d1[1], [ip] @ row 3
++ bx lr
++endfunc
++
++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 @ 8x8 bytes at r0 += int16 residuals at r1, two rows per pass, software pipelined
++ push {r4, lr}
++ vld1.16 {q0, q1}, [r1]! @ residuals for rows 0 and 1
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ vld1.8 {d6}, [r0] @ row 0 pixels
++ add r4, r0, r2, lsl #1 @ r4 = row 2 (load pointer for even rows, runs ahead of r0)
++ vld1.8 {d7}, [ip] @ row 1 pixels
++ add lr, ip, r2, lsl #1 @ lr = row 3 (load pointer for odd rows)
++ lsl r2, #1 @ two-row step
++ mov r3, #8-2 @ rows still to be loaded inside the loop
++ vmovl.u8 q2, d6 @ widen pixels to 16 bits
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0 @ saturating add of residual
++ vqadd.s16 q3, q1
++1:
++ vld1.16 {q0, q1}, [r1]! @ residuals for the next row pair
++ subs r3, #2 @ sets the flags tested by bne below
++ vqmovun.s16 d4, q2 @ narrow the previous pair to 0..255
++ vqmovun.s16 d5, q3
++ vld1.8 {d6}, [r4], r2 @ pixels of the next row pair
++ vld1.8 {d7}, [lr], r2
++ vst1.8 {d4}, [r0], r2 @ store the previous pair
++ vst1.8 {d5}, [ip], r2
++ vmovl.u8 q2, d6
++ pldw [r4] @ prefetch for writing the row r4 now points at
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0
++ vqadd.s16 q3, q1
++ bne 1b
++
++ vqmovun.s16 d4, q2 @ drain the pipeline: narrow and store the final row pair
++ vqmovun.s16 d5, q3
++ vst1.8 {d4}, [r0]
++ vst1.8 {d5}, [ip]
++ pop {r4, pc}
++endfunc
++
++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 @ 16x16 bytes at r0 += int16 residuals at r1, one row per pass, software pipelined
++ vld1.16 {q0, q1}, [r1]! @ 16 residuals for row 0
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vld1.8 {q3}, [r0] @ row 0 pixels
++ mov r3, #16-1 @ rows still to be loaded inside the loop
++ vmovl.u8 q2, d6 @ widen low 8 pixels
++ vmovl.u8 q3, d7 @ widen high 8 pixels (d7 is read before q3 is overwritten)
++ vqadd.s16 q2, q0 @ saturating add of residual
++ vqadd.s16 q3, q1
++1:
++ vld1.16 {q0, q1}, [r1]! @ residuals for the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vqmovun.s16 d4, q2 @ narrow the previous row to 0..255 into q2
++ vqmovun.s16 d5, q3
++ vld1.8 {q3}, [ip], r2 @ pixels of the next row
++ vst1.8 {q2}, [r0], r2 @ store the previous row
++ vmovl.u8 q2, d6
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q3, d7
++ vqadd.s16 q2, q0
++ vqadd.s16 q3, q1
++ bne 1b
++
++ vqmovun.s16 d4, q2 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d5, q3
++ vst1.8 {q2}, [r0]
++ bx lr
++endfunc
++
++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 @ 32x32 bytes at r0 += int16 residuals at r1, one row per pass, software pipelined
++ vldm r1!, {q0-q3} @ 32 residuals for row 0
++ vld1.8 {q8, q9}, [r0] @ row 0 pixels
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vmovl.u8 q10, d16 @ widen the 32 pixels into q10-q13
++ mov r3, #32-1 @ rows still to be loaded inside the loop
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vqadd.s16 q10, q0 @ saturating add of residual
++ vqadd.s16 q11, q1
++ vqadd.s16 q12, q2
++ vqadd.s16 q13, q3
++1:
++ vldm r1!, {q0-q3} @ residuals for the next row
++ vqmovun.s16 d20, q10 @ narrow the previous row to 0..255 into q10-q11
++ vqmovun.s16 d21, q11
++ vqmovun.s16 d22, q12
++ vqmovun.s16 d23, q13
++ vld1.8 {q8, q9}, [ip], r2 @ pixels of the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vst1.8 {q10, q11}, [r0], r2 @ store the previous row
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vqadd.s16 q10, q0
++ vqadd.s16 q11, q1
++ vqadd.s16 q12, q2
++ vqadd.s16 q13, q3
++ bne 1b
++
++ vqmovun.s16 d20, q10 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d21, q11
++ vqmovun.s16 d22, q12
++ vqmovun.s16 d23, q13
++ vst1.8 {q10, q11}, [r0]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 @ adds the single value dc to every byte of a 4x4 block, saturated to 8 bits
++ add ip, r0, r1 @ ip = row 1
++ vdup.16 q15, r2 @ q15 = dc in every 16-bit lane
++ lsl r1, #1 @ two-row step
++ vld1.32 d4[0], [r0], r1 @ row 0 pixels, r0 moves to row 2
++ rsb r3, r1, #0 @ r3 = minus the two-row step, used to step back
++ vld1.32 d4[1], [ip], r1 @ row 1 pixels, ip moves to row 3
++ vld1.32 d5[0], [r0], r3 @ row 2 pixels, r0 back at row 0
++ vld1.32 d5[1], [ip], r3 @ row 3 pixels, ip back at row 1
++ vaddw.u8 q0, q15, d4 @ q0 = dc + widened pixels (plain add, no separate vmovl needed)
++ vaddw.u8 q1, q15, d5
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation to 0..255
++ vqmovun.s16 d1, q1
++ vst1.32 d0[0], [r0], r1 @ store row 0
++ vst1.32 d0[1], [ip], r1 @ row 1
++ vst1.32 d1[0], [r0] @ row 2
++ vst1.32 d1[1], [ip] @ row 3
++ bx lr
++endfunc
++
++@ ============================================================================
++@ DC Y or C add
++
++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 @ entry stub: sets up the constants and shares the body of the function that follows
++ mov r3, #4-2 @ loop count for 4 rows handled as pairs
++ vdup.32 q15, r2 @ replicate the whole 32-bit dc word: even 16-bit lanes get its low half, odd lanes its high half
++ b 1f @ joins the next 1: label in the file, which is inside ff_hevc_rpi_add_residual_8x8_dc_neon_8
++endfunc
++
++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 @ adds dc to rows of 8 bytes, two rows per pass; r3 + 2 rows are processed in total
++ vdup.16 q15, r2 @ q15 = dc in every 16-bit lane
++ mov r3, #8-2 @ rows still to be loaded inside the loop
++1: vld1.8 d16, [r0] @ shared entry point (the 4x4_dc_c stub above branches here with its own q15 and r3): row 0 pixels
++ add ip, r0, r1 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr}
++ vld1.8 d17, [ip] @ row 1 pixels
++ add r4, r0, r1, lsl #1 @ r4 = row 2 (load pointer for even rows)
++ vaddw.u8 q0, q15, d16 @ q0 = dc + widened row 0
++ lsl r1, #1 @ two-row step
++ vaddw.u8 q1, q15, d17 @ q1 = dc + widened row 1
++ add lr, ip, r1 @ lr = row 3 (load pointer for odd rows)
++1:
++ vld1.8 {d16}, [r4], r1 @ pixels of the next row pair
++ vld1.8 {d17}, [lr], r1
++ subs r3, #2 @ sets the flags tested by bne below
++ vqmovun.s16 d4, q0 @ narrow the previous pair to 0..255
++ vqmovun.s16 d5, q1
++ vaddw.u8 q0, q15, d16 @ start the next pair
++ vaddw.u8 q1, q15, d17
++ vst1.8 {d4}, [r0], r1 @ store the previous pair
++ vst1.8 {d5}, [ip], r1
++ bne 1b
++
++ vqmovun.s16 d4, q0 @ drain the pipeline: narrow and store the final row pair
++ vqmovun.s16 d5, q1
++ vst1.8 {d4}, [r0]
++ vst1.8 {d5}, [ip]
++ pop {r4, pc}
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 @ entry stub: sets up the constants and shares the body of the function that follows
++ mov r3, #8-1 @ loop count for 8 rows handled one at a time
++ vdup.32 q15, r2 @ replicate the whole 32-bit dc word: even 16-bit lanes get its low half, odd lanes its high half
++ b 1f @ joins the next 1: label in the file, which is inside ff_hevc_rpi_add_residual_16x16_dc_neon_8
++endfunc
++
++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 @ adds dc to rows of 16 bytes, one row per pass; r3 + 1 rows are processed in total
++ vdup.16 q15, r2 @ q15 = dc in every 16-bit lane
++ mov r3, #16-1 @ rows still to be loaded inside the loop
++1: vld1.8 {q8}, [r0] @ shared entry point (the 8x8_dc_c stub above branches here with its own q15 and r3): row 0 pixels
++ add ip, r0, r1 @ ip = load pointer, one row ahead of the store pointer r0
++ vaddw.u8 q0, q15, d16 @ q0, q1 = dc + widened pixels
++ vaddw.u8 q1, q15, d17
++1:
++ vld1.8 {q8}, [ip], r1 @ pixels of the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vqmovun.s16 d4, q0 @ narrow the previous row to 0..255 into q2
++ vqmovun.s16 d5, q1
++ vaddw.u8 q0, q15, d16 @ start the next row
++ vaddw.u8 q1, q15, d17
++ vst1.8 {q2}, [r0], r1 @ store the previous row
++ bne 1b
++
++ vqmovun.s16 d4, q0 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d5, q1
++ vst1.8 {q2}, [r0]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 @ entry stub: sets up the constants and shares the body of the function that follows
++ mov r3, #16-1 @ loop count for 16 rows handled one at a time
++ vdup.32 q15, r2 @ replicate the whole 32-bit dc word: even 16-bit lanes get its low half, odd lanes its high half
++ b 1f @ joins the next 1: label in the file, which is inside ff_hevc_rpi_add_residual_32x32_dc_neon_8
++endfunc
++
++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
++@ uint8_t * dst, // [r0]
++@ unsigned int stride, // [r1]
++@ int dc) // [r2]
++
++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 @ adds dc to rows of 32 bytes, one row per pass; r3 + 1 rows are processed in total
++ vdup.16 q15, r2 @ q15 = dc in every 16-bit lane
++ mov r3, #32-1 @ rows still to be loaded inside the loop
++1: vld1.8 {q8, q9}, [r0] @ shared entry point (the 16x16_dc_c stub above branches here with its own q15 and r3): row 0 pixels
++ add ip, r0, r1 @ ip = load pointer, one row ahead of the store pointer r0
++ vaddw.u8 q0, q15, d16 @ q0-q3 = dc + widened pixels
++ vaddw.u8 q1, q15, d17
++ vaddw.u8 q2, q15, d18
++ vaddw.u8 q3, q15, d19
++1:
++ vqmovun.s16 d20, q0 @ narrow the previous row to 0..255 into q10-q11
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d23, q3
++ vld1.8 {q8, q9}, [ip], r1 @ pixels of the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vaddw.u8 q0, q15, d16 @ start the next row
++ vaddw.u8 q1, q15, d17
++ vaddw.u8 q2, q15, d18
++ vaddw.u8 q3, q15, d19
++ vst1.8 {q10, q11}, [r0], r1 @ store the previous row
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d23, q3
++ vst1.8 {q10, q11}, [r0]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 @ 4 rows of 8 interleaved bytes: res added to even (U) bytes, the r3 dc value to odd (V) bytes, saturated to 8 bits
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q0, q1}, [r1] @ 16 residuals: q0 = rows 0-1, q1 = rows 2-3
++ lsl r2, #1 @ two-row step
++ vld1.8 {d16}, [r0 :64], r2 @ row 0 pixels
++ vld1.8 {d17}, [ip :64], r2 @ row 1
++ vld1.8 {d18}, [r0 :64] @ row 2
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vld1.8 {d19}, [ip :64] @ row 3
++ sub ip, r2 @ rewind ip to row 1
++ vdup.16 q2, r3 @ q2, q3 = dc in every 16-bit lane
++ vdup.16 q3, r3
++ vmovl.u8 q10, d16 @ widen each pixel row to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave residual (even lanes) with dc (odd lanes): q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ likewise: q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ row 0
++ vqadd.s16 q2, q11 @ row 1
++ vqadd.s16 q1, q12 @ row 2
++ vqadd.s16 q3, q13 @ row 3
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation to 0..255
++ vqmovun.s16 d1, q2
++ vqmovun.s16 d2, q1
++ vqmovun.s16 d3, q3
++ vst1.8 {d0}, [r0 :64], r2 @ store row 0
++ vst1.8 {d1}, [ip :64], r2 @ row 1
++ vst1.8 {d2}, [r0 :64] @ row 2
++ vst1.8 {d3}, [ip :64] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 @ 8 rows of 16 interleaved bytes, two per pass: res added to even (U) bytes, the r3 dc value to odd (V) bytes; software pipelined
++ vdup.16 q15, r3 @ q15 = dc in every 16-bit lane
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr}
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: even bytes to d16, odd bytes to d17
++ lsl r2, #1 @ two-row step
++ vld2.8 {d18, d19}, [ip :128] @ row 1: even to d18, odd to d19
++ mov r3, #8-2 @ r3 is free now: rows still to be loaded inside the loop
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals: q0 = row 0, q1 = row 1
++ add r4, r0, r2 @ r4 = row 2 (load pointer for even rows)
++ vmovl.u8 q10, d16 @ widen even bytes
++ add lr, ip, r2 @ lr = row 3 (load pointer for odd rows)
++ vmovl.u8 q11, d18
++ vqadd.s16 q0, q10 @ even += residual (saturating)
++ vaddw.u8 q2, q15, d17 @ odd + dc
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d19
++1:
++ vqmovun.s16 d20, q0 @ narrow previous pair: even bytes of first row
++ vqmovun.s16 d21, q2 @ odd bytes of first row
++ vld2.8 {d16, d17}, [r4 :128], r2 @ pixels of the next row pair
++ subs r3, #2 @ sets the flags tested by bne below
++ vqmovun.s16 d22, q1 @ even bytes of second row
++ vqmovun.s16 d23, q3 @ odd bytes of second row
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store the previous pair
++ vld2.8 {d18, d19}, [lr :128], r2
++ vst2.8 {d22, d23}, [ip :128], r2
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals for the next pair
++ vmovl.u8 q10, d16
++ vmovl.u8 q11, d18
++ vqadd.s16 q0, q10
++ vaddw.u8 q2, q15, d17
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d19
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ drain the pipeline: narrow and store the final row pair
++ vqmovun.s16 d21, q2
++ vqmovun.s16 d22, q1
++ vqmovun.s16 d23, q3
++ vst2.8 {d20, d21}, [r0 :128]
++ vst2.8 {d22, d23}, [ip :128]
++ pop {r4, pc}
++endfunc
++
++@ add_residual16x16_u(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_v) [r3]
++
++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 @ 16 rows of 32 interleaved bytes, one per pass: res added to even (U) bytes, the r3 dc value to odd (V) bytes; software pipelined
++ vdup.16 q15, r3 @ q15 = dc in every 16-bit lane
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: even bytes to q8, odd bytes to q9
++ mov r3, #16-1 @ r3 is free now: rows still to be loaded inside the loop
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 residuals for row 0
++ vmovl.u8 q11, d16 @ widen even bytes
++ vmovl.u8 q12, d17
++ vqadd.s16 q0, q11 @ even += residual (saturating)
++ vaddw.u8 q11, q15, d18 @ odd + dc
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d19
++1:
++ vld2.8 {q8, q9}, [ip :256], r2 @ pixels of the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vqmovun.s16 d20, q0 @ narrow previous row: q10 collects the even bytes
++ vqmovun.s16 d22, q11 @ q11 collects the odd bytes (d22 is half of q11, the source is read before the write)
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q12
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals for the next row
++ vst2.8 {q10, q11}, [r0 :256], r2 @ re-interleave and store the previous row
++ vmovl.u8 q11, d16
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q12, d17
++ vqadd.s16 q0, q11
++ vaddw.u8 q11, q15, d18
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d19
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d22, q11
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q12
++ vst2.8 {q10, q11}, [r0 :256]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_u) [r3]
++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 @ 4 rows of 8 interleaved bytes: the r3 dc value added to even (U) bytes, res to odd (V) bytes, saturated to 8 bits
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q2, q3}, [r1] @ 16 residuals: q2 = rows 0-1, q3 = rows 2-3
++ lsl r2, #1 @ two-row step
++ vld1.8 {d16}, [r0 :64], r2 @ row 0 pixels
++ vld1.8 {d17}, [ip :64], r2 @ row 1
++ vld1.8 {d18}, [r0 :64] @ row 2
++ sub r0, r2 @ rewind r0 to row 0 for the stores
++ vld1.8 {d19}, [ip :64] @ row 3
++ sub ip, r2 @ rewind ip to row 1
++ vdup.16 q0, r3 @ q0, q1 = r3 value in every 16-bit lane
++ vdup.16 q1, r3
++ vmovl.u8 q10, d16 @ widen each pixel row to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave dc (even lanes) with residual (odd lanes): q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ likewise: q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ row 0
++ vqadd.s16 q2, q11 @ row 1
++ vqadd.s16 q1, q12 @ row 2
++ vqadd.s16 q3, q13 @ row 3
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation to 0..255
++ vqmovun.s16 d1, q2
++ vqmovun.s16 d2, q1
++ vqmovun.s16 d3, q3
++ vst1.8 {d0}, [r0 :64], r2 @ store row 0
++ vst1.8 {d1}, [ip :64], r2 @ row 1
++ vst1.8 {d2}, [r0 :64] @ row 2
++ vst1.8 {d3}, [ip :64] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_u) [r3]
++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 @ 8 rows of 16 interleaved bytes, two per pass: the r3 dc value added to even (U) bytes, res to odd (V) bytes; software pipelined
++ vdup.16 q15, r3 @ q15 = r3 value in every 16-bit lane
++ add ip, r0, r2 @ ip = row 1 (store pointer for odd rows)
++ push {r4, lr}
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: even bytes to d16, odd bytes to d17
++ lsl r2, #1 @ two-row step
++ vld2.8 {d18, d19}, [ip :128] @ row 1: even to d18, odd to d19
++ mov r3, #8-2 @ r3 is free now: rows still to be loaded inside the loop
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals: q0 = row 0, q1 = row 1
++ add r4, r0, r2 @ r4 = row 2 (load pointer for even rows)
++ vmovl.u8 q10, d17 @ widen odd bytes
++ add lr, ip, r2 @ lr = row 3 (load pointer for odd rows)
++ vmovl.u8 q11, d19
++ vqadd.s16 q0, q10 @ odd += residual (saturating)
++ vaddw.u8 q2, q15, d16 @ even + dc
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d18
++1:
++ vqmovun.s16 d20, q2 @ narrow previous pair: even bytes of first row
++ vqmovun.s16 d21, q0 @ odd bytes of first row
++ vld2.8 {d16, d17}, [r4 :128], r2 @ pixels of the next row pair
++ subs r3, #2 @ sets the flags tested by bne below
++ vqmovun.s16 d22, q3 @ even bytes of second row
++ vqmovun.s16 d23, q1 @ odd bytes of second row
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store the previous pair
++ vld2.8 {d18, d19}, [lr :128], r2
++ vst2.8 {d22, d23}, [ip :128], r2
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals for the next pair
++ vmovl.u8 q10, d17
++ vmovl.u8 q11, d19
++ vqadd.s16 q0, q10
++ vaddw.u8 q2, q15, d16
++ vqadd.s16 q1, q11
++ vaddw.u8 q3, q15, d18
++ bne 1b
++
++ vqmovun.s16 d20, q2 @ drain the pipeline: narrow and store the final row pair
++ vqmovun.s16 d21, q0
++ vqmovun.s16 d22, q3
++ vqmovun.s16 d23, q1
++ vst2.8 {d20, d21}, [r0 :128]
++ vst2.8 {d22, d23}, [ip :128]
++ pop {r4, pc}
++endfunc
++
++@ add_residual16x16_v(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride, [r2]
++@ int dc_u) [r3]
++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 @ 16 rows of 32 interleaved bytes, one per pass: the r3 dc value added to even (U) bytes, res to odd (V) bytes; software pipelined
++ vdup.16 q15, r3 @ q15 = r3 value in every 16-bit lane
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: even bytes to q8, odd bytes to q9
++ mov r3, #16-1 @ r3 is free now: rows still to be loaded inside the loop
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 residuals for row 0
++ vmovl.u8 q11, d18 @ widen odd bytes
++ vmovl.u8 q12, d19
++ vqadd.s16 q0, q11 @ odd += residual (saturating)
++ vaddw.u8 q11, q15, d16 @ even + dc
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d17
++1:
++ vld2.8 {q8, q9}, [ip :256], r2 @ pixels of the next row
++ subs r3, #1 @ sets the flags tested by bne below
++ vqmovun.s16 d20, q11 @ narrow previous row: q10 collects the even bytes
++ vqmovun.s16 d22, q0 @ q11 collects the odd bytes (old q11 was consumed by the line above)
++ vqmovun.s16 d21, q12
++ vqmovun.s16 d23, q1
++ vld1.16 {q0, q1}, [r1 :256]! @ residuals for the next row
++ vst2.8 {q10, q11}, [r0 :256], r2 @ re-interleave and store the previous row
++ vmovl.u8 q11, d18
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q12, d19
++ vqadd.s16 q0, q11
++ vaddw.u8 q11, q15, d16
++ vqadd.s16 q1, q12
++ vaddw.u8 q12, q15, d17
++ bne 1b
++
++ vqmovun.s16 d20, q11 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d22, q0
++ vqmovun.s16 d21, q12
++ vqmovun.s16 d23, q1
++ vst2.8 {q10, q11}, [r0 :256]
++ bx lr
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 @ 4 rows of 8 interleaved bytes: U residuals added to even bytes, V residuals to odd bytes, saturated to 8 bits
++ add ip, r0, r2 @ ip = row 1
++ vld1.16 {q0, q1}, [r1]! @ all of U
++ lsl r2, #1 @ two-row step
++ vld1.8 {d16}, [r0 :64], r2 @ row 0 pixels, r0 moves to row 2
++ rsb r3, r2, #0 @ r3 = minus the two-row step, used to step back
++ vld1.8 {d17}, [ip :64], r2 @ row 1 pixels, ip moves to row 3
++ vld1.16 {q2, q3}, [r1] @ all of V
++ vld1.8 {d18}, [r0 :64], r3 @ row 2 pixels, r0 back at row 0
++ vld1.8 {d19}, [ip :64], r3 @ row 3 pixels, ip back at row 1
++ vmovl.u8 q10, d16 @ widen each pixel row to 16 bits
++ vmovl.u8 q11, d17
++ vmovl.u8 q12, d18
++ vmovl.u8 q13, d19
++ vzip.16 q0, q2 @ interleave U (even lanes) with V (odd lanes): q0 = row 0, q2 = row 1
++ vzip.16 q1, q3 @ likewise: q1 = row 2, q3 = row 3
++ vqadd.s16 q0, q10 @ row 0
++ vqadd.s16 q2, q11 @ row 1
++ vqadd.s16 q1, q12 @ row 2
++ vqadd.s16 q3, q13 @ row 3
++ vqmovun.s16 d0, q0 @ narrow with unsigned saturation to 0..255
++ vqmovun.s16 d1, q2
++ vqmovun.s16 d2, q1
++ vqmovun.s16 d3, q3
++ vst1.8 {d0}, [r0 :64], r2 @ store row 0
++ vst1.8 {d1}, [ip :64], r2 @ row 1
++ vst1.8 {d2}, [r0 :64] @ row 2
++ vst1.8 {d3}, [ip :64] @ row 3
++ bx lr
++endfunc
++
++@ add_residual8x8_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 @ 8 rows of 16 interleaved bytes, one per pass: U residuals added to even bytes, V residuals to odd bytes; software pipelined
++ vld2.8 {d16, d17}, [r0 :128] @ row 0 de-interleaved: even bytes to d16, odd bytes to d17
++ add r3, r1, #(8*8*2) @ Offset to V
++ vld1.16 {q0}, [r1 :128]! @ U residuals for row 0
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vld1.16 {q1}, [r3 :128]! @ V residuals for row 0
++ vmovl.u8 q10, d16 @ widen even bytes
++ push {lr} @ lr is used below as the row counter
++ vmovl.u8 q8, d17 @ widen odd bytes (d17 is read before q8 is overwritten)
++ mov lr, #8-1 @ rows still to be loaded inside the loop
++ vqadd.s16 q10, q0 @ q10 = even + U residual (saturating)
++ vqadd.s16 q1, q8 @ q1 = odd + V residual
++1:
++ vld2.8 {d16, d17}, [ip :128], r2 @ pixels of the next row
++ subs lr, #1 @ sets the flags tested by bne below
++ vld1.16 {q0}, [r1 :128]! @ U residuals for the next row
++ vqmovun.s16 d20, q10 @ narrow previous row: even bytes
++ vqmovun.s16 d21, q1 @ odd bytes
++ vld1.16 {q1}, [r3 :128]! @ V residuals for the next row
++ vst2.8 {d20, d21}, [r0 :128], r2 @ re-interleave and store the previous row
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q8, d17
++ vqadd.s16 q10, q0
++ vqadd.s16 q1, q8
++ bne 1b
++
++ vqmovun.s16 d20, q10 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d21, q1
++ vst2.8 {d20, d21}, [r0 :128]
++ pop {pc} @ return via the saved lr
++endfunc
++
++@ add_residual16x16_c(
++@ uint8_t *_dst, [r0]
++@ const int16_t *res, [r1]
++@ ptrdiff_t stride) [r2]
++
++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 @ 16 rows of 32 interleaved bytes, one per pass: U residuals added to even bytes, V residuals to odd bytes; software pipelined
++ vld2.8 {q8, q9}, [r0 :256] @ row 0 de-interleaved: even bytes to q8, odd bytes to q9
++ add r3, r1, #(16*16*2) @ Offset to V
++ vld1.16 {q0, q1}, [r1 :256]! @ 16 U residuals for row 0
++ add ip, r0, r2 @ ip = load pointer, one row ahead of the store pointer r0
++ vld1.16 {q2, q3}, [r3 :256]! @ 16 V residuals for row 0
++ vmovl.u8 q10, d16 @ widen even bytes, low half
++ push {lr} @ lr is used below as the row counter
++ vmovl.u8 q8, d17 @ widen even bytes, high half (d17 is read before q8 is overwritten)
++ mov lr, #16-1 @ rows still to be loaded inside the loop
++ vmovl.u8 q11, d18 @ widen odd bytes, low half
++ vmovl.u8 q9, d19 @ widen odd bytes, high half
++ vqadd.s16 q0, q10 @ even += U residual (saturating)
++ vqadd.s16 q1, q8
++ vqadd.s16 q2, q11 @ odd += V residual
++ vqadd.s16 q3, q9
++1:
++ vld2.8 {q8, q9}, [ip :256], r2 @ pixels of the next row
++ subs lr, #1 @ sets the flags tested by bne below
++ vqmovun.s16 d20, q0 @ narrow previous row: d20-d21 collect the even bytes
++ vqmovun.s16 d22, q2 @ d22-d23 collect the odd bytes
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q3
++ vld1.16 {q0, q1}, [r1 :256]! @ U residuals for the next row
++ vst2.8 {d20-d23}, [r0 :256], r2 @ re-interleave and store the previous row
++ vld1.16 {q2, q3}, [r3 :256]! @ V residuals for the next row
++ vmovl.u8 q10, d16
++ pldw [ip] @ prefetch for writing the row ip now points at
++ vmovl.u8 q8, d17
++ vmovl.u8 q11, d18
++ vmovl.u8 q9, d19
++ vqadd.s16 q0, q10
++ vqadd.s16 q1, q8
++ vqadd.s16 q2, q11
++ vqadd.s16 q3, q9
++ bne 1b
++
++ vqmovun.s16 d20, q0 @ drain the pipeline: narrow and store the final row
++ vqmovun.s16 d22, q2
++ vqmovun.s16 d21, q1
++ vqmovun.s16 d23, q3
++ vst2.8 {d20-d23}, [r0 :256]
++ pop {pc} @ return via the saved lr
++endfunc
++
++@ 32x32 chroma never occurs so NIF
++
++@ ============================================================================
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
+@@ -0,0 +1,2245 @@
++/*
++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.set EDGE_SRC_STRIDE, 160
++
++@ PIC jump tables are fractionally more expensive than absolute in our code
++.set jent_pic, CONFIG_PIC
++
++
++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 @ applies table-driven offsets to the 64 bytes in q8-q11 in place; I1-I4 are optional caller instructions interleaved into the sequence; clobbers q12, q13
++ vshr.u8 q12, q8, #3 @ table index for q8 = sample / 8
++ \I1
++ vadd.i8 q8, \Q_K128 @ bias the samples by the Q_K128 constant (callers in this file pass 128 in every byte)
++ \I2
++ vshr.u8 q13, q9, #3 @ table index for q9
++ \I3
++ vadd.i8 q9, \Q_K128
++ \I4
++ vtbl.8 d24, \XLAT0, d24 @ look up the offset for each index: q8 uses XLAT0
++ vtbl.8 d25, \XLAT0, d25
++ vtbl.8 d26, \XLAT1, d26 @ q9 uses XLAT1
++ vtbl.8 d27, \XLAT1, d27
++
++ vqadd.s8 q8, q12 @ signed saturating add of offset to the biased samples
++ vshr.u8 q12, q10, #3 @ same steps for q10 (XLAT0) and q11 (XLAT1)
++ vadd.i8 q10, \Q_K128
++ vqadd.s8 q9, q13
++ vshr.u8 q13, q11, #3
++ vadd.i8 q11, \Q_K128
++
++ vtbl.8 d24, \XLAT0, d24
++ vtbl.8 d25, \XLAT0, d25
++ vtbl.8 d26, \XLAT1, d26
++ vtbl.8 d27, \XLAT1, d27
++ vqadd.s8 q10, q12
++ vsub.i8 q8, \Q_K128 @ remove the bias again; with a 128 bias the saturation above becomes a clamp to 0..255
++ vqadd.s8 q11, q13
++ vsub.i8 q9, \Q_K128
++ vsub.i8 q10, \Q_K128
++ vsub.i8 q11, \Q_K128
++.endm
++
++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 @ complete pipelined loop over 16 bytes per pass: L1-L5 load into q8 and must set the flags, S1-S4 store the result from q13; clobbers q8, q12, q13
++ \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vadd.i8 q12, q8, \Q_K128 @ q12 = biased samples
++ vshr.u8 q8, #3 @ q8 = table indices (sample / 8)
++ vtbl.8 d16, \XLAT0, d16 @ offsets: low half from XLAT0
++ vtbl.8 d17, \XLAT1, d17 @ high half from XLAT1
++ vqadd.s8 q12, q8 @ signed saturating add of offset
++ bmi 2f @ flags come from the L arguments: negative means this was the last pass
++1: \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vsub.i8 q13, q12, \Q_K128 @ unbias the previous result into q13
++ vadd.i8 q12, q8, \Q_K128 @ start on the samples just loaded
++ vshr.u8 q8, #3
++ \S1
++ \S2
++ \S3
++ \S4
++ vtbl.8 d16, \XLAT0, d16
++ vtbl.8 d17, \XLAT1, d17
++ vqadd.s8 q12, q8
++ bpl 1b @ loop while the counter updated by the L arguments is not negative
++2: vsub.i8 q13, q12, \Q_K128 @ drain the pipeline: unbias and store the final result
++ \S1
++ \S2
++ \S3
++ \S4
++.endm
++
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX @ clamps the signed 16-bit lanes of four registers to the range Q_MIN..Q_MAX, in place
++ vmax.s16 \Q0, \Q_MIN @ raise anything below the lower bound
++ vmax.s16 \Q1, \Q_MIN
++ vmax.s16 \Q2, \Q_MIN
++ vmax.s16 \Q3, \Q_MIN
++ vmin.s16 \Q0, \Q_MAX @ lower anything above the upper bound
++ vmin.s16 \Q1, \Q_MAX
++ vmin.s16 \Q2, \Q_MAX
++ vmin.s16 \Q3, \Q_MAX
++.endm
++
++@ Clobbers q12, q13
++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 @ applies table-driven offsets to four registers of 16-bit samples in place, then clamps; I1, I2 are optional caller instructions interleaved into the sequence
++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) @ table index = top 5 bits of each sample, narrowed to bytes
++ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
++ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
++ \I1
++ vtbl.8 d24, \XLAT0, d24 @ look up byte offsets: Q0 and Q2 use XLAT0
++ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
++ vtbl.8 d25, \XLAT1, d25 @ Q1 and Q3 use XLAT1
++ \I2
++ vtbl.8 d26, \XLAT0, d26
++ vtbl.8 d27, \XLAT1, d27
++ vaddw.s8 \Q0, d24 @ add the sign-extended byte offsets to the samples
++ vaddw.s8 \Q1, d25
++ vaddw.s8 \Q2, d26
++ vaddw.s8 \Q3, d27
++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
++.endm
++
++@ Clobbers q10, q11, q12
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 @ complete pipelined loop over two registers of 16-bit samples per pass: L1-L5 load into Q0, Q1 and must set the flags, S1-S4 store the result from q10, q11
++ \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vshrn.i16 d24, \Q0, #\bit_depth - 5 @ table index = top 5 bits of each sample, narrowed to bytes
++ vshrn.i16 d25, \Q1, #\bit_depth - 5
++ vtbl.8 d24, \XLAT0, d24 @ look up byte offsets
++ vtbl.8 d25, \XLAT1, d25
++ vaddw.s8 q10, \Q0, d24 @ q10, q11 = samples + sign-extended offsets, not yet clamped
++ vaddw.s8 q11, \Q1, d25
++ bmi 2f @ flags come from the L arguments: negative means this was the last pass
++1: \L1
++ \L2
++ \L3
++ \L4
++ \L5
++ vmax.s16 q10, \Q_MIN @ clamp the previous result while the new samples are processed
++ vmax.s16 q11, \Q_MIN
++ vshrn.i16 d24, \Q0, #\bit_depth - 5
++ vshrn.i16 d25, \Q1, #\bit_depth - 5
++ vmin.s16 q10, \Q_MAX
++ vmin.s16 q11, \Q_MAX
++ \S1
++ \S2
++ \S3
++ \S4
++ vtbl.8 d24, \XLAT0, d24
++ vtbl.8 d25, \XLAT1, d25
++ vaddw.s8 q10, \Q0, d24
++ vaddw.s8 q11, \Q1, d25
++ bpl 1b @ loop while the counter updated by the L arguments is not negative
++2: vmax.s16 q10, \Q_MIN @ drain the pipeline: clamp and store the final result
++ vmax.s16 q11, \Q_MIN
++ vmin.s16 q10, \Q_MAX
++ vmin.s16 q11, \Q_MAX
++ \S1
++ \S2
++ \S3
++ \S4
++.endm
++
++
++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
++@ so we are quite safe stuffing it into a byte array
++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
++@ precision
++
++@ This, somewhat nasty, bit of code builds the {d0-d3} translation
++@ array via the stack
++@ Given that sao_left_class > 28 can cause wrap we can't just poke
++@ all 4 bytes in at once
++@
++@ It also loads other common regs
++
++@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately
++function band_load_y @ helper for the sao_band functions below: builds the 32-byte offset table in d0-d3 and returns ip = height - 1, r4 = r1 + r3; stack offsets assume the caller pushed 16 bytes before the bl
++ ldr ip, [sp, #16] @ &sao_offset_val[0]
++ ldr r4, [sp, #20] @ sao_left_class
++ vmov.i64 d4, #0 @ d0-d4 = 40 zero bytes (table plus 8 spill bytes)
++ vmov.i64 q0, #0
++ pld [r1] @ prefetch the first source row
++ vld2.8 {q8}, [ip] @ de-interleave 16 bytes: d16 = even bytes, i.e. the low byte of each int16 on a little-endian target
++ sub ip, sp, #8*5 @ ip = value sp will have after the vpush below
++ vmov.i64 q1, #0
++ add r4, ip, r4 @ r4 = address of entry sao_left_class inside the stacked array
++ vpush {d0-d4} @ Put zero array on stack
++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
++ ldr ip, [ip, #8*5 + 28] @ height
++ vst1.32 {d16[0]}, [r4] @ write 4 offset bytes into the zero array
++ add r4, r1, r3 @ r4 = src + stride_src
++ vpop {d0-d4} @ Pop modified array
++ sub ip, ip, #1 @ ip = height - 1
++ vorr d0, d0, d4 @ fold bytes written past entry 31 back onto the start of the table (wrap)
++ bx lr
++endfunc
++
++@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately
++function band_load_c @ helper for the sao_band_c functions below: builds two 32-byte offset tables in d0-d3 and d4-d7 and returns ip = height - 1, r4 = r1 + r3; stack offsets assume the caller pushed 16 bytes before the bl
++ ldr ip, [sp, #16] @ &sao_offset_val1[0]
++ ldr r4, [sp, #20] @ sao_left_class1
++ vmov.i64 d24, #0 @ d20-d24 = 40 zero bytes (table plus 8 spill bytes)
++ vmov.i64 q10, #0
++ pld [r1] @ prefetch the first source row
++ vld2.8 {q8}, [ip] @ de-interleave 16 bytes: d16 = even bytes, i.e. the low byte of each int16 on a little-endian target
++ sub ip, sp, #8*5 @ ip = value sp will have after the vpush below
++ vmov.i64 q11, #0
++ add r4, ip, r4 @ r4 = address of entry sao_left_class1 inside the stacked array
++ ldr ip, [sp, #24] @ &sao_offset_val2[0]
++ vpush {d20-d24} @ Put zero array on stack
++ vld2.8 {q9}, [ip] @ d18 = even bytes of the second offset array
++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
++ ldr ip, [sp, #8*5 + 28] @ sao_left_class2
++ vst1.32 {d16[0]}, [r4] @ write 4 offset bytes of table 1 into the zero array
++ add ip, sp, ip @ ip = address of entry sao_left_class2 inside the stacked array
++ vshr.u64 d18, d18, #8 @ 1st interesting val is [1]
++ vldmia sp, {d0-d3} @ Load modified array
++ vldr d16, [sp, #8*4] @ d16 = the 8 spill bytes past entry 31
++ add r4, r1, r3 @ r4 = src + stride_src
++ vstmia sp, {d20-d24} @ Put zero array on stack (again)
++ vst1.32 {d18[0]}, [ip] @ write 4 offset bytes of table 2
++ vorr d0, d0, d16 @ fold the table 1 spill bytes back onto its start (wrap)
++ vldmia sp, {d4-d7} @ Load modified array
++ vldr d18, [sp, #8*4] @ d18 = spill bytes of table 2
++ ldr ip, [sp, #8*5 + 36] @ height
++ add sp, sp, #8*5 @ release the 40-byte scratch area
++ vorr d4, d4, d18 @ fold the table 2 spill bytes back onto its start (wrap)
++ sub ip, ip, #1 @ ip = height - 1
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_64_neon_8, export=1 @ SAO band offset on rows of 64 bytes, one row per pass
++ push {r4-r6, lr} @ 16 bytes pushed, which band_load_y relies on for its stack offsets
++ vmov.u8 q15, #128 @ bias constant passed to the macro
++ bl band_load_y @ returns table in d0-d3, ip = height - 1, r4 = src + stride_src
++
++1: vldmia r1, {q8-q11} @ load one 64-byte source row; the interleaved macro arguments prefetch via r4, count down ip and advance r1
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
++ "pld [r4]", \
++ "subs ip, #1", \
++ "it ne; addne r4, r3", \
++ "add r1, r3"
++ vstmia r0, {q8-q11} @ store the processed row
++ add r0, r2
++ bpl 1b @ flags from the subs passed into the macro: loop until ip goes negative
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_32_neon_8, export=1 @ SAO band offset on rows of 32 bytes, two rows per pass
++ push {r4-r6, lr} @ 16 bytes pushed, which band_load_y relies on for its stack offsets
++ add r5, r0, r2 @ r5 = second destination row of each pair
++ add r6, r1, r3 @ r6 = second source row of each pair
++ lsl r2, #1 @ both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ vmov.u8 q15, #128 @ bias constant passed to the macro
++ bl band_load_y @ returns table in d0-d3, ip = height - 1 (r4 is also set but not used here)
++
++1: vld1.8 { q8, q9 }, [r1, :128], r3 @ first row of the pair
++ subs ip, #2 @ sets the flags tested by bpl below
++ vld1.8 {q10, q11}, [r6, :128], r3 @ second row of the pair
++
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++ vst1.8 { q8, q9 }, [r0, :128], r2
++ vst1.8 {q10, q11}, [r5, :128], r2
++ bpl 1b @ loop until ip goes negative
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_16_neon_8, export=1 @ SAO band offset on rows of 16 bytes, four rows per pass
++ push {r4-r6, lr} @ 16 bytes pushed, which band_load_y relies on for its stack offsets
++ add r5, r0, r2 @ r5 = destination pointer for odd rows
++ add r6, r1, r3 @ r6 = source pointer for odd rows
++ lsl r2, #1 @ both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ vmov.u8 q15, #128 @ bias constant passed to the macro
++ bl band_load_y @ returns table in d0-d3, ip = height - 1 (r4 is also set but not used here)
++
++1: vld1.8 { q8}, [r1, :128], r3 @ rows n, n+1, n+2, n+3 into q8, q9, q10, q11
++ subs ip, #4 @ sets the flags tested by bpl below
++ vld1.8 { q9}, [r6, :128], r3
++ vld1.8 {q10}, [r1, :128], r3
++ vld1.8 {q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++ vst1.8 { q8}, [r0, :128], r2
++ vst1.8 { q9}, [r5, :128], r2
++ vst1.8 {q10}, [r0, :128], r2
++ vst1.8 {q11}, [r5, :128], r2
++ bpl 1b @ loop until ip goes negative
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_8 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_band_8_neon_8, export=1 @ SAO band offset for narrow blocks: rows of 8 bytes when width is at least 8, otherwise rows of 4 bytes
++ ldr ip, [sp, #8] @ width
++ push {r4-r6, lr} @ 16 bytes pushed, which band_load_y relies on for its stack offsets
++ vmov.u8 q15, #128 @ bias constant passed to the macro
++ cmp ip, #8 @ result is consumed by blt below; no instruction in between sets the flags
++ bl band_load_y @ returns table in d0-d3, ip = height - 1 (overwrites the width in ip)
++ add r5, r0, r2 @ r5 = destination pointer for odd rows
++ add r6, r1, r3 @ r6 = source pointer for odd rows
++ lsl r2, #1 @ both strides doubled: every pointer steps two rows
++ lsl r3, #1
++ blt 4f @ width below 8: take the 4-byte path
++
++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
++ "vld1.8 {d16}, [r1, :64], r3", \
++ "subs ip, #2", \
++ "vld1.8 {d17}, [r6, :64], r3", \
++ "", \
++ "", \
++ "vst1.8 {d26}, [r0, :64], r2", \
++ "vst1.8 {d27}, [r5, :64], r2"
++ pop {r4-r6, pc}
++4:
++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
++ "vld1.32 {d16[0]}, [r1, :32], r3", \
++ "subs ip, #4", \
++ "vld1.32 {d16[1]}, [r6, :32], r3", \
++ "vld1.32 {d17[0]}, [r1, :32], r3", \
++ "vld1.32 {d17[1]}, [r6, :32], r3", \
++ "vst1.32 {d26[0]}, [r0, :32], r2", \
++ "vst1.32 {d26[1]}, [r5, :32], r2", \
++ "vst1.32 {d27[0]}, [r0, :32], r2", \
++ "vst1.32 {d27[1]}, [r5, :32], r2"
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_32_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ Band filter over interleaved two-component rows of 64 bytes, one row per
++@ loop pass. vld2.8/vst2.8 split and re-join the two interleaved components
++@ so each can use its own table (d0-d3 and d4-d7).
++@ NOTE(review): band_load_c and sao_band_64b_8 are defined outside this
++@ view; band_load_c is assumed to set up d0-d7, the row counter in ip and
++@ the prefetch pointer r4 - confirm.
++
++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
++ push {r4-r6, lr}
++ add r5, r0, #32 @ second 32 bytes of the dst row
++ add r6, r1, #32 @ second 32 bytes of the src row
++ vmov.u8 q15, #128
++ bl band_load_c
++
++1: vld2.8 { q8, q9 }, [r1, :128], r3
++ subs ip, #1 @ flags are consumed by the bpl below
++ vld2.8 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
++ "pld [r4]", \
++ "it ne; addne r4, r3"
++
++ vst2.8 { q8, q9 }, [r0, :128], r2
++ vst2.8 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_16_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ Band filter over interleaved two-component rows of 32 bytes, two rows per
++@ loop pass (even rows via r0/r1, odd rows via r5/r6).
++@ NOTE(review): band_load_c and sao_band_64b_8 are defined outside this
++@ view; band_load_c is assumed to set up d0-d7 and the row counter in ip -
++@ confirm.
++
++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + 1 row
++ add r6, r1, r3 @ r6 = src + 1 row
++ lsl r2, #1 @ both pointer pairs now step 2 rows
++ lsl r3, #1
++ vmov.u8 q15, #128
++ bl band_load_c
++
++1: vld2.8 { q8, q9 }, [r1, :128], r3
++ subs ip, #2 @ flags are consumed by the bpl below
++ vld2.8 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15
++
++ vst2.8 { q8, q9 }, [r0, :128], r2
++ vst2.8 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_8_neon_8(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ Band filter for narrow interleaved two-component blocks. Width >= 8 handles
++@ one 16-byte row per macro pass; width < 8 (label 4) handles two 8-byte
++@ rows per pass and uses vuzp/vzip to split and re-join the components.
++@ The result of "cmp ip, #8" has to survive the call to band_load_c.
++@ NOTE(review): sao_band_16b_8 and band_load_c are defined outside this
++@ view - confirm their register contract.
++
++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
++ ldr ip, [sp, #16] @ width
++ push {r4-r6, lr}
++ vmov.u8 q15, #128
++ cmp ip, #8
++ bl band_load_c
++ blt 4f @ width < 8
++
++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
++ "vld2.8 {d16-d17}, [r1, :128], r3", \
++ "subs ip, #1", \
++ "", \
++ "", \
++ "", \
++ "vst2.8 {d26-d27}, [r0, :128], r2"
++ pop {r4-r6, pc}
++4:
++ add r5, r0, r2 @ r5/r6 address the odd rows
++ add r6, r1, r3
++ lsl r2, #1
++ lsl r3, #1
++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
++ "vld1.8 {d16}, [r1, :64], r3", \
++ "subs ip, #2", \
++ "vld1.8 {d17}, [r6, :64], r3", \
++ "vuzp.8 d16, d17", \
++ "", \
++ "vzip.8 d26, d27", \
++ "vst1.8 {d26}, [r0, :64], r2", \
++ "vst1.8 {d27}, [r5, :64], r2"
++ pop {r4-r6, pc}
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++@
++@ band_64_16: band filter over rows of 128 bytes (q4-q11), one row per loop
++@ pass, for the given bit depth. q2 holds 0 and q3 holds
++@ (1 << bit_depth) - 1 and both are handed to sao_band_64b_16.
++@ q4-q7 are used for data, so they are saved with vpush and restored
++@ before returning.
++@ NOTE(review): band_load_y and sao_band_64b_16 are defined outside this
++@ view - confirm their register contract.
++
++.macro band_64_16 bit_depth
++ push {r4-r6, lr}
++ vmov.i64 q2, #0
++ vmov.i16 q3, #(1 << \bit_depth) - 1
++ bl band_load_y
++ vpush {q4-q7}
++
++1: vldm r1, {q4-q11}
++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++ "subs ip, #1", \
++ "add r1, r3"
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
++ vstm r0, {q4-q11}
++ add r0, r2
++ bpl 1b
++
++ vpop {q4-q7}
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_64_neon_10, export=1
++ band_64_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++@
++@ band_32_16: band filter over rows of 64 bytes (q8-q11), one row per loop
++@ pass, for the given bit depth. q2 holds 0 and q3 holds
++@ (1 << bit_depth) - 1 and both are handed to sao_band_64b_16.
++@ NOTE(review): band_load_y and sao_band_64b_16 are defined outside this
++@ view - confirm their register contract.
++
++.macro band_32_16 bit_depth
++ push {r4-r6, lr}
++ vmov.i64 q2, #0
++ vmov.i16 q3, #(1 << \bit_depth) - 1
++ bl band_load_y
++
++1: vldm r1, {q8-q11}
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++ "subs ip, #1", \
++ "add r1, r3"
++ vstm r0, {q8-q11}
++ add r0, r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_32_neon_10, export=1
++ band_32_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++@
++@ band_16_16: band filter over rows of 32 bytes, two rows per loop pass
++@ (even rows via r0/r1, odd rows via r5/r6), for the given bit depth.
++@ q14 holds 0 and q15 holds (1 << bit_depth) - 1 and both are handed to
++@ sao_band_64b_16.
++@ NOTE(review): band_load_y and sao_band_64b_16 are defined outside this
++@ view - confirm their register contract.
++
++.macro band_16_16 bit_depth
++ push {r4-r6, lr}
++ add r5, r0, r2 @ r5 = dst + 1 row
++ add r6, r1, r3 @ r6 = src + 1 row
++ lsl r2, #1 @ both pointer pairs now step 2 rows
++ lsl r3, #1
++ vmov.i64 q14, #0
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_y
++
++1: vld1.16 { q8, q9 }, [r1, :128], r3
++ subs r12, #2 @ r12 is ip; flags are consumed by the bpl below
++ vld1.16 {q10, q11}, [r6, :128], r3
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
++ vst1.16 { q8, q9 }, [r0, :128], r2
++ vst1.16 {q10, q11}, [r5, :128], r2
++ bpl 1b
++
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_16_neon_10, export=1
++ band_16_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_10 (
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ ptrdiff_t stride_src, [r3]
++@ int16_t *sao_offset_val, [sp, #0]
++@ int sao_left_class, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++@
++@ band_8_16: band filter for narrow blocks at the given bit depth.
++@ Width >= 8 handles two 16-byte rows per macro pass; width < 8 (label 4)
++@ handles four 8-byte rows per pass. The result of "cmp ip, #8" has to
++@ survive the call to band_load_y and the pointer setup until "blt 4f".
++@ NOTE(review): sao_band_32b_16 and band_load_y are defined outside this
++@ view; the macro is assumed to contain the loop and to return the
++@ filtered data in q10/q11 - confirm.
++
++.macro band_8_16 bit_depth
++ ldr ip, [sp, #8] @ width
++ push {r4-r6, lr}
++ vmov.i64 q14, #0
++ cmp ip, #8
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_y
++ add r5, r0, r2 @ r5/r6 address the odd rows
++ add r6, r1, r3
++ lsl r2, #1
++ lsl r3, #1
++ blt 4f @ width < 8
++
++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++ "vld1.16 {q8}, [r1, :128], r3", \
++ "subs ip, #2", \
++ "vld1.16 {q9}, [r6, :128], r3", \
++ "", \
++ "", \
++ "vst1.16 {q10}, [r0, :128], r2", \
++ "vst1.16 {q11}, [r5, :128], r2"
++ pop {r4-r6, pc}
++4:
++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++ "vld1.16 {d16}, [r1, :64], r3", \
++ "subs ip, #4", \
++ "vld1.16 {d17}, [r6, :64], r3", \
++ "vld1.16 {d18}, [r1, :64], r3", \
++ "vld1.16 {d19}, [r6, :64], r3", \
++ "vst1.16 {d20}, [r0, :64], r2", \
++ "vst1.16 {d21}, [r5, :64], r2", \
++ "vst1.16 {d22}, [r0, :64], r2", \
++ "vst1.16 {d23}, [r5, :64], r2"
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_8_neon_10, export=1
++ band_8_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_32_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ band_c_32_16: band filter over interleaved two-component rows of 128
++@ bytes, one row per loop pass, for the given bit depth. Each row is read
++@ as four 32-byte vld2.16 chunks: r1/r0 cover bytes 0-31 and 64-95, r6/r5
++@ cover bytes 32-63 and 96-127. q4-q7 are used for data, so they are saved
++@ with vpush and restored before returning.
++@ NOTE(review): band_load_c and sao_band_64b_16 are defined outside this
++@ view; band_load_c is assumed to set up d0-d7, ip and r4 - confirm.
++
++.macro band_c_32_16 bit_depth
++ push {r4-r6, lr}
++ add r5, r0, #32
++ add r6, r1, #32
++ sub r2, #64 @ strides are reduced by the 64 bytes that the
++ sub r3, #64 @ first post-increment of each row already adds
++ vmov.i64 q14, #0
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c
++ mov lr, #64 @ lr is free after the bl; used as the in-row step
++ vpush {q4-q7}
++
++1: vld2.16 { q4, q5 }, [r1, :128], lr
++ subs ip, #1 @ flags are consumed by the bpl below
++ vld2.16 { q6, q7 }, [r6, :128], lr
++ vld2.16 { q8, q9 }, [r1, :128], r3
++ vld2.16 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "pld [r4]", \
++ "it ne; addne r4, r3"
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++ vst2.16 { q4, q5 }, [r0, :128], lr
++ vst2.16 { q6, q7 }, [r5, :128], lr
++ vst2.16 { q8, q9 }, [r0, :128], r2
++ vst2.16 {q10, q11}, [r5, :128], r2
++
++ bpl 1b
++
++ vpop {q4-q7}
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
++ band_c_32_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_16_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ band_c_16_16: band filter over interleaved two-component rows of 64
++@ bytes, one row per loop pass, for the given bit depth. r1/r0 cover the
++@ first 32 bytes of a row and r6/r5 the second 32 bytes. q14 holds 0 and
++@ q15 holds (1 << bit_depth) - 1 and both are handed to sao_band_64b_16.
++@ Only q8-q11 carry data; q4-q7 are callee-saved and are neither saved nor
++@ used here, so nothing in this macro may write them.
++@ NOTE(review): band_load_c and sao_band_64b_16 are defined outside this
++@ view; band_load_c is assumed to set up d0-d7 and the row counter in ip -
++@ confirm.
++
++.macro band_c_16_16 bit_depth
++ push {r4-r6, lr}
++ add r5, r0, #32 @ second 32 bytes of the dst row
++ add r6, r1, #32 @ second 32 bytes of the src row
++ vmov.i64 q14, #0
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c
++
++1: vld2.16 { q8, q9 }, [r1, :128], r3
++ subs ip, #1 @ flags are consumed by the bpl below
++ vld2.16 {q10, q11}, [r6, :128], r3
++
++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++ vst2.16 { q8, q9 }, [r0, :128], r2
++ vst2.16 {q10, q11}, [r5, :128], r2
++
++ bpl 1b
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
++ band_c_16_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_8_neon_10(
++@ uint8_t * dst [r0]
++@ uint8_t * src [r1]
++@ uint32_t dst_stride [r2]
++@ uint32_t src_stride [r3]
++@ const int16_t * table1 sp[0]
++@ uint32_t offset1 sp[4]
++@ const int16_t * table2 sp[8]
++@ uint32_t offset2 sp[12]
++@ int width sp[16]
++@ int height sp[20]
++@
++@ band_c_8_16: band filter for narrow interleaved two-component blocks at
++@ the given bit depth. Width >= 8 handles one 32-byte row per macro pass;
++@ width < 8 (label 4) handles two 16-byte rows per pass, with the first row
++@ in the low halves (d16/d18) and the second in the high halves (d17/d19).
++@ The result of "cmp ip, #8" has to survive the call to band_load_c.
++@ NOTE(review): sao_band_32b_16 and band_load_c are defined outside this
++@ view - confirm their register contract.
++
++.macro band_c_8_16 bit_depth
++ ldr ip, [sp, #16] @ width
++ push {r4-r6, lr}
++ vmov.i64 q14, #0
++ cmp ip, #8
++ vmov.i16 q15, #(1 << \bit_depth) - 1
++ bl band_load_c
++ blt 4f @ width < 8
++
++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "vld2.16 {q8,q9}, [r1, :128], r3", \
++ "subs ip, #1", \
++ "", \
++ "", \
++ "", \
++ "vst2.16 {q10,q11}, [r0, :128], r2"
++ pop {r4-r6, pc}
++4:
++ add r5, r0, r2 @ r5/r6 address the odd rows
++ add r6, r1, r3
++ lsl r2, #1
++ lsl r3, #1
++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++ "vld2.16 {d16,d18}, [r1, :128], r3", \
++ "subs ip, #2", \
++ "vld2.16 {d17,d19}, [r6, :128], r3", \
++ "", \
++ "", \
++ "vst2.16 {d20,d22}, [r0, :128], r2", \
++ "vst2.16 {d21,d23}, [r5, :128], r2"
++ pop {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
++ band_c_8_16 10
++endfunc
++
++
++@ =============================================================================
++@ SAO EDGE
++
++@ Edge filter core for 64 bytes of 8-bit data.
++@ Computes 2 + sign(c-a) + sign(c-b) per byte, translates it through the
++@ 8-byte tables at [r5] and adds the result to c with saturation.
++@ Result is returned in q0-q3. Flags and core registers are left untouched.
++@
++@ r0 destination address (not touched here - the caller stores)
++@ r2 stride to post-increment r0 with (not touched here)
++@ [r5] translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27. For Y d26=d27
++
++function edge_64b_body_8
++
++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
++ vcgt.u8 q13, q5, q1
++ vcgt.u8 q14, q6, q2
++ vcgt.u8 q15, q7, q3
++
++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
++ vcgt.u8 q1, q5
++ vcgt.u8 q2, q6
++ vcgt.u8 q3, q7
++
++ vsub.s8 q0, q12 @ a = sign(c-a)
++ vsub.s8 q1, q13
++ vsub.s8 q2, q14
++ vsub.s8 q3, q15
++
++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
++ vcgt.u8 q13, q5, q9
++ vcgt.u8 q14, q6, q10
++ vcgt.u8 q15, q7, q11
++
++ vsub.s8 q0, q12
++ vsub.s8 q1, q13
++ vsub.s8 q2, q14
++ vsub.s8 q3, q15
++
++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
++ vcgt.u8 q13, q9, q5
++ vcgt.u8 q14, q10, q6
++ vcgt.u8 q15, q11, q7
++
++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
++ vadd.s8 q1, q13
++ vmov.u8 q12, #2
++ vadd.s8 q2, q14
++ vadd.s8 q3, q15
++
++ vadd.s8 q0, q12
++ vadd.s8 q1, q12
++
++ vld1.8 {d26, d27}, [r5] @ d26/d27 = the two translate tables
++
++ vadd.s8 q2, q12
++ vuzp.8 q0, q1 @ even bytes to q0, odd bytes to q1
++ vmov.u8 q15, #128
++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
++
++ vtbl.8 d0, {d26}, d0
++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
++
++ vtbl.8 d1, {d26}, d1
++ vadd.s8 q14, q5, q15
++
++ vtbl.8 d2, {d27}, d2
++ vuzp.8 q2, q3
++
++ vtbl.8 d3, {d27}, d3
++
++ vtbl.8 d4, {d26}, d4
++ vzip.8 q0, q1 @ back to the original byte order
++
++ vtbl.8 d5, {d26}, d5
++ vqadd.s8 q0, q12
++ vqadd.s8 q1, q14
++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
++
++ vtbl.8 d6, {d27}, d6
++ vtbl.8 d7, {d27}, d7
++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
++ vzip.8 q2, q3
++
++ vsub.s8 q0, q15 @ undo the -128 bias
++ vqadd.s8 q2, q12
++ vqadd.s8 q3, q14
++ vsub.s8 q1, q15
++ vsub.s8 q2, q15
++ vsub.s8 q3, q15
++
++ bx lr
++endfunc
++
++@ Edge filter core for 64 bytes of 16-bit data.
++@ Computes 2 + sign(c-a) + sign(c-b) per element, narrows it to bytes,
++@ translates it through the 8-byte tables at [r5] and [r5, #8], adds the
++@ widened result to c and clips via clip16_4 with q12 = 0 and q13 = r4.
++@ Result is returned in q0-q3. Flags and core registers are left untouched.
++@ NOTE(review): clip16_4 is defined outside this view - confirm its
++@ argument order.
++@
++@ r0 destination address (not touched here - the caller stores)
++@ r2 stride to post-increment r0 with (not touched here)
++@ r4 upper clip value
++@ [r5] translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27. For Y d26=d27
++
++function edge_64b_body_16
++
++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
++ vcgt.u16 q13, q5, q1
++ vcgt.u16 q14, q6, q2
++ vcgt.u16 q15, q7, q3
++
++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
++ vcgt.u16 q1, q1, q5
++ vcgt.u16 q2, q2, q6
++ vcgt.u16 q3, q3, q7
++
++ vsub.s16 q0, q0, q12 // a = sign(c-a)
++ vsub.s16 q1, q1, q13
++ vsub.s16 q2, q2, q14
++ vsub.s16 q3, q3, q15
++
++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
++ vcgt.u16 q13, q5, q9
++ vcgt.u16 q14, q6, q10
++ vcgt.u16 q15, q7, q11
++
++ vsub.s16 q0, q0, q12
++ vsub.s16 q1, q1, q13
++ vsub.s16 q2, q2, q14
++ vsub.s16 q3, q3, q15
++
++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
++ vcgt.u16 q13, q9, q5
++ vcgt.u16 q14, q10, q6
++ vcgt.u16 q15, q11, q7
++
++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
++ vadd.s16 q1, q1, q13
++ vadd.s16 q2, q2, q14
++ vadd.s16 q3, q3, q15
++
++ vmov.u8 q12, #2
++
++ vmovn.s16 d0, q0 @ pack the four sign sums into bytes in q0-q1
++ vmovn.s16 d1, q1
++ vmovn.s16 d2, q2
++ vmovn.s16 d3, q3
++
++ vldr d26, [r5]
++
++ vuzp.8 q0, q1 @ even bytes to q0, odd bytes to q1
++
++ vldr d27, [r5, #8]
++
++ vadd.s8 q0, q0, q12 @ index = 2 + sign(c-a) + sign(c-b)
++ vadd.s8 q1, q1, q12
++
++ vmov.i64 q12, #0 @ lower clip bound
++
++ vtbl.8 d0, {d26}, d0
++ vtbl.8 d1, {d26}, d1
++ vtbl.8 d2, {d27}, d2
++ vtbl.8 d3, {d27}, d3
++
++ vdup.i16 q13, r4 @ upper clip bound
++
++ vzip.8 q0, q1 @ back to the original element order
++
++ @ Avoid overwrite whilst widening
++ vaddw.s8 q2, q6, d2
++ vaddw.s8 q3, q7, d3
++ vaddw.s8 q1, q5, d1
++ vaddw.s8 q0, q4, d0
++
++ @ now clip
++ clip16_4 q2, q3, q1, q0, q12, q13
++
++ bx lr
++endfunc
++
++
++@ Edge filter core for 16 bytes of 8-bit data.
++@ Computes 2 + sign(c-a) + sign(c-b) per byte, translates even bytes via
++@ d16 and odd bytes via d17 and adds the result to c with saturation.
++@ Result is returned in q0. Flags and core registers are left untouched.
++@
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q3, q9 (q10 is not written by this function)
++@
++@ d16, d17 (q8) xlat U, V
++@ q14.u8 #2
++@ q15.u8 #128
++
++function edge_16b_body_8
++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0
++ vadd.u8 q9, q14, q9
++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0
++ vsub.u8 q9, q9, q0
++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0
++ vadd.u8 q9, q9, q0
++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0
++ vsub.u8 q0, q9, q0 @ q0 = 2 + sign(c-a) + sign(c-b)
++
++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
++
++ vuzp.8 d0, d1 @ even bytes to d0, odd bytes to d1
++
++ vtbl.8 d0, {d16}, d0
++ vtbl.8 d1, {d17}, d1
++
++ vzip.8 d0, d1 @ back to the original byte order
++ vqadd.s8 q0, q3
++ vsub.s8 q0, q15 @ undo the -128 bias
++
++ bx lr
++endfunc
++
++@ Edge filter core for 16 bytes (8 elements) of 16-bit data.
++@ Computes 2 + sign(c-a) + sign(c-b) per element, narrows it to bytes,
++@ translates even bytes via d16 and odd bytes via d17, adds the widened
++@ result to c and clips to [q12, q15].
++@ Result is returned in q0. Flags and core registers are left untouched.
++@
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q9
++@
++@ q12, #0
++@ d16, d17 xlat U, V
++@ q14.u16 #2
++@ q15.u16 max
++function edge_16b_body_16
++ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0
++ vadd.u16 q9, q14, q9
++ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0
++ vsub.u16 q9, q9, q0
++ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0
++ vadd.u16 q9, q9, q0
++ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0
++ vsub.u16 q0, q9, q0 @ q0 = 2 + sign(c-a) + sign(c-b)
++
++ vmovn.s16 d0, q0
++ @ d1 will have random contents that we transform but
++ @ that doesn't matter as we then discard them
++ vuzp.8 d0, d1
++
++ vtbl.8 d0, {d16}, d0
++ vtbl.8 d1, {d17}, d1
++
++ vzip.8 d0, d1
++
++ vaddw.s8 q0, q1, d0 @ q0 = c + translated offset, widened to 16 bit
++
++ @ now clip
++ vmax.s16 q0, q12
++ vmin.s16 q0, q15
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
++@ int eo, [sp, #sp_base + 0]
++@ int width, [sp, #sp_base + 4]
++@ int height) [sp, #sp_base + 8]
++
++@ Jumps via jump_tab with
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ EDGE_SRC_STRIDE [r3]
++@ (1 << \bit_depth) - 1 [r4]
++@ * xlat_table [r5] // setup_64b only
++@ int height [r12]
++@
++@ 0 [q12] // > 8 bit
++@ 2 [q14]
++@ 128 [q15] // = 8 bit
++@ r4 [q15] // > 8 bit
++@
++@ edge_xxb_init is the common entry sequence for the edge functions: it
++@ builds the translate table in d16/d17, saves r4-r6/lr, selects the
++@ worker from jump_tab using eo, calls it (twice, 64 bytes apart, when do2
++@ is set) and returns to the caller.
++@ sp_base is the offset from sp to the stacked eo argument once the pushes
++@ done so far are accounted for.
++
++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
++
++@ Build translate registers
++@ As translate values can only be 0-4 we don't care about junk in the rest
++@ of the register
++@ Only one byte of each 16-bit table entry is loaded. Both branches leave
++@ entries 1, 2, 0, 3, 4 in lanes 0-4 of the table register.
++.if \is_chroma
++ ldr ip, [sp, #0]
++ push {r4-r6, lr} @ 16 bytes
++ vld1.8 {d16[2]}, [r3]
++ add r3, r3, #2
++ vld1.8 {d17[2]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[0]}, [r3]
++ add r3, r3, #2
++ vld1.8 {d17[0]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[1]}, [r3]
++ add r3, r3, #2
++ vld1.8 {d17[1]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[3]}, [r3]
++ add r3, r3, #2
++ vld1.8 {d17[3]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[4]}, [r3]
++ vld1.8 {d17[4]}, [ip]
++ movw r3, EDGE_SRC_STRIDE
++.set sp_base, 20
++.else
++ add ip, r3, #4
++ vld1.8 {d16[1]}, [r3]
++ add r3, r3, #2
++ vld1.8 {d17[0]}, [ip]
++ add ip, ip, #2
++ vld1.8 {d16[0]}, [r3]
++ add r3, r3, #6
++ vld1.8 {d17[1]}, [ip]
++ vld1.8 {d16[2]}, [r3]
++ movw r3, EDGE_SRC_STRIDE
++ push {r4-r6, lr} @ 16 bytes
++ vzip.8 d16, d17 @ merge the two partial loads into d16
++ vmov d17, d16 @ same table for both byte phases
++.set sp_base, 16
++.endif
++
++@ If setup_64b we need the xlat table on the stack
++@ r5 is set to the address that q8 (d16/d17) will occupy once the
++@ vpush {q4-q8} further down has executed.
++.if \setup_64b
++ sub r5, sp, #16
++.endif
++
++@ Get jump address
++@ We have a special case for width 4 as the calling code doesn't detect it
++@ If we may have w4 then we add a 2nd jump table after the 1st
++.if \check_w4
++ ldr r12, [sp, #sp_base + 4] @ width
++ adr r6, \jump_tab
++ ldr lr, [sp, #sp_base + 0] @ eo
++ cmp r12, #8
++ it lt
++ addlt r6, #16 @ skip the four entries of the 1st table
++.else
++ ldr lr, [sp, #sp_base + 0] @ eo
++ adr r6, \jump_tab
++.endif
++
++ ldr r12, [sp, #sp_base + 8] @ height
++
++.if \bit_depth > 8
++ movw r4, (1 << \bit_depth) - 1
++.endif
++.if \setup_16b
++.if \bit_depth > 8
++ vmov.i64 q12, #0
++ vdup.16 q15, r4
++ vmov.u16 q14, #2
++.else
++ vmov.u8 q15, #128
++ vmov.u8 q14, #2
++.endif
++.endif
++
++@ If setup_64b we need q4-q7 saved.
++.if \setup_64b
++ vpush {q4-q8} @ 80 bytes, q8 pushed first
++.set sp_base, sp_base + 80
++.endif
++
++ ldr r6, [r6, lr, lsl #2] @ r6 = jump_tab[eo]
++
++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
++.if \do2
++ push {r0, r1, r6, r12} @ the worker consumes these; keep them for pass 2
++.if jent_pic
++ bl 98f
++.else
++ blx r6
++.endif
++ pop {r0, r1, r6, r12}
++
++ add r0, #64
++ add r1, #64
++.endif
++
++.if jent_pic
++ bl 98f
++.else
++ blx r6
++.endif
++
++@ Tidy up & return
++.if \setup_64b
++ vpop {q4-q8} @ spurious but harmless load of q8
++.endif
++ pop {r4-r6, pc}
++
++.if jent_pic && !\xjump
++@ Magic label - used as 98b in jent macro
++98:
++ add pc, r6
++.endif
++.endm
++
++
++@ Entry sequence for the 16-byte workers: edge_xxb_init with setup_16b=1,
++@ so q14/q15 (and q12 above 8 bit) are preloaded for the edge_16b bodies.
++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
++.endm
++
++@ Entry sequence for the 64-byte workers: edge_xxb_init with setup_64b=1,
++@ so q4-q7 are saved and r5 points at the translate table on the stack.
++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
++.endm
++
++
++@ Edge worker with both neighbours on the same row, 64 bytes per row, one
++@ row per loop pass. An 80-byte load starting 8 bytes before the row gives
++@ c in q4-q7; a is that data shifted by -pb bytes and b by +pb bytes.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++@ r6 keeps the return address because the bl to body_fn overwrites lr.
++.macro edge_64b_e0, body_fn, pb
++ sub r1, #8
++ mov r6, lr
++1: vldm r1, {d7-d16}
++ // load a
++ vext.8 q0, q3, q4, #(16 - \pb)
++ add r1, r3
++ vext.8 q1, q4, q5, #(16 - \pb)
++ subs r12, #1 @ flags are consumed by the bgt after body_fn
++ vext.8 q2, q5, q6, #(16 - \pb)
++ vext.8 q3, q6, q7, #(16 - \pb)
++ pld [r1]
++ // load b
++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
++ pld [r1, #64]
++ vext.8 q8, q4, q5, #\pb
++ vext.8 q9, q5, q6, #\pb
++ vext.8 q10, q6, q7, #\pb
++ bl \body_fn
++ vstm r0, {q0-q3}
++ add r0, r0, r2
++ bgt 1b
++ bx r6
++.endm
++
++@ Edge worker with both neighbours on the same row, 32 bytes per row, two
++@ rows per loop pass. Row n goes through r1/r0 (c in q4-q5, result q0-q1)
++@ and row n+1 through r6/r7 (c in q6-q7, result q2-q3).
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_32bx2_e0, body_fn, pb
++ add r6, r1, r3
++ push {r7,lr}
++ sub r1, #8
++ add r7, r0, r2
++ lsl r2, #1
++1: vldmia r1, {d7-d12}
++ // load a
++ vext.8 q0, q3, q4, #16 - \pb
++ add r1, r1, r3, lsl #1
++ vext.8 q1, q4, q5, #16 - \pb
++ subs r12, #2 @ flags are consumed by the bgt after body_fn
++ // load b
++ vext.8 q8, q4, q5, #\pb
++ vext.8 q9, q5, q6, #\pb
++ vldr d25, [r6, #-8]
++ vldmia r6, {d12-d15}
++ vldr d26, [r6, #32]
++ // load a
++ vext.8 q2, q12, q6, #16 - \pb
++ add r6, r6, r3, lsl #1
++ vext.8 q3, q6, q7, #16 - \pb
++ // load b
++ vext.8 q10, q6, q7, #\pb
++ vext.8 q11, q7, q13, #\pb
++ bl \body_fn
++ vst1.8 {q0-q1}, [r0, :256], r2
++ vst1.8 {q2-q3}, [r7, :256], r2
++ bgt 1b
++ pop {r7,pc}
++.endm
++
++@ Edge worker with both neighbours on the same row, 16 bytes per row, one
++@ row per loop pass. The 32-byte load starting 8 bytes before the row puts
++@ c in q1; a (q0) and b (q2) are c shifted by -pb and +pb bytes.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++@ r6 keeps the return address because the bl to body_fn overwrites lr.
++.macro edge_16b_e0, body_fn, pb
++ sub r1, #8
++ mov r6, lr
++1: vldmia r1, {d1-d4}
++ add r1, r3
++ subs r12, #1 @ flags are consumed by the bgt after body_fn
++ vext.8 q0, q0, q1, #16 - \pb
++ vext.8 q2, q1, q2, #\pb
++
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2
++ bgt 1b
++ bx r6
++.endm
++
++@ Edge worker with both neighbours on the same row, 8 bytes per row, two
++@ rows per loop pass packed into one q register each:
++@ a = d0 (row n), d1 (row n+1); c = d2, d3; b = d4, d5.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_8bx2_e0, body_fn, pb
++ add r6, r1, r3
++ push {r7,lr}
++ sub r1, #8
++ add r7, r0, r2
++ lsl r2, #1
++1: vldmia r1, {d1-d2}
++ vldmia r6, {d3-d4}
++ vldr d6, [r1, #16]
++ subs r12, #2 @ flags are consumed by the bgt after body_fn
++ vldr d7, [r6, #-8]
++ add r1, r1, r3, lsl #1
++ vext.8 d0, d1, d2, #8 - \pb
++ add r6, r6, r3, lsl #1
++ vext.8 d5, d3, d4, #\pb
++ vext.8 d4, d2, d6, #\pb
++ vext.8 d1, d7, d3, #8 - \pb
++
++ bl \body_fn
++ vst1.8 {d0}, [r0, :64], r2
++ vst1.8 {d1}, [r7, :64], r2
++ bgt 1b
++ pop {r7,pc}
++.endm
++
++@ Edge worker with both neighbours on the same row, 4 bytes per row, four
++@ rows per loop pass packed into one q register each (a = q0, c = q1,
++@ b = q2). Rows n and n+2 are read through r1, rows n+1 and n+3 through r6.
++@ All loads are 8 bytes wide, so there are two variants chosen by bit 2 of
++@ r1: in the first the wanted 4 bytes are the low half of the aligned
++@ doubleword (merged with vsli), in the second they are the high half
++@ (merged with vsri).
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_4bx4_e0, body_fn, pb
++ add r6, r1, r3
++ push {r7,lr}
++ add r7, r0, r2
++ lsl r2, #1
++
++ tst r1, #4
++ bne 2f
++1: // r1 (and assumed r6) are 64-bit aligned
++ vldr d2, [r1]
++ vldr d0, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ vldr d20, [r6]
++ subs r12, #4 @ flags are consumed by the bgt after body_fn
++ vldr d18, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d3, [r1]
++ vshr.u64 d4, d2, #\pb * 8
++ vldr d1, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ vldr d21, [r6]
++ vext.8 d0, d0, d2, #8 - \pb
++ vldr d19, [r6,#-8]
++ add r6, r6, r3, lsl #1
++ vshr.u64 d22, d20, #\pb * 8
++ vext.8 d18, d18, d20, #8 - \pb
++ vshr.u64 d5, d3, #\pb * 8
++ vext.8 d1, d1, d3, #8 - \pb
++ vshr.u64 d23, d21, #\pb * 8
++ vext.8 d19, d19, d21, #8 - \pb
++ vsli.64 q1, q10, #32 @ r6 rows go into the upper 32 bits of each half
++ vsli.64 q2, q11, #32
++ vsli.64 q0, q9, #32
++
++ bl \body_fn
++ vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 1b
++ pop {r7,pc}
++
++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned
++ vldr d20, [r1, #-4]
++ vldr d22, [r1, #4]
++ add r1, r1, r3, lsl #1
++ vldr d2, [r6, #-4]
++ subs r12, #4 @ flags are consumed by the bgt after body_fn
++ vldr d4, [r6, #4]
++ add r6, r6, r3, lsl #1
++ vldr d21, [r1, #-4]
++ vshl.i64 d18, d20, #\pb * 8
++ vldr d23, [r1, #4]
++ add r1, r1, r3, lsl #1
++ vldr d3, [r6, #-4]
++ vext.8 d22, d20, d22, #\pb
++ vldr d5, [r6, #4]
++ add r6, r6, r3, lsl #1
++ vshl.i64 d0, d2, #\pb * 8
++ vext.8 d4, d2, d4, #\pb
++ vshl.i64 d19, d21, #\pb * 8
++ vext.8 d23, d21, d23, #\pb
++ vshl.i64 d1, d3, #\pb * 8
++ vext.8 d5, d3, d5, #\pb
++ vsri.64 q1, q10, #32 @ r1 rows go into the lower 32 bits of each half
++ vsri.64 q0, q9, #32
++ vsri.64 q2, q11, #32
++
++ bl \body_fn
++ vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 2b
++ pop {r7,pc}
++.endm
++
++
++@ Edge worker with the neighbours directly above (a) and below (c's next
++@ row, b), 64 bytes per row, one row per loop pass. After each row the
++@ registers are rotated (c -> a, b -> c) so only b is loaded per pass.
++@ The loop exit is folded into the rotate sequence with conditional
++@ pop/bx, using the flags set by "subs r12, #1" before body_fn.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_64b_e1, body_fn
++ sub r1, r3 @ start one row above
++ push {lr}
++ add r6, r1, #32 @ r6 covers the second 32 bytes of each row
++ // load a
++ vld1.8 {q0-q1}, [r1, :256], r3
++ vld1.8 {q2-q3}, [r6, :256], r3
++ // load c
++ vld1.8 {q4-q5}, [r1, :256], r3
++ vld1.8 {q6-q7}, [r6, :256], r3
++1: // load b
++ vld1.8 {q8-q9}, [r1, :256], r3
++ subs r12, #1
++ vld1.8 {q10-q11}, [r6, :256], r3
++ bl \body_fn
++ vstm r0, {q0-q3}
++ // copy c to a
++ vmov.64 q0, q4
++ pld [r1, r3]
++ vmov.64 q1, q5
++ it le
++ pople {lr}
++ vmov.64 q2, q6
++ it le
++ bxle lr
++ vmov.64 q3, q7
++ add r0, r0, r2
++ // copy b to c
++ vmov.64 q4, q8
++ vmov.64 q5, q9
++ vmov.64 q6, q10
++ vmov.64 q7, q11
++ b 1b
++.endm
++
++@ Edge worker with the neighbours directly above and below, 32 bytes per
++@ row, two rows per loop pass. On entry to each pass q0-q1 hold the row
++@ above the pair and q2-q3 the first row of the pair; both result rows are
++@ stored through r0.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++@ r6 keeps the return address because the bl to body_fn overwrites lr.
++.macro edge_32bx2_e1, body_fn
++ sub r6, r1, r3
++ vld1.8 {q2-q3}, [r1, :256], r3
++ vld1.8 {q0-q1}, [r6, :256]
++ mov r6, lr
++
++1: @ Given the data duplication here we could obviously do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ vld1.8 {q8-q9}, [r1, :256], r3
++ subs r12, #2 @ flags are consumed by the bxle below
++ vmov q4, q2
++ vmov q5, q3
++ vld1.8 {q10-q11}, [r1, :256], r3
++ vmov q6, q8
++ vmov q7, q9
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2
++ // copy b to a
++ vmov q0, q8
++ vmov q1, q9
++ vst1.8 {q2-q3}, [r0, :256], r2
++ vmov q2, q10
++ it le
++ bxle r6
++ vmov q3, q11
++ b 1b
++.endm
++
++@ Edge worker with the neighbours directly above and below, 16 bytes per
++@ row, one row per loop pass. After each row c becomes a and b becomes c,
++@ so only b is loaded per pass.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++@ r6 keeps the return address because the bl to body_fn overwrites lr.
++.macro edge_16b_e1, body_fn
++ sub r6, r1, r3
++ // load c
++ vld1.8 {q1}, [r1, :128], r3
++ // load a
++ vld1.8 {q0}, [r6, :128]
++ mov r6, lr
++1: // load b
++ vld1.8 {q2}, [r1, :128], r3
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2
++ subs r12, #1
++ // copy c to a
++ vmov.64 q0, q1
++ it le
++ bxle r6
++ // copy b to c
++ vmov.64 q1, q2
++ b 1b
++.endm
++
++@ Edge worker with the neighbours directly above and below, 8 bytes per
++@ row, two rows per loop pass packed as a = d0,d1; c = d2,d3; b = d4,d5.
++@ r6 reads the odd source rows and r1 the even ones, both stepping two rows.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_8bx2_e1, body_fn
++ sub r6, r1, r3
++ lsl r3, #1
++ push {r7, lr}
++ vld1.8 {d1}, [r1, :64], r3
++ vld1.8 {d0}, [r6, :64], r3
++ add r7, r0, r2
++ lsl r2, #1
++1: @ Given the data duplication here we could obviously do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ vld1.8 {d4}, [r6, :64], r3
++ vmov d2, d1
++ vld1.8 {d5}, [r1, :64], r3
++ subs r12, #2 @ flags are consumed by the bgt after body_fn
++ vmov d3, d4
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2
++ vst1.8 {d1}, [r7, :64], r2
++
++ // copy b to a
++ vmov q0, q2
++ bgt 1b
++ pop {r7, pc}
++.endm
++
++@ Edge worker with the neighbours directly above and below, 4 bytes per
++@ row, four rows per loop pass packed into one q register each (a = q0,
++@ c = q1, b = q2). d22 carries the last two rows loaded so that the next
++@ pass can rebuild a and c without reloading them.
++@ The stores for one pass are issued at label 1 interleaved with the
++@ register setup for the next pass; label 3 does the final stores.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_4bx4_e1, body_fn
++ sub r6, r1, r3
++ lsl r3, #1
++ push {r7, lr}
++ vld1.32 {d0[1]}, [r1, :32], r3
++ add r7, r0, r2
++ vld1.32 {d0[0]}, [r6, :32], r3
++ lsl r2, #1
++ vld1.32 {d4[1]}, [r1, :32], r3
++ vld1.32 {d4[0]}, [r6, :32], r3
++ vld1.32 {d5[1]}, [r1, :32], r3
++ vld1.32 {d5[0]}, [r6, :32], r3
++ vmov d1, d4
++ vext.32 d2, d0, d4, #1
++ subs r12, #4 @ flags are consumed by the ble after body_fn
++ vmov d22, d5
++ vext.32 d3, d4, d5, #1
++ b 2f
++
++1: vst1.32 {d0[0]}, [r0, :32], r2
++ vext.32 d2, d22, d4, #1
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vmov d0, d22
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vext.32 d3, d4, d5, #1
++ vst1.32 {d1[1]}, [r7, :32], r2
++ vmov d1, d4
++ vmov d22, d5
++2: @ Given the data duplication here we could probably do better than
++ @ using the generic body_fn but it almost certainly isn't worth it
++ bl \body_fn
++ ble 3f
++ vld1.32 {d4[0]}, [r6, :32], r3
++ subs r12, #4 @ flags are consumed by the ble after the next body_fn
++ vld1.32 {d4[1]}, [r1, :32], r3
++ vld1.32 {d5[0]}, [r6, :32], r3
++ vld1.32 {d5[1]}, [r1, :32], r3
++ b 1b
++
++3: vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32]
++ vst1.32 {d1[1]}, [r7, :32]
++ pop {r7, pc}
++.endm
++
++@ Diagonal edge worker, 64 bytes per row, one row per loop pass.
++@ a is the row above shifted by -pb bytes and b the row below shifted by
++@ +pb bytes. After each row the next a is rebuilt from c and the next c
++@ from b, so only b plus a few edge doublewords are loaded per pass.
++@ The loop exit is folded into that rebuild with conditional pop/bx,
++@ using the flags set by "subs r12, #1" before body_fn.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_64b_e2, body_fn, pb
++ push {lr}
++ sub r6, r1, r3
++ // load c and a
++ vld1.8 {q4-q5}, [r1, :128]
++ vldr d25, [r6, #-8]
++ vldmia r6, {d16-d23}
++ vext.8 q0, q12, q8, #16 - \pb
++ add r6, r1, #32
++ vext.8 q1, q8, q9, #16 - \pb
++ add r1, r1, r3
++ vext.8 q2, q9, q10, #16 - \pb
++ vld1.8 {q6-q7}, [r6, :128]
++ sub r6, r1, r3 @ r6 = row of c from here on
++ vext.8 q3, q10, q11, #16 - \pb
++
++1: // load b
++ vldmia r1, {d16-d24}
++ vext.8 q8, q8, q9, #\pb
++ pld [r1, r3]
++ vext.8 q9, q9, q10, #\pb
++ subs r12, #1
++ vext.8 q10, q10, q11, #\pb
++ vext.8 q11, q11, q12, #\pb
++ bl \body_fn
++ // next a is mostly available in c
++ vldr d25, [r6, #-8]
++ vstmia r0, {q0-q3}
++ vext.8 q3, q6, q7, #16 - \pb
++ it le
++ pople {lr}
++ vext.8 q2, q5, q6, #16 - \pb
++ it le
++ bxle lr
++ vext.8 q1, q4, q5, #16 - \pb
++ add r6, r6, r3
++ vext.8 q0, q12, q4, #16 - \pb
++ add r0, r0, r2
++ // next c is mostly available in b
++ vldr d8, [r1]
++ vext.8 d9, d16, d17, #8 - \pb
++ vext.8 q5, q8, q9, #16 - \pb
++ add r1, r1, r3
++ vext.8 q6, q9, q10, #16 - \pb
++ pld [r6, #-8]
++ vext.8 q7, q10, q11, #16 - \pb
++ b 1b
++.endm
++
++@ Diagonal edge worker, 32 bytes per row, two rows per loop pass.
++@ a is the row above shifted by -pb bytes and b the row below shifted by
++@ +pb bytes. r6 walks the odd source rows (starting one row above r1) and
++@ r1 the even ones; results go to r0 (first row) and r7 (second row).
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_32bx2_e2, body_fn, pb
++ sub r6, r1, r3
++ push {r7, lr}
++ add r7, r0, r2
++ lsl r2, #1
++ // load a and first 32b of c
++ vld1.8 {q4-q5}, [r1, :256]
++ vldr d25, [r6, #-8]
++ vld1.8 {q13-q14}, [r6, :256]
++ vldr d31, [r1, #-8]
++ add r6, r6, r3, lsl #1
++ vext.8 q0, q12, q13, #16 - \pb
++ add r1, r1, r3, lsl #1
++ vext.8 q1, q13, q14, #16 - \pb
++ vext.8 q2, q15, q4, #16 - \pb
++ vext.8 q3, q4, q5, #16 - \pb
++1:
++ // load second 32b of c and second 32b of b
++ vldmia r6, {d12-d16}
++ vldmia r1, {d20-d24}
++ // first 32b of b is mostly available in second 32b of c
++ vext.8 q9, q7, q8, #\pb @ done first: it reads d16 before q8 is rewritten
++ subs r12, #2 @ flags are consumed by the ble after body_fn
++ vext.8 q8, q6, q7, #\pb
++ vext.8 q10, q10, q11, #\pb
++ vext.8 q11, q11, q12, #\pb
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2
++ vst1.8 {q2-q3}, [r7, :256], r2
++ ble 2f
++
++ vldr d25, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d8, [r1]
++ vext.8 d9, d20, d21, #8 - \pb
++ vldr d31, [r1, #-8]
++ add r1, r1, r3, lsl #1
++ // first 32b of a is mostly available in second 32b of c
++ vext.8 q1, q6, q7, #16 - \pb
++ vext.8 q0, q12, q6, #16 - \pb
++ // first 32b of c is mostly available in second 32b of b
++ vext.8 q5, q10, q11, #16 - \pb
++ // second 32b of a is mostly available in first 32b of c
++ vext.8 q2, q15, q4, #16 - \pb
++ vext.8 q3, q4, q5, #16 - \pb
++ b 1b
++
++2: pop {r7, pc}
++.endm
++
++@ Diagonal edge worker, 16 bytes per row, one row per loop pass.
++@ a is the row above shifted by -pb bytes (built from d19 and q10) and b
++@ the row below shifted by +pb bytes. After each row the old c becomes
++@ the base for the next a and the next c is rebuilt from b.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_16b_e2, body_fn, pb
++ push {lr}
++ sub r6, r1, r3
++ vld1.8 {q1}, [r1, :128], r3
++ vldr d19, [r6, #-8]
++ vld1.8 {q10}, [r6, :128], r3
++
++1: vldmia r1, {d4-d6}
++ vext.8 q0, q9, q10, #16 - \pb
++ subs r12, #1 @ flags are consumed by the ble after body_fn
++ vext.8 q2, q2, q3, #\pb
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2
++ ble 2f
++ vmov q10, q1
++ vldr d2, [r1]
++ add r1, r1, r3
++ vldr d19, [r6, #-8]
++ add r6, r6, r3
++ vext.8 d3, d4, d5, #8 - \pb
++ b 1b
++
++2: pop {pc}
++.endm
++
++@ Diagonal edge worker, 8 bytes per row, two rows per loop pass packed as
++@ a = d0,d1; c = d2,d3; b = d4,d5. a is the row above shifted by -pb bytes
++@ and b the row below shifted by +pb bytes. r6 walks the odd source rows
++@ (starting one row above r1) and r1 the even ones.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++@ NOTE(review): the 16-byte loads from r1 carry a :128 alignment hint even
++@ though rows are 8 bytes wide - confirm the source buffer guarantees it.
++.macro edge_8bx2_e2, body_fn, pb
++ sub r6, r1, r3
++ push {r7, lr}
++ add r7, r0, r2
++ lsl r2, #1
++ vldr d18, [r6, #-8]
++ vldr d19, [r6]
++ add r6, r6, r3, lsl #1
++ vldr d20, [r1, #-8]
++ vldr d2, [r1]
++ add r1, r1, r3, lsl #1
++ vldmia r6, {d3-d4}
++ vld1.8 {d21-d22}, [r1, :128]
++
++1: vext.8 d0, d18, d19, #8 - \pb
++ vext.8 d4, d3, d4, #\pb
++ vext.8 d1, d20, d2, #8 - \pb
++ subs r12, #2 @ flags are consumed by the ble after body_fn
++ vext.8 d5, d21, d22, #\pb
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2
++ vst1.8 {d1}, [r7, :64], r2
++ ble 2f
++
++ vldr d18, [r6, #-8]
++ add r6, r6, r3, lsl #1
++ vldr d20, [r1, #-8]
++ vmov d19, d3
++ vldr d2, [r1]
++ add r1, r1, r3, lsl #1
++ vldmia r6, {d3-d4}
++ vld1.8 {d21-d22}, [r1, :128]
++ b 1b
++
++2: pop {r7, pc}
++.endm
++
++@ Diagonal edge worker, 4 bytes per row, four rows per loop pass packed
++@ into one q register each (a = q0, c = q1, b = q2).
++@ Three row pointers are kept: r6 = row above minus pb bytes (a), r1 = the
++@ row itself (c) and r8 = row below plus pb bytes (b). Every row is loaded
++@ for each role with 32-bit lane loads that carry no alignment hint.
++@ r9 is pushed and popped but not otherwise used in this macro.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_4bx4_e2, body_fn, pb
++ sub r6, r1, r3
++ push {r7-r9, lr}
++ add r8, r1, r3
++ sub r6, r6, #\pb
++ add r8, r8, #\pb
++ add r7, r0, r2
++ lsl r2, #1
++
++1: vld1.32 {d0[0]}, [r6], r3
++ subs r12, #4 @ flags are consumed by the bgt after body_fn
++ vld1.32 {d2[0]}, [r1], r3
++ vld1.32 {d4[0]}, [r8], r3
++ vld1.32 {d0[1]}, [r6], r3
++ vld1.32 {d2[1]}, [r1], r3
++ vld1.32 {d4[1]}, [r8], r3
++ vld1.32 {d1[0]}, [r6], r3
++ vld1.32 {d3[0]}, [r1], r3
++ vld1.32 {d5[0]}, [r8], r3
++ vld1.32 {d1[1]}, [r6], r3
++ vld1.32 {d3[1]}, [r1], r3
++ vld1.32 {d5[1]}, [r8], r3
++
++ bl \body_fn
++
++ vst1.32 {d0[0]}, [r0, :32], r2
++ vst1.32 {d0[1]}, [r7, :32], r2
++ vst1.32 {d1[0]}, [r0, :32], r2
++ vst1.32 {d1[1]}, [r7, :32], r2
++ bgt 1b
++
++ pop {r7-r9,pc}
++.endm
++
++@ Anti-diagonal edge worker, 64 bytes per row, one row per loop pass.
++@ a is the row above shifted by +pb bytes and b the row below shifted by
++@ -pb bytes. After each row the next a is rebuilt from c and the next c
++@ from b, so only b plus a few edge doublewords are loaded per pass.
++@ The loop exit is folded into that rebuild with conditional pop/bx,
++@ using the flags set by "subs r12, #1" before body_fn.
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_64b_e3, body_fn, pb
++ push {lr}
++ sub r6, r1, r3
++ // load c and a
++ vld1.8 {q4-q5}, [r1, :128]
++ vldmia r6, {d16-d24}
++ vext.8 q0, q8, q9, #\pb
++ add r6, r1, #32
++ vext.8 q1, q9, q10, #\pb
++ add r1, r1, r3
++ vext.8 q2, q10, q11, #\pb
++ vld1.8 {q6-q7}, [r6, :128]
++ sub r6, r1, r3 @ r6 = row of c from here on
++ vext.8 q3, q11, q12, #\pb
++
++1: // load b
++ vldr d17, [r1, #-8]
++ vldmia r1, {d18-d25}
++ vext.8 q8, q8, q9, #16 - \pb
++ pld [r1, r3]
++ vext.8 q9, q9, q10, #16 - \pb
++ subs r12, #1
++ vext.8 q10, q10, q11, #16 - \pb
++ vext.8 q11, q11, q12, #16 - \pb
++ bl \body_fn
++ // next a is mostly available in c
++ vldr d24, [r6, #64]
++ vstmia r0, {q0-q3}
++ vext.8 q0, q4, q5, #\pb
++ it le
++ pople {lr}
++ vext.8 q1, q5, q6, #\pb
++ it le
++ bxle lr
++ vext.8 q2, q6, q7, #\pb
++ add r6, r6, r3
++ vext.8 q3, q7, q12, #\pb
++ add r0, r0, r2
++ // next c is mostly available in b
++ vext.8 d14, d22, d23, #\pb
++ vldr d15, [r1, #56]
++ vext.8 q4, q8, q9, #\pb
++ add r1, r1, r3
++ vext.8 q5, q9, q10, #\pb
++ vext.8 q6, q10, q11, #\pb
++ b 1b
++.endm
++
++@ Anti-diagonal edge worker, 32 bytes per row, two rows per loop pass.
++@ a is the row above shifted by +pb bytes and b the row below shifted by
++@ -pb bytes. r6 walks the odd source rows (starting one row above r1) and
++@ r1 the even ones; results go to r0 (first row) and r7 (second row).
++@ In: r0 dst, r1 src, r2 dst stride, r3 src stride, r12 row count.
++.macro edge_32bx2_e3, body_fn, pb
++ sub r6, r1, r3
++ push {r7, lr}
++ add r7, r0, r2
++ lsl r2, #1
++ // load a and first 32b of c
++ vldmia r1, {d8-d12}
++ vldmia r6, {d24-d28}
++ vext.8 q2, q4, q5, #\pb
++ add r6, r6, r3, lsl #1
++ vext.8 q3, q5, q6, #\pb
++ add r1, r1, r3, lsl #1
++ vext.8 q0, q12, q13, #\pb
++ vext.8 q1, q13, q14, #\pb
++1:
++ // load second 32b of c and second 32b of b
++ vldr d25, [r6, #-8]
++ subs r12, #2 @ flags are consumed by the ble after body_fn
++ vldmia r6, {d12-d15}
++ vldr d27, [r1, #-8]
++ vldmia r1, {d20-d23}
++ // first 32b of b is mostly available in second 32b of c
++ vext.8 q8, q12, q6, #16 - \pb
++ vext.8 q9, q6, q7, #16 - \pb
++ vext.8 q11, q10, q11, #16 - \pb @ done first: it reads q10 before q10 is rewritten
++ vext.8 q10, q13, q10, #16 - \pb
++
++ bl \body_fn
++
++ vst1.8 {q0-q1}, [r0, :256], r2
++ vst1.8 {q2-q3}, [r7, :256], r2
++ ble 2f
++
++ vldr d24, [r6, #32]
++ add r6, r6, r3, lsl #1
++ vldr d11, [r1, #24]
++ vext.8 d10, d22, d23, #\pb
++ vldr d30, [r1, #32]
++ add r1, r1, r3, lsl #1
++ // first 32b of a is mostly available in second 32b of c
++ vext.8 q0, q6, q7, #\pb
++ vext.8 q1, q7, q12, #\pb
++ // first 32b of c is mostly available in second 32b of b
++ vext.8 q4, q10, q11, #\pb
++ // second 32b of a is mostly available in first 32b of c
++ vext.8 q3, q5, q15, #\pb
++ vext.8 q2, q4, q5, #\pb
++ b 1b
++
++2: pop {r7, pc}
++.endm
++
++.macro edge_16b_e3, body_fn, pb
++ push {lr} // lr is overwritten by the bl to body_fn below
++ sub r6, r1, r3 // r6 = row before the current one (r3 is used as the source row step)
++ vld1.8 {q1}, [r1, :128], r3 // q1 = current row; r1 moves on to the next row
++ vldmia r6, {d18-d20} // 24 bytes of the previous row
++ add r6, r6, r3 // r6 = current row
++
++1: vldr d5, [r1, #-8] // 8 bytes to the left of the next row
++ vld1.8 {q3}, [r1, :128] // next row
++ subs r12, #1 // one row per pass; flags are tested by the ble after body_fn
++ vext.8 q0, q9, q10, #\pb // previous row, starting pb bytes to the right
++ vext.8 q2, q2, q3, #16 - \pb // next row, starting pb bytes to the left
++ bl \body_fn
++ vst1.8 {q0}, [r0, :128], r2 // store the result row; r0 += r2
++ ble 2f // r12 <= 0: done (relies on body_fn leaving the flags from subs intact)
++ vmov q9, q1 // the current row becomes the previous row
++ vldr d3, [r1, #8] // upper 8 bytes of the new current row
++ add r1, r1, r3
++ vldr d20, [r6, #16] // 8 bytes to the right of the new previous row
++ add r6, r6, r3
++ vext.8 d2, d4, d5, #\pb // lower 8 bytes of the new current row, rebuilt from q2 (assumes body_fn preserved q2 - confirm)
++ b 1b
++
++2: pop {pc} // return to the caller
++.endm
++
++.macro edge_8bx2_e3, body_fn, pb
++ sub r6, r1, r3 // r6 = row before the first row of the pair (r3 is used as the source row step)
++ push {r7, lr} // lr is overwritten by the bl to body_fn below
++ add r7, r0, r2 // r7 = output pointer for the second row of each pair
++ lsl r2, #1 // r0 and r7 both advance two output rows per pass
++ vld1.8 {d18-d19}, [r6] // 16 bytes of the row before the pair
++ add r6, r6, r3, lsl #1 // r6 = second row of the pair
++ vldr d20, [r1, #8] // 8 bytes to the right of the first row
++ vldr d2, [r1] // first row of the pair
++ add r1, r1, r3, lsl #1 // r1 = row after the pair
++ vldr d4, [r6, #-8] // 8 bytes to the left of the second row
++ vldr d3, [r6] // second row of the pair
++ vldr d21, [r1, #-8] // 8 bytes to the left of the row after the pair
++ vldr d22, [r1] // row after the pair
++
++1: vext.8 d0, d18, d19, #\pb // row before the pair, starting pb bytes to the right
++ vext.8 d4, d4, d3, #8 - \pb // second row, starting pb bytes to the left
++ vext.8 d1, d2, d20, #\pb // first row, starting pb bytes to the right
++ subs r12, #2 // two rows per pass; flags are tested by the ble after body_fn
++ vext.8 d5, d21, d22, #8 - \pb // row after the pair, starting pb bytes to the left
++
++ bl \body_fn
++
++ vst1.8 {d0}, [r0, :64], r2 // first row of the pair
++ vst1.8 {d1}, [r7, :64], r2 // second row of the pair
++ ble 2f // r12 <= 0: done (relies on body_fn leaving the flags from subs intact)
++
++ vldr d19, [r6, #8] // 8 bytes to the right of the old second row
++ add r6, r6, r3, lsl #1
++ vldr d20, [r1, #8]
++ vmov d18, d3 // the old second row becomes the row before the new pair
++ vldr d2, [r1]
++ add r1, r1, r3, lsl #1
++ vldr d4, [r6, #-8]
++ vldr d3, [r6]
++ vldr d21, [r1, #-8]
++ vldr d22, [r1]
++ b 1b
++
++2: pop {r7, pc} // restore r7 and return to the caller
++.endm
++
++.macro edge_4bx4_e3, body_fn, pb
++ @ e3 is the same as e2 but with the X offset reversed
++ edge_4bx4_e2 \body_fn, (-\pb)
++.endm
++
++@ Jump table entry - if in Thumb mode the bottom bit must be set
++@ ? There is probably a real asm instruction to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
++@ simpler and clearer in the code to stick with .word
++T .word (0 + \lab) - (4 + 98b) @ T32: offset of lab from label 98 plus 4 (NOTE(review): presumably the PC read-ahead - confirm)
++A .word (0 + \lab) - (8 + 98b) @ A32: as above with 8 in place of 4
++.else
++T .word 1 + \lab @ T32: absolute address with bit 0 set
++A .word \lab @ A32: plain absolute address
++.endif
++.endm
++
++.macro edge_64b_bodies, body_fn, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++
++0: edge_64b_e0 \body_fn, \pb
++10: edge_64b_e1 \body_fn
++20: edge_64b_e2 \body_fn, \pb
++30: edge_64b_e3 \body_fn, \pb
++.endm
++
++.macro edge_32bx2_bodies, body_fn, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++
++0: edge_32bx2_e0 \body_fn, \pb
++10: edge_32bx2_e1 \body_fn
++20: edge_32bx2_e2 \body_fn, \pb
++30: edge_32bx2_e3 \body_fn, \pb
++.endm
++
++.macro edge_16b_bodies, body_fn, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++
++0: edge_16b_e0 \body_fn, \pb
++10: edge_16b_e1 \body_fn
++20: edge_16b_e2 \body_fn, \pb
++30: edge_16b_e3 \body_fn, \pb
++.endm
++
++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_32bx2_e0 \body_fn_64b, \pb
++10: edge_32bx2_e1 \body_fn_64b
++20: edge_32bx2_e2 \body_fn_64b, \pb
++30: edge_32bx2_e3 \body_fn_64b, \pb
++5: edge_16b_e0 \body_fn_16b, \pb
++15: edge_16b_e1 \body_fn_16b
++25: edge_16b_e2 \body_fn_16b, \pb
++35: edge_16b_e3 \body_fn_16b, \pb
++.endm
++
++.macro edge_16b_8bx2_bodies, body_fn, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_16b_e0 \body_fn, \pb
++10: edge_16b_e1 \body_fn
++20: edge_16b_e2 \body_fn, \pb
++30: edge_16b_e3 \body_fn, \pb
++5: edge_8bx2_e0 \body_fn, \pb
++15: edge_8bx2_e1 \body_fn
++25: edge_8bx2_e2 \body_fn, \pb
++35: edge_8bx2_e3 \body_fn, \pb
++.endm
++
++.macro edge_8bx2_4bx4_bodies, body_fn, pb
++ jent 0f
++ jent 10f
++ jent 20f
++ jent 30f
++ jent 5f
++ jent 15f
++ jent 25f
++ jent 35f
++
++0: edge_8bx2_e0 \body_fn, \pb
++10: edge_8bx2_e1 \body_fn
++20: edge_8bx2_e2 \body_fn, \pb
++30: edge_8bx2_e3 \body_fn, \pb
++5: edge_4bx4_e0 \body_fn, \pb
++15: edge_4bx4_e1 \body_fn
++25: edge_4bx4_e2 \body_fn, \pb
++35: edge_4bx4_e3 \body_fn, \pb
++.endm
++
++@ void ff_hevc_rpi_sao_edge_8_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_8, export=1
++ edge_16b_init 8, 0, 1, 99f
++99:
++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_8, export=1
++ edge_16b_init 8, 0, 0, 99f
++99:
++ edge_16b_bodies edge_16b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_8, export=1
++ edge_64b_init 8, 0, 0, 99f
++99:
++ edge_32bx2_bodies edge_64b_body_8, 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_8(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_64_neon_8, export=1
++ edge_64b_init 8, 0, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_8, 1
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
++ edge_16b_init 8, 1, 1, 99f
++99:
++ edge_16b_8bx2_bodies edge_16b_body_8, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
++ edge_64b_init 8, 1, 0, 99f
++99:
++ edge_32bx2_bodies edge_64b_body_8, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_8(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
++ edge_64b_init 8, 1, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_8, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_8_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_10, export=1
++ edge_16b_init 10, 0, 1, 99f
++99:
++ edge_16b_8bx2_bodies edge_16b_body_16, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99:
++ edge_32bx2_bodies edge_64b_body_16, 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++@ We simply split the 64 case into 2 vertical stripes
++@ and call the fns for w32 (99f presumably binds to the 99: of the w32 fn that follows - keep them adjacent)
++@
++@ Calling code will always have src != dst so we don't have to worry
++@ about edge effects
++
++function ff_hevc_rpi_sao_edge_64_neon_10, export=1
++ edge_64b_init 10, 0, 1, 99f, xjump=1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_16, 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
++99:
++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
++ edge_64b_init 10, 1, 1, 99f, xjump=1 @ no table in this fn: 99f presumably binds to the 99: of the c_16 fn that follows - keep them adjacent
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
++ edge_64b_init 10, 1, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_16, 4
++endfunc
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_arm.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVCPRED_ARM_H
++#define AVCODEC_ARM_RPI_HEVCPRED_ARM_H
++
++#include "libavcodec/rpi_hevcpred.h"
++
++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); /* checks CPU flags, then calls the NEON init if NEON is present */
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); /* installs the NEON functions into c for bit_depth 8 or 10 */
++
++#endif /* AVCODEC_ARM_RPI_HEVCPRED_ARM_H */
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
+@@ -0,0 +1,35 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/arm/cpu.h"
++
++#include "libavcodec/rpi_hevcpred.h"
++#include "rpi_hevcpred_arm.h"
++
++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ const int flags = av_get_cpu_flags(); /* CPU feature bits, queried at run time */
++ if (!have_neon(flags))
++ return; /* no NEON: c is left exactly as passed in */
++ ff_hevc_rpi_pred_init_neon(c, bit_depth);
++}
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
+@@ -0,0 +1,210 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcpred_arm.h"
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
++
++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
++{
++ switch (bit_depth) // only 8 and 10 bit have NEON functions; see default below
++ {
++ case 8:
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; // NOTE(review): intra_filter[2] is not set for 8 bit; no 16_neon_8 filter is declared in this file
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; // presumably equivalent to c_8_neon_8 in the same way - confirm
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; // presumably equivalent to c_16_neon_8 in the same way - confirm
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
++ break;
++ case 10:
++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; // chroma takes the _32 filters here, as 8 bit chroma takes the _16 ones above
++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
++
++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
++
++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
++
++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
++
++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
++
++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
++ break;
++ default: // any other bit depth: nothing is installed, c keeps what the caller put there
++ break;
++ }
++}
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+@@ -0,0 +1,2984 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * General angular pred
++ *
++ * Horizontal (10) & Vertical (26) cases have their own file
++ * and are not dealt with properly here (luma filtering is missing)
++ *
++ * The inv_angle calculations are annoying - if it wasn't for the +128
++ * rounding step then the result would simply be the loop counter :-(
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.text
++
++@ Horizontal Patch functions
++@ These need a transpose before store so exist as smaller patches
++@ Patches can be called repeatedly without any intermediate setup
++@ to generate a horizontal block
++@
++@ It is almost certainly the case that larger patch fns can be built
++@ and they would be a little faster, but we would still need the small
++@ fns and code size (or at least instruction cache size) is an issue
++@ given how much code we already have here
++
++@ Generate 8x8 luma 8 patch
++@
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r10 Inv angle accumulator (_up only)
++@ r12 32 - angle frac (_down) or angle frac (_up)
++@ d0 Older reference samples
++@ d1=r8+r9 Newer reference samples
++@ d2 32 - angle frac
++@ d3 Angle frac
++@ q2 Partially computed next result (_up only)
++@
++@ Temps
++@ r5 Loop counter
++@ r6 (read as the initial angle frac on entry to patch_h_down_8x8_8; a temp after that)
++@ r7 (_down only)
++@ r11 (_up only)
++@ q2, q8-q11
++
++patch_h_down_8x8_8:
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6 (r6 is read here before anything in this routine writes it)
++ vmov d0, r8, r9 @ d0 = the 8 bytes just loaded
++ vdup.8 d3, r6
++ lsr r8, #8
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #24 @ r8 = the 4 bytes starting one byte further on
++ ldr r9, [r2, #5]! @ r9 = the following 4 bytes; r2 += 5
++ vmov d1, r8, r9 @ d1 = the 8 bytes starting one byte after d0
++ // drop through...
++patch_h_down_8x8_8_continue:
++ mov r5, #8 @ 8 passes, each producing one d register of results
++1:
++ subs r12, r4 @ mi: r12 went negative, so it is wrapped by 32 and the d0/d1 window advances
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32 @ does not touch the flags, so the mi tests below still see the subs result
++ vmlal.u8 q2, d1, d3 @ q2 = d0 * d2 + d1 * d3
++ rsb r6, r12, #32
++ vext.8 q8, q8, q9, #8 @ with the vext/vmov below: d17-d23 each move down one register
++ itt mi
++ lsrmi r7, r8, #8
++ vmovmi d0, r8, r9 @ r8:r9 still hold the d1 window, which becomes d0
++ vdup.8 d2, r12
++ vext.8 q9, q9, q10, #8
++ it mi
++ orrmi r8, r7, r9, lsl #24
++ vext.8 q10, q10, q11, #8
++ it mi
++ ldrmi r9, [r2, #1]! @ r2 += 1, r9 = 4 bytes from the new r2
++ vmov d22, d23
++ vrshrn.u16 d23, q2, #5 @ newest result: q2 rounded, shifted right by 5 and narrowed to bytes
++ it mi
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++ // drop through...
++store_tran_8x8_8:
++ vzip.8 d16, d17 @ zips at 8, 16 then 32 bit: transpose of the 8x8 bytes in d16-d23
++ add r6, r0, r3 @ r6 = output row 1
++ vzip.8 d18, d19
++ lsl r3, #1 @ r5 and r6 step two rows at a time
++ vzip.8 d20, d21
++ add r5, r0, r3 @ r5 = output row 2
++ vzip.8 d22, d23
++ vzip.16 q8, q9
++ vzip.16 q10, q11
++ vzip.32 q8, q10
++ vzip.32 q9, q11
++ vst1.8 {d16}, [r0]! @ row 0; r0 += 8, the start of the next patch
++ vst1.8 {d17}, [r6], r3 @ row 1
++ vst1.8 {d20}, [r5], r3 @ row 2
++ vst1.8 {d21}, [r6], r3 @ row 3
++ vst1.8 {d18}, [r5], r3 @ row 4
++ vst1.8 {d19}, [r6], r3 @ row 5
++ vst1.8 {d22}, [r5] @ row 6
++ asr r3, #1 @ undo the lsl above so r3 is the single-row stride again
++ vst1.8 {d23}, [r6] @ row 7
++
++ bx lr
++
++patch_h_up_8x8_8:
++ ldrd r8, r9, [r2]
++ rsb r6, r4, #32 @ r6 = 32 - r4
++ vmov d0, r8, r9 @ d0 = the 8 bytes just loaded
++ vdup.8 d3, r4
++ lsr r11, r8, #24 @ top byte of r8, carried into r9 below
++ vdup.8 d2, r6
++ ldr r8, [r2, #-1]! @ r2 -= 1, r8 = 4 bytes from the new r2
++ orr r9, r11, r9, lsl #8 @ r8:r9 = the 8 bytes starting one byte before d0
++ vmov d1, r8, r9
++ mov r12, r4
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3 @ q2 = d0 * d2 + d1 * d3; it is narrowed inside the loop
++patch_h_up_8x8_8_continue:
++ mov r5, #8 @ 8 passes, each producing one d register of results
++1:
++ add r12, r4
++ mov r11, #0
++ cmp r12, #33 @ cs: r12 >= 33, so r12 is wrapped by 32 and the d0/d1 window moves
++ it cs
++ addcs r10, r7
++ vext.8 q8, q8, q9, #8 @ with the vext/vmov below: d17-d23 each move down one register
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ tests the sign bit of r10; NOTE(review): the cs/hi tests below rely on C surviving this tst - confirm
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #8
++ it cs
++ vmovcs d0, r8, r9 @ r8:r9 still hold the d1 window, which becomes d0
++ vdup.8 d2, r6
++ it cs
++ lsrcs r6, r8, #24
++ vext.8 q9, q9, q10, #8
++ itt cs
++ orrcs r9, r6, r9, lsl #8
++ ldrbcs r11, [r1, r11] @ byte at r1 + r11; NOTE(review): r1 is not in the register notes, presumably the top pointer - confirm
++ vdup.8 d3, r12
++ vext.8 q10, q10, q11, #8
++ it hi
++ ldrbhi r11, [r2, #-1]! @ replaces r11 with the byte before r2; r2 -= 1
++ vmov d22, d23
++ vrshrn.u16 d23, q2, #5 @ newest result: q2 rounded, shifted right by 5 and narrowed to bytes
++ itt cs
++ orrcs r8, r11, r8, lsl #8 @ r11 is shifted in as the new lowest byte of r8
++ vmovcs d1, r8, r9
++ vmull.u8 q2, d0, d2
++ subs r5, #1
++ vmlal.u8 q2, d1, d3
++ bne 1b
++
++ b store_tran_8x8_8 @ tail call: store_tran_8x8_8 ends with bx lr
++
++
++.macro ADRT reg, val
++@ Load the address \val into \reg: adr in T32 has enough range but not in A32, so the A32 line uses adrl
++A adrl \reg, \val
++T adr \reg, \val
++.endm
++
++@ ff_hevc_rpi_pred_angular_4_neon_8 - 4x4 block, 8-bit samples; dispatches on mode to one of four paths
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ ldr lr, [r2], #1 @ Left (r2 = _left): bytes 0..3; r2 += 1
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vmov s0, lr @ low half of d0 = reference bytes 0..3
++ vdup.8 d3, r6
++ ldr lr, [r2], #1 @ bytes 1..4; r2 += 1
++ vdup.8 d2, r12
++ vmov s2, lr @ low half of d1 = reference bytes 1..4
++ subs r12, r4 @ N set => reference window must advance
++ vmull.u8 q2, d0, d2 @ q2 = d0 * d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3 @ q2 += d1 * d3
++ rsb r6, r12, #32
++ itt mi
++ vmovmi s0, lr @ (mi) d0 window = previous d1 window
++ ldrmi lr, [r2], #1 @ (mi) next 4 reference bytes, one byte further on
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ vdup.8 d3, r6
++ mov r5, #2 @ 2 loop passes; the first and last results are handled outside the loop
++1:
++ vrshrn.u16 d20, q2, #5 @ d20 = rounded (q2 >> 5), narrowed to bytes
++ subs r12, r4
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ vext.64 q8, q8, q9, #1 @ result queue: d16 = d17, d17 = d18
++ it mi
++ vmovmi s0, lr
++ vext.64 q9, q9, q10, #1 @ result queue: d18 = d19, d19 = d20
++ it mi
++ ldrmi lr, [r2], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++
++ vrshrn.u16 d20, q2, #5
++ vmull.u8 q2, d0, d2
++ add r12, r0, r3 @ r12 = r0 + stride (rows 1 and 3)
++ vmlal.u8 q2, d1, d3
++ lsl r3, #1 @ r3 = 2 * stride
++ vext.64 q8, q8, q9, #1
++ vext.64 q9, q9, q10, #1
++ vrshrn.u16 d20, q2, #5
++
++98: @ Shared store: lane n of d17-d20 forms output row n (transposes the four results)
++ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
++ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
++ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0]
++ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12]
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ rsb r12, r6, #32
++ ldr lr, [r2] @ Left
++ ldrb r2, [r2, #-1] @ Top-left
++ vmov s0, lr
++ vdup.8 d2, r12
++ vdup.8 d3, r6
++ orr lr, r2, lr, lsl #8 @ window moved back by one byte: top-left enters at the low end
++ vmov s2, lr
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator, starts at r7 - 128
++ mov r5, #3 @ 3 loop passes; the 4th result is computed after the loop
++2:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++T it mi
++ addmi r12, #32
++T asr r6, r8, #8
++T it mi
++T ldrbmi r2, [r1, r6]
++A ldrbmi r2, [r1, r8, asr #8]
++ rsb r6, r12, #32
++ vdup.8 d2, r12
++ ittt mi
++ vmovmi s0, lr
++ orrmi lr, r2, lr, lsl #8 @ (mi) byte fetched from r1 + (r8 >> 8) enters the window
++ vmovmi s2, lr
++ vrshrn.u16 d20, q2, #5
++ vdup.8 d3, r6
++ it mi
++ addmi r8, r7 @ (mi) step the inverse-angle accumulator
++ subs r5, #1
++ vext.64 q8, q8, q9, #1
++ vext.64 q9, q9, q10, #1
++ bne 2b
++
++ vmull.u8 q2, d0, d2
++ add r12, r0, r3
++ vmlal.u8 q2, d1, d3
++ lsl r3, #1
++ vrshrn.u16 d20, q2, #5
++ b 98b
++
++@ Left of vertical - works down left
++18:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ rsb r12, r6, #32
++ ldr lr, [r1] @ Top
++ ldrb r1, [r2, #-1] @ Top-left
++ vmov s0, lr
++ vdup.8 d2, r12
++ vdup.8 d3, r6
++ orr lr, r1, lr, lsl #8 @ window moved back by one byte: top-left enters at the low end
++ vmov s2, lr
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator, starts at r7 - 128
++ mov r5, #3 @ 3 rows in the loop; the 4th is stored after it
++2:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++T it mi
++ addmi r12, #32
++T asr r6, r8, #8
++T it mi
++T ldrbmi r1, [r2, r6]
++A ldrbmi r1, [r2, r8, asr #8]
++ rsb r6, r12, #32
++ vdup.8 d2, r12
++ ittt mi
++ vmovmi s0, lr
++ orrmi lr, r1, lr, lsl #8 @ (mi) byte fetched from r2 + (r8 >> 8) enters the window
++ vmovmi s2, lr
++ vrshrn.u16 d4, q2, #5
++ vdup.8 d3, r6
++ it mi
++ addmi r8, r7
++ subs r5, #1
++ vst1.32 {d4[0]}, [r0], r3 @ store one 4-byte row; r0 += stride
++ bne 2b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.32 {d4[0]}, [r0]
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldr lr, [r1], #1 @ Top
++ rsb r12, r6, #32
++ vmov s0, lr
++ vdup.8 d3, r6
++ ldr lr, [r1], #1 @ top bytes 1..4
++ vdup.8 d2, r12
++ vmov s2, lr
++ subs r12, r4 @ N set => reference window must advance
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ itt mi
++ vmovmi s0, lr
++ ldrmi lr, [r1], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ vdup.8 d3, r6
++ mov r5, #2 @ 2 loop passes; the remaining two rows are stored after the loop
++1:
++ vrshrn.u16 d6, q2, #5
++ subs r12, r4
++ vmull.u8 q2, d0, d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3
++ rsb r6, r12, #32
++ vst1.32 {d6[0]}, [r0], r3 @ store one 4-byte row; r0 += stride
++ itt mi
++ vmovmi s0, lr
++ ldrmi lr, [r1], #1
++ vdup.8 d2, r12
++ it mi
++ vmovmi s2, lr
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++
++ vrshrn.u16 d6, q2, #5
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vst1.32 {d6[0]}, [r0], r3
++ vrshrn.u16 d6, q2, #5
++ vst1.32 {d6[0]}, [r0]
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_8 - 8x8 block, 8-bit samples; dispatches on mode to one of four paths
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_8x8_8 @ a single 8x8 patch covers the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator for the patch routine
++ bl patch_h_up_8x8_8
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ ldrb lr, [r2, #-1] @ Top-left
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ vmov d0, r8, r9 @ d0 = top bytes 0..7
++ lsl r9, r9, #8
++ vdup.8 d2, r12
++ orr r9, r9, r8, lsr #24
++ orr r8, lr, r8, lsl #8 @ r8:r9 = top-left followed by top bytes 0..6
++ vmov d1, r8, r9
++ sub r1, r7, #128 @ r1 is reused as the inverse-angle accumulator (top pointer no longer needed)
++ mov r5, #7 @ 7 rows in the loop; the 8th is stored after it
++1:
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ subs r12, r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++ ittt mi
++ addmi lr, r2, r1, asr #8 @ (mi) lr = r2 + (r1 >> 8): address of the next byte to bring in
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #8
++ ldrbmi lr, [lr]
++ vdup.8 d2, r12
++ vrshrn.u16 d4, q2, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #24
++ orrmi r8, lr, r8, lsl #8 @ (mi) new byte enters the low end of the window
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ (mi) step the inverse-angle accumulator
++ subs r5, r5, #1
++ vst1.8 {d4}, [r0], r3 @ store one 8-byte row; r0 += stride
++ bne 1b
++
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.8 {d4}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ vmov d0, r8, r9 @ d0 = top bytes 0..7
++ vdup.8 d3, r6
++ mov r5, #7 @ 7 rows in the loop; the 8th is stored after it
++ lsr r8, #8
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #24
++ ldr r9, [r1, #5]! @ r9 = top bytes 5..8; r1 += 5 (writeback)
++ vmov d1, r8, r9 @ d1 = top bytes 1..8
++1:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ lsrmi r8, #8
++ vdup.8 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #24
++ ldrmi r9, [r1, #1]! @ (mi) high word reloaded one byte further on; r1 += 1
++ vrshrn.u16 d6, q2, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.8 d3, r6
++ subs r5, #1
++ vst1.8 {d6}, [r0], r3 @ store one 8-byte row; r0 += stride
++ bne 1b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d6, q2, #5
++ vst1.8 {d6}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_8 - 16x16 block, 8-bit samples; dispatches on mode to one of four paths
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_8x8_8 @ top-left 8x8 patch
++ bl patch_h_down_8x8_8_continue @ top-right 8x8 patch
++
++ add r2, r1, #8 @ restore r2, but 8 rows further down left
++ sub r0, #16 @ undo the two 8-byte advances made by the patches
++ mov r6, r4 @ reset the fraction for the fresh start
++ add r0, r0, r3, lsl #3 @ r0 += 8 * stride
++
++ bl patch_h_down_8x8_8 @ bottom-left 8x8 patch
++ bl patch_h_down_8x8_8_continue @ bottom-right 8x8 patch
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator
++
++ push {r2} @ patch routines modify r2
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8_continue
++ pop {r2}
++
++ sub r0, #16 @ undo the two 8-byte advances made by the patches
++ mov r10, #-128
++ add r2, #8 @ 8 rows further down left
++ add r0, r0, r3, lsl #3 @ r0 += 8 * stride
++ sub r10, r10, r7, lsl #3 @ r10 = -128 - 8 * r7 for the lower row of patches
++
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8_continue
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q9}, [r1] @ q9 = top bytes 0..15
++ sub r1, r2, #1 @ r1 = address of top-left (left - 1)
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ vdup.8 d6, r6
++ vext.8 q8, q9, q9, #15 @ q8 = q9 moved up by one byte ...
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator, starts at r7 - 128
++ vld1.8 {d16[0]}, [r1] @ ... with top-left inserted in lane 0
++ vdup.8 d7, r12
++ mov r5, #15 @ 15 rows in the loops; the 16th is stored at 3: or 5:
++1: @ Loop using q8/q9 as the reference pair; q10/q11 are prepared as the advanced pair
++ vmull.u8 q0, d18, d7
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q0, d16, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d19, d7
++ it cc
++ addcc r1, r2, r8, asr #8 @ (cc) r1 = r2 + (r8 >> 8): address of the next byte to bring in
++ vmlal.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vext.8 q10, q8, q8, #15
++ sub r5, #1
++ vld1.8 {d20[0]}, [r1]
++ it cc
++ addcc r8, r7 @ (cc) step the inverse-angle accumulator
++ vmov q11, q8
++ teq r5, #0 @ Z = last loop row done; C from the subs above is kept
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row; r0 += stride
++ bhi 1b @ no advance and rows remain: keep using q8/q9
++ beq 4f @ loop rows finished
++2: @ Same loop with the register pairs swapped: uses q10/q11, prepares q8/q9
++ vmull.u8 q0, d22, d7
++ subs r12, r4
++ vmlal.u8 q0, d20, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d23, d7
++ it cc
++ addcc r1, r2, r8, asr #8
++ vmlal.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vext.8 q8, q10, q10, #15
++ sub r5, #1
++ vld1.8 {d16[0]}, [r1]
++ it cc
++ addcc r8, r7
++ vmov q9, q10
++ teq r5, #0
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ no advance and rows remain: keep using q10/q11
++ bne 1b @ advance and rows remain: switch back to q8/q9
++ bcc 5f @ finished with an advance pending: last row from q8/q9
++3: @ Final row computed from q10/q11
++ vmull.u8 q0, d22, d7
++ vmlal.u8 q0, d20, d6
++ vmull.u8 q1, d23, d7
++ vmlal.u8 q1, d21, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended with an advance pending: last row from q10/q11
++5: @ Final row computed from q8/q9
++ vmull.u8 q0, d18, d7
++ vmlal.u8 q0, d16, d6
++ vmull.u8 q1, d19, d7
++ vmlal.u8 q1, d17, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q9}, [r1]! @ q9 = top bytes 0..15; r1 += 16
++ rsb r12, r6, #32
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vext.8 q8, q9, q9, #1 @ q8 = q9 moved down by one byte ...
++ vld1.8 {d17[7]}, [r1]! @ ... with top byte 16 inserted in the last lane; r1 += 1
++ mov r5, #15 @ 15 rows in the loops; the 16th is stored at 3: or 5:
++1: @ Loop using q8/q9 as the reference pair; q10/q11 are prepared as the advanced pair
++ vmull.u8 q0, d16, d6
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q0, d18, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d19, d7
++ sub r5, #1
++ vext.8 q10, q8, q8, #1
++ teq r5, #0 @ Z = last loop row done; C from the subs above is kept
++ vld1.8 {d21[7]}, [r1]
++ it cc
++ addcc r1, #1 @ (cc) the byte just loaded is consumed, move to the next one
++ vmov q11, q8
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row; r0 += stride
++ bhi 1b @ no advance and rows remain: keep using q8/q9
++ beq 4f @ loop rows finished
++2: @ Same loop with the register pairs swapped: uses q10/q11, prepares q8/q9
++ vmull.u8 q0, d20, d6
++ subs r12, r4
++ vmlal.u8 q0, d22, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d23, d7
++ sub r5, #1
++ vext.8 q8, q10, q10, #1
++ teq r5, #0
++ vld1.8 {d17[7]}, [r1]
++ it cc
++ addcc r1, #1
++ vmov q9, q10
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ no advance and rows remain: keep using q10/q11
++ bne 1b @ advance and rows remain: switch back to q8/q9
++ bcc 5f @ finished with an advance pending: last row from q8/q9
++3: @ Final row computed from q10/q11
++ vmull.u8 q0, d20, d6
++ vmlal.u8 q0, d22, d7
++ vmull.u8 q1, d21, d6
++ vmlal.u8 q1, d23, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended with an advance pending: last row from q10/q11
++5: @ Final row computed from q8/q9
++ vmull.u8 q0, d16, d6
++ vmlal.u8 q0, d18, d7
++ vmull.u8 q1, d17, d6
++ vmlal.u8 q1, d19, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_8 - 32x32 block, 8-bit samples; dispatches on mode to one of four paths
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ 4 rows of patches
++ mov r1, r2 @ keep the row's left pointer in r1 (patches modify r2)
++1:
++ bl patch_h_down_8x8_8 @ 4 patches of 8x8 across the 32-wide row
++ bl patch_h_down_8x8_8_continue
++ bl patch_h_down_8x8_8_continue
++ bl patch_h_down_8x8_8_continue
++
++ add r2, r1, #8 @ restore r2, but 8 rows further down left
++ add r1, r1, #8
++ mov r6, r4 @ reset the fraction for the fresh start
++ sub r0, #32 @ undo the four 8-byte advances made by the patches
++ subs r10, #1
++ add r0, r0, r3, lsl #3 @ r0 += 8 * stride
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator
++ vmov.i8 d6, #1<<2 @ d6 serves as the row-of-patches counter (4, 2, 1, 0 per byte)
++1:
++ push {r2,r10} @ patch routines modify r2 and r10
++ bl patch_h_up_8x8_8
++ bl patch_h_up_8x8_8_continue
++ bl patch_h_up_8x8_8_continue
++ bl patch_h_up_8x8_8_continue
++ pop {r2,r10}
++
++ vmov r8, s12 @ r8 = counter value before this pass's shift
++ sub r0, #32 @ undo the four 8-byte advances made by the patches
++ add r2, #8 @ 8 rows further down left
++ add r0, r0, r3, lsl #3 @ r0 += 8 * stride
++ sub r10, r10, r7, lsl #3 @ r10 -= 8 * r7 for the next row of patches
++ vshr.u8 d6, #1
++ teq r8, #0 @ loop ends once the value read was 0, i.e. after 4 passes
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q0-q1}, [r1] @ q0:q1 = top bytes 0..31
++ sub r9, r2, #1 @ r9 = address of top-left (left - 1)
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ mov r5, #32 @ 32 rows
++1: @ Advance the reference window by one byte: q2:q3 = old window, q0:q1 = window with the new byte in front
++ vld1.8 {d17[7]}, [r9] @ new byte goes in the last lane of q8
++ add r8, r7
++ vmov q2, q0
++ vmov q3, q1
++ add r9, r2, r8, asr #8 @ r9 = r2 + (r8 >> 8): address of the following new byte
++ vext.8 q1, q0, q1, #15
++ vext.8 q0, q8, q0, #15
++2: @ One output row per pass
++ vmull.u8 q10, d4, d19
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q10, d0, d18
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d5, d19
++ rsb r6, r12, #32
++ vmlal.u8 q11, d1, d18
++ sub r5, #1
++ vmull.u8 q12, d6, d19
++ teq r5, #0 @ Z = all rows done; C from the subs above is kept
++ vmlal.u8 q12, d2, d18
++ vmull.u8 q13, d7, d19
++ vmlal.u8 q13, d3, d18
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row; r0 += stride
++ bhi 2b @ no advance and rows remain
++ bne 1b @ advance needed and rows remain
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.8 {q0-q1}, [r1]! @ q0:q1 = top bytes 0..31; r1 += 32
++ rsb r12, r6, #32
++ vld1.8 {d16[0]}, [r5] @ lane 0 of q8 = top byte 32
++ mov r5, #32 @ 32 rows
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++1: @ Advance the reference window by one byte: q2:q3 = old window, q0:q1 = window moved on, new byte from q8
++ vmov q2, q0
++ add r1, #1
++ vmov q3, q1
++ vext.8 q0, q0, q1, #1
++ vext.8 q1, q1, q8, #1
++2: @ One output row per pass
++ vmull.u8 q10, d0, d18
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q10, d4, d19
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d1, d18
++ rsb r6, r12, #32
++ vmlal.u8 q11, d5, d19
++ sub r5, #1
++ vmull.u8 q12, d2, d18
++ teq r5, #0 @ Z = all rows done; C from the subs above is kept
++ vmlal.u8 q12, d6, d19
++ vmull.u8 q13, d3, d18
++ vmlal.u8 q13, d7, d19
++ vld1.8 {d16[0]}, [r1] @ fetch the byte that the next advance will bring in
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row; r0 += stride
++ bhi 2b @ no advance and rows remain
++ bne 1b @ advance needed and rows remain
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ Chroma 8 bit 4x4 patch fns
++ .text
++
++patch_h_down_c_4x4_8: @ First patch of a row (2-byte elements): builds d0-d3 from r2 / r6, then falls into the loop
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vmov d0, r8, r9 @ d0 = reference elements 0..3 (8 bytes)
++ vdup.8 d3, r6
++ lsr r8, #16
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #16 @ r8 = reference elements 1..2
++ ldr r9, [r2, #6]! @ r9 = reference elements 3..4; r2 += 6 (writeback)
++ vmov d1, r8, r9 @ d1 = reference elements 1..4 (d0 window moved on by one element)
++ // drop through...
++patch_h_down_c_4x4_8_continue: @ Entry for following patches: reuses r2, r8, r9, r12, d0-d3 as left by the previous patch
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ subs r12, r4 @ r12 -= angle add; N set => reference window must advance
++ vmull.u8 q2, d0, d2 @ q2 = d0 * d2
++ it mi
++ addmi r12, #32
++ vmlal.u8 q2, d1, d3 @ q2 += d1 * d3
++ rsb r6, r12, #32
++ vext.8 q8, q8, q9, #8 @ result queue: d16 = d17, d17 = d18
++ it mi
++ lsrmi r7, r8, #16
++ vmov d18, d19
++ it mi
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ vdup.8 d2, r12
++ it mi
++ orrmi r8, r7, r9, lsl #16 @ (mi) low word of the window shifted on by one element
++ vrshrn.u16 d19, q2, #5 @ newest result = rounded (q2 >> 5), narrowed to bytes
++ itt mi
++ ldrmi r9, [r2, #2]! @ (mi) high word reloaded one element further on; r2 += 2
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.8 d3, r6
++ bne 1b
++ // drop through...
++store_tran_c_4x4_8: @ Transpose the 4x4 16-bit elements held in d16-d19 and store them as 4 rows at r0
++ vzip.16 d16, d17
++ add r6, r0, r3 @ r6 = r0 + r3 (rows 1 and 3)
++ vzip.16 d18, d19
++ lsl r3, #1 @ r3 doubled while storing
++ vzip.32 q8, q9
++ add r5, r0, r3 @ r5 = r0 + 2 * original r3 (row 2)
++ vst1.16 {d16}, [r0]! @ row 0; r0 += 8 (writeback), i.e. r0 + patch width on exit
++ vst1.16 {d17}, [r6], r3 @ row 1
++ vst1.16 {d18}, [r5] @ row 2
++ asr r3, #1 @ restore r3
++ vst1.16 {d19}, [r6] @ row 3
++
++ bx lr
++
++patch_h_up_c_4x4_8: @ First patch of a row (2-byte elements): builds d0-d3 from r2 / r4 and precomputes the first q2
++ ldrd r8, r9, [r2] @ r8:r9 = reference elements 0..3 at [r2]
++ rsb r6, r4, #32 @ r6 = 32 - r4
++ vmov d0, r8, r9
++ vdup.8 d3, r4
++ lsr r11, r8, #16 @ r11 = reference element 1
++ vdup.8 d2, r6
++ ldr r8, [r2, #-2]! @ r8 = reference elements -1..0; r2 -= 2 (writeback)
++ orr r9, r11, r9, lsl #16 @ r9 = reference elements 1..2
++ vmov d1, r8, r9 @ d1 = reference elements -1..2 (d0 window moved back by one element)
++ mov r12, r4
++ vmull.u8 q2, d0, d2 @ first result is started ahead of the loop (q2 is carried between patches)
++ vmlal.u8 q2, d1, d3
++patch_h_up_c_4x4_8_continue: @ Entry for following patches: reuses r2, r8-r10, r12, d0-d3, q2 from the previous patch
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ add r12, r4 @ r12 += angle add
++ cmp r12, #33 @ C set when r12 >= 33 => reference window must advance
++ it cs
++ addcs r10, r7 @ (cs) r10 += r7 (inverse-angle accumulator step)
++ mov r11, #0
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ (cs) Z set when r10 is non-negative
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #7 @ (eq) r11 = r10 >> 7 ...
++ it cs
++ vmovcs d0, r8, r9 @ (cs) d0 = previous d1 window
++ it eq
++ biceq r11, #1 @ ... with bit 0 cleared: an even byte offset from r1
++ vdup.8 d2, r6
++ it cs
++ lsrcs r6, r8, #16
++ vdup.8 d3, r12
++ vext.8 q8, q8, q9, #8 @ result queue: d16 = d17, d17 = d18
++ itt cs
++ orrcs r9, r6, r9, lsl #16 @ (cs) high word of the window shifted back by one element
++ ldrhcs r11, [r1, r11] @ (cs) r11 = halfword at r1 + r11
++ vmov d18, d19
++ it hi
++ ldrhhi r11, [r2, #-2]! @ (hi) take the halfword at r2 - 2 instead; r2 -= 2 - NOTE(review): relies on C staying set across the tst above
++ vrshrn.u16 d19, q2, #5 @ newest result = rounded (q2 >> 5), narrowed to bytes
++ itt cs
++ orrcs r8, r11, r8, lsl #16 @ (cs) new element enters the low end of the window
++ vmovcs d1, r8, r9
++ vmull.u8 q2, d0, d2 @ start the next result: q2 = d0 * d2
++ subs r5, #1
++ vmlal.u8 q2, d1, d3 @ q2 += d1 * d3
++ bne 1b
++
++ b store_tran_c_4x4_8
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_8 - 4x4 block of 2-byte elements (8 bytes per row), 8-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ lsl r3, #1 @ r3 = stride * 2 - presumably stride is passed in 2-byte elements; confirm against caller
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_c_4x4_8 @ a single 4x4 patch covers the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator for the patch routine
++ bl patch_h_up_c_4x4_8
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ ldrh lr, [r2, #-2] @ Top-left
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ vmov d0, r8, r9 @ d0 = top elements 0..3
++ lsl r9, r9, #16
++ vdup.8 d2, r12
++ orr r9, r9, r8, lsr #16
++ orr r8, lr, r8, lsl #16 @ r8:r9 = top-left followed by top elements 0..2
++ vmov d1, r8, r9
++ sub r1, r7, #128 @ r1 is reused as the inverse-angle accumulator (top pointer no longer needed)
++ mov r5, #3 @ 3 rows in the loop; the 4th is stored after it
++1:
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ subs r12, r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++ itttt mi
++ addmi lr, r2, r1, asr #7 @ (mi) lr = r2 + (r1 >> 7) ...
++ bicmi lr, #1 @ ... with bit 0 cleared
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #16
++ ldrhmi lr, [lr]
++ vdup.8 d2, r12
++ vrshrn.u16 d4, q2, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #16
++ orrmi r8, lr, r8, lsl #16 @ (mi) new element enters the low end of the window
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ (mi) step the inverse-angle accumulator
++ subs r5, r5, #1
++ vst1.16 {d4}, [r0], r3 @ store one 8-byte row; r0 += r3
++ bne 1b
++
++ vdup.8 d3, r6
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d4, q2, #5
++ vst1.16 {d4}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ vmov d0, r8, r9 @ d0 = top elements 0..3
++ vdup.8 d3, r6
++ mov r5, #3 @ 3 rows in the loop; the 4th is stored after it
++ lsr r8, #16
++ vdup.8 d2, r12
++ orr r8, r8, r9, lsl #16
++ ldr r9, [r1, #6]! @ r9 = top elements 3..4; r1 += 6 (writeback)
++ vmov d1, r8, r9 @ d1 = top elements 1..4
++1:
++ vmull.u8 q2, d0, d2
++ subs r12, r4 @ N set => reference window must advance
++ vmlal.u8 q2, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ lsrmi r8, #16
++ vdup.8 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #16
++ ldrmi r9, [r1, #2]! @ (mi) high word reloaded one element further on; r1 += 2
++ vrshrn.u16 d6, q2, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.8 d3, r6
++ subs r5, #1
++ vst1.16 {d6}, [r0], r3 @ store one 8-byte row; r0 += r3
++ bne 1b
++
++ vmull.u8 q2, d0, d2
++ vmlal.u8 q2, d1, d3
++ vrshrn.u16 d6, q2, #5
++ vst1.16 {d6}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_8 - 8x8 block of 2-byte elements (16 bytes per row), 8-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ lsl r3, #1 @ r3 = stride * 2 - presumably stride is passed in 2-byte elements; confirm against caller
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_c_4x4_8 @ top-left 4x4 patch
++ bl patch_h_down_c_4x4_8_continue @ top-right 4x4 patch
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ sub r0, #16 @ undo the two 8-byte advances made by the patches
++ mov r6, r4 @ reset the fraction for the fresh start
++ add r0, r0, r3, lsl #2 @ r0 += 4 rows
++
++ bl patch_h_down_c_4x4_8 @ bottom-left 4x4 patch
++ bl patch_h_down_c_4x4_8_continue @ bottom-right 4x4 patch
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator
++
++ push {r2} @ patch routines modify r2
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8_continue
++ pop {r2}
++
++ sub r0, #16 @ undo the two 8-byte advances made by the patches
++ mov r10, #-128
++ add r2, #8 @ 4 elements (8 bytes) further down left
++ add r0, r0, r3, lsl #2 @ r0 += 4 rows
++ sub r10, r10, r7, lsl #2 @ r10 = -128 - 4 * r7 for the lower row of patches
++
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8_continue
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q9}, [r1] @ q9 = top elements 0..7 (16 bytes)
++ sub r1, r2, #2 @ r1 = address of top-left (left - 2)
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ vdup.8 d6, r6
++ vext.8 q8, q9, q9, #14 @ q8 = q9 moved up by one element ...
++ sub r8, r7, #128 @ r8 = inverse-angle accumulator, starts at r7 - 128
++ vld1.16 {d16[0]}, [r1] @ ... with top-left inserted in lane 0
++ vdup.8 d7, r12
++ mov r5, #7 @ 7 rows in the loops; the 8th is stored at 3: or 5:
++1: @ Loop using q8/q9 as the reference pair; q10/q11 are prepared as the advanced pair
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmull.u8 q0, d18, d7
++ it cc
++ asrcc r1, r8, #8
++ vmlal.u8 q0, d16, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d19, d7
++ it cc
++ addcc r1, r2, r1, lsl #1 @ (cc) r1 = r2 + 2 * (r8 >> 8): address of the next element to bring in
++ vmlal.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vext.8 q10, q8, q8, #14
++ sub r5, #1
++ vld1.16 {d20[0]}, [r1]
++ it cc
++ addcc r8, r7 @ (cc) step the inverse-angle accumulator
++ vmov q11, q8
++ teq r5, #0 @ Z = last loop row done; C from the subs above is kept
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row; r0 += r3
++ bhi 1b @ no advance and rows remain: keep using q8/q9
++ beq 4f @ loop rows finished
++2: @ Same loop with the register pairs swapped: uses q10/q11, prepares q8/q9
++ subs r12, r4
++ vmull.u8 q0, d22, d7
++ it cc
++ asrcc r1, r8, #8
++ vmlal.u8 q0, d20, d6
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d23, d7
++ it cc
++ addcc r1, r2, r1, lsl #1
++ vmlal.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vext.8 q8, q10, q10, #14
++ sub r5, #1
++ vld1.16 {d16[0]}, [r1]
++ it cc
++ addcc r8, r7
++ vmov q9, q10
++ teq r5, #0
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ no advance and rows remain: keep using q10/q11
++ bne 1b @ advance and rows remain: switch back to q8/q9
++ bcc 5f @ finished with an advance pending: last row from q8/q9
++3: @ Final row computed from q10/q11
++ vmull.u8 q0, d22, d7
++ vmlal.u8 q0, d20, d6
++ vmull.u8 q1, d23, d7
++ vmlal.u8 q1, d21, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended with an advance pending: last row from q10/q11
++5: @ Final row computed from q8/q9
++ vmull.u8 q0, d18, d7
++ vmlal.u8 q0, d16, d6
++ vmull.u8 q1, d19, d7
++ vmlal.u8 q1, d17, d6
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.8 {q9}, [r1]! @ q9 = top elements 0..7; r1 += 16
++ rsb r12, r6, #32
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vext.8 q8, q9, q9, #2 @ q8 = q9 moved down by one element ...
++ vld1.16 {d17[3]}, [r1]! @ ... with top element 8 inserted in the last lane; r1 += 2
++ mov r5, #7 @ 7 rows in the loops; the 8th is stored at 3: or 5:
++1: @ Loop using q8/q9 as the reference pair; q10/q11 are prepared as the advanced pair
++ vmull.u8 q0, d16, d6
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q0, d18, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d17, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d19, d7
++ sub r5, #1
++ vext.8 q10, q8, q8, #2
++ teq r5, #0 @ Z = last loop row done; C from the subs above is kept
++ vld1.16 {d21[3]}, [r1]
++ it cc
++ addcc r1, #2 @ (cc) the element just loaded is consumed, move to the next one
++ vmov q11, q8
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3 @ store one 16-byte row; r0 += r3
++ bhi 1b @ no advance and rows remain: keep using q8/q9
++ beq 4f @ loop rows finished
++2: @ Same loop with the register pairs swapped: uses q10/q11, prepares q8/q9
++ vmull.u8 q0, d20, d6
++ subs r12, r4
++ vmlal.u8 q0, d22, d7
++ it cc
++ addcc r12, #32
++ vmull.u8 q1, d21, d6
++ rsb r6, r12, #32
++ vmlal.u8 q1, d23, d7
++ sub r5, #1
++ vext.8 q8, q10, q10, #2
++ teq r5, #0
++ vld1.16 {d17[3]}, [r1]
++ it cc
++ addcc r1, #2
++ vmov q9, q10
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vdup.8 d6, r6
++ vdup.8 d7, r12
++ vst1.8 {q0}, [r0], r3
++ bhi 2b @ no advance and rows remain: keep using q10/q11
++ bne 1b @ advance and rows remain: switch back to q8/q9
++ bcc 5f @ finished with an advance pending: last row from q8/q9
++3: @ Final row computed from q10/q11
++ vmull.u8 q0, d20, d6
++ vmlal.u8 q0, d22, d7
++ vmull.u8 q1, d21, d6
++ vmlal.u8 q1, d23, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b @ loop 1 ended with an advance pending: last row from q10/q11
++5: @ Final row computed from q8/q9
++ vmull.u8 q0, d16, d6
++ vmlal.u8 q0, d18, d7
++ vmull.u8 q1, d17, d6
++ vmlal.u8 q1, d19, d7
++ vrshrn.u16 d0, q0, #5
++ vrshrn.u16 d1, q1, #5
++ vst1.8 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_8 - 16x16 block of 2-byte elements (32 bytes per row), 8-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ lsl r3, #1 @ r3 = stride * 2 - presumably stride is passed in 2-byte elements; confirm against caller
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ 4 rows of patches
++ mov r1, r2 @ keep the row's left pointer in r1 (patches modify r2)
++1:
++ bl patch_h_down_c_4x4_8 @ 4 patches of 4x4 across the 16-wide row
++ bl patch_h_down_c_4x4_8_continue
++ bl patch_h_down_c_4x4_8_continue
++ bl patch_h_down_c_4x4_8_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2
++ mov r6, r4 @ reset the fraction for the fresh start
++ sub r0, #32 @ undo the four 8-byte advances made by the patches
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ r0 += 4 rows
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator
++ vmov.i8 d6, #1<<2 @ d6 serves as the row-of-patches counter (4, 2, 1, 0 per byte)
++1:
++ push {r2, r10} @ patch routines modify r2 and r10
++ bl patch_h_up_c_4x4_8
++ bl patch_h_up_c_4x4_8_continue
++ bl patch_h_up_c_4x4_8_continue
++ bl patch_h_up_c_4x4_8_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ r8 = counter value before this pass's shift
++ sub r0, #32 @ undo the four 8-byte advances made by the patches
++ add r2, #8 @ 4 elements (8 bytes) further down left
++ add r0, r0, r3, lsl #2 @ r0 += 4 rows
++ sub r10, r10, r7, lsl #2 @ r10 -= 4 * r7 for the next row of patches
++ vshr.u8 d6, #1
++ teq r8, #0 @ loop ends once the value read was 0, i.e. after 4 passes
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.8 {q0-q1}, [r1] @ q0:q1 = top elements 0..15 (32 bytes)
++ sub r9, r2, #2 @ r9 = address of top-left (left - 2)
++ rsb r12, r6, #32
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r8, #-128 @ r8 = inverse-angle accumulator
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ mov r5, #16 @ 16 rows
++1: @ Advance the reference window by one element: q2:q3 = old window, q0:q1 = window with the new element in front
++ vld1.16 {d17[3]}, [r9] @ new element goes in the last lane of q8
++ add r8, r7
++ vmov q2, q0
++ vmov q3, q1
++ asr r9, r8, #8
++ vext.8 q1, q0, q1, #14
++ add r9, r2, r9, lsl #1 @ r9 = r2 + 2 * (r8 >> 8): address of the following new element
++ vext.8 q0, q8, q0, #14
++2: @ One output row per pass
++ vmull.u8 q10, d4, d19
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q10, d0, d18
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d5, d19
++ rsb r6, r12, #32
++ vmlal.u8 q11, d1, d18
++ sub r5, #1
++ vmull.u8 q12, d6, d19
++ teq r5, #0 @ Z = all rows done; C from the subs above is kept
++ vmlal.u8 q12, d2, d18
++ vmull.u8 q13, d7, d19
++ vmlal.u8 q13, d3, d18
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row; r0 += r3
++ bhi 2b @ no advance and rows remain
++ bne 1b @ advance needed and rows remain
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.8 {q0-q1}, [r1]! @ q0:q1 = top elements 0..15; r1 += 32
++ rsb r12, r6, #32
++ vld1.16 {d16[0]}, [r5] @ lane 0 of q8 = top element 16
++ mov r5, #16 @ 16 rows
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++1: @ Advance the reference window by one element: q2:q3 = old window, q0:q1 = window moved on, new element from q8
++ vmov q2, q0
++ add r1, #2
++ vmov q3, q1
++ vext.8 q0, q0, q1, #2
++ vext.8 q1, q1, q8, #2
++2: @ One output row per pass
++ vmull.u8 q10, d0, d18
++ subs r12, r4 @ C clear (borrow) => reference window must advance
++ vmlal.u8 q10, d4, d19
++ it cc
++ addcc r12, #32
++ vmull.u8 q11, d1, d18
++ rsb r6, r12, #32
++ vmlal.u8 q11, d5, d19
++ sub r5, #1
++ vmull.u8 q12, d2, d18
++ teq r5, #0 @ Z = all rows done; C from the subs above is kept
++ vmlal.u8 q12, d6, d19
++ vmull.u8 q13, d3, d18
++ vmlal.u8 q13, d7, d19
++ vld1.16 {d16[0]}, [r1] @ fetch the element that the next advance will bring in
++ vdup.8 d18, r6
++ vdup.8 d19, r12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vrshrn.u16 d22, q12, #5
++ vrshrn.u16 d23, q13, #5
++ vst1.8 {q10-q11}, [r0], r3 @ store one 32-byte row; r0 += r3
++ bhi 2b @ no advance and rows remain
++ bne 1b @ advance needed and rows remain
++
++ pop {r4-r11, pc}
++
++endfunc
++
++@------------------------------------------------------------------------------
++@ Data
++
++ .text
++ .balign 64
++angle_2: @ Byte table read as angle_2[mode - 2], mode 2..34 (33 entries)
++ .byte 32 @ mode 2
++ .byte 26, 21, 17, 13, 9, 5, 2, 0 @ modes 3..10
++ @ Sign inverted from standards table
++ .byte 2, 5, 9, 13, 17, 21, 26, 32 @ modes 11..18
++ .byte 26, 21, 17, 13, 9, 5, 2, 0 @ modes 19..26
++ @ Standard sign
++ .byte 2, 5, 9, 13, 17, 21, 26, 32 @ modes 27..34
++
++ .balign 2
++
++ @ Sign inverted from standards table; halfword table read as inv_angle[mode - 11], modes 11..25
++inv_angle:
++ .short 4096, 1638, 910, 630, 482, 390, 315 @ modes 11..17
++ .short 256 @ mode 18
++ .short 315, 390, 482, 630, 910, 1638, 4096 @ modes 19..25
++
++@------------------------------------------------------------------------------
++@
++@ 10 bit fns
++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
++@ but runs out of register width for 12+ bit
++
++ .text
++ .balign 64
++
++patch_h_down_4x4_10: @ First patch of a row (16-bit samples): builds d0-d3 from r2 / r6, then falls into the loop
++ ldrd r8, r9, [r2] @ Left
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vmov d0, r8, r9 @ d0 = reference samples 0..3
++ vdup.16 d3, r6
++ lsr r8, #16
++ vdup.16 d2, r12
++ orr r8, r8, r9, lsl #16 @ r8 = reference samples 1..2
++ ldr r9, [r2, #6]! @ r9 = reference samples 3..4; r2 += 6 (writeback)
++ vmov d1, r8, r9 @ d1 = reference samples 1..4 (d0 window moved on by one sample)
++ // drop through...
++patch_h_down_4x4_10_continue: @ Entry for following patches: reuses r2, r8, r9, r12, d0-d3 as left by the previous patch
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ subs r12, r4 @ r12 -= angle add; N set => reference window must advance
++ vmul.u16 d4, d0, d2 @ d4 = d0 * d2 (kept in 16 bits)
++ it mi
++ addmi r12, #32
++ vmla.u16 d4, d1, d3 @ d4 += d1 * d3
++ rsb r6, r12, #32
++ vext.16 q8, q8, q9, #4 @ result queue: d16 = d17, d17 = d18
++ it mi
++ lsrmi r7, r8, #16
++ vmov d18, d19
++ it mi
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ vdup.16 d2, r12
++ it mi
++ orrmi r8, r7, r9, lsl #16 @ (mi) low word of the window shifted on by one sample
++ vrshr.u16 d19, d4, #5 @ newest result = rounded (d4 >> 5)
++ itt mi
++ ldrmi r9, [r2, #2]! @ (mi) high word reloaded one sample further on; r2 += 2
++ vmovmi d1, r8, r9
++ subs r5, #1
++ vdup.16 d3, r6
++ bne 1b
++ // drop through...
++store_tran_4x4_10: @ Transpose the 4x4 16-bit samples held in d16-d19 and store them as 4 rows at r0
++ vzip.16 d16, d17
++ add r6, r0, r3 @ r6 = r0 + r3 (rows 1 and 3)
++ vzip.16 d18, d19
++ lsl r3, #1 @ r3 doubled while storing
++ vzip.32 q8, q9
++ add r5, r0, r3 @ r5 = r0 + 2 * original r3 (row 2)
++ vst1.16 {d16}, [r0]! @ row 0; r0 += 8 (writeback), i.e. r0 + patch width on exit
++ vst1.16 {d17}, [r6], r3 @ row 1
++ vst1.16 {d18}, [r5] @ row 2
++ asr r3, #1 @ restore r3
++ vst1.16 {d19}, [r6] @ row 3
++
++ bx lr
++
++patch_h_up_4x4_10: @ First patch of a row (16-bit samples): builds d0-d3 from r2 / r4 and precomputes the first d4
++ ldrd r8, r9, [r2] @ r8:r9 = reference samples 0..3 at [r2]
++ rsb r6, r4, #32 @ r6 = 32 - r4
++ vmov d0, r8, r9
++ vdup.16 d3, r4
++ lsr r11, r8, #16 @ r11 = reference sample 1
++ vdup.16 d2, r6
++ ldr r8, [r2, #-2]! @ r8 = reference samples -1..0; r2 -= 2 (writeback)
++ orr r9, r11, r9, lsl #16 @ r9 = reference samples 1..2
++ vmov d1, r8, r9 @ d1 = reference samples -1..2 (d0 window moved back by one sample)
++ mov r12, r4
++ vmul.u16 d4, d0, d2 @ first result is started ahead of the loop (d4 is carried between patches)
++ vmla.u16 d4, d1, d3
++patch_h_up_4x4_10_continue: @ Entry for following patches: reuses r2, r8-r10, r12, d0-d4 from the previous patch
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ add r12, r4 @ r12 += angle add
++ cmp r12, #33 @ C set when r12 >= 33 => reference window must advance
++ it cs
++ addcs r10, r7 @ (cs) r10 += r7 (inverse-angle accumulator step)
++ mov r11, #0
++ itt cs
++ subcs r12, #32
++ tstcs r10, #1<<31 @ (cs) Z set when r10 is non-negative
++ rsb r6, r12, #32
++ it eq
++ asreq r11, r10, #7 @ (eq) r11 = r10 >> 7 ...
++ it cs
++ vmovcs d0, r8, r9 @ (cs) d0 = previous d1 window
++ it eq
++ biceq r11, #1 @ ... with bit 0 cleared: an even byte offset from r1
++ vdup.16 d2, r6
++ it cs
++ lsrcs r6, r8, #16
++ vdup.16 d3, r12
++ vext.16 q8, q8, q9, #4 @ result queue: d16 = d17, d17 = d18
++ itt cs
++ orrcs r9, r6, r9, lsl #16 @ (cs) high word of the window shifted back by one sample
++ ldrhcs r11, [r1, r11] @ (cs) r11 = halfword at r1 + r11
++ vmov d18, d19
++ it hi
++ ldrhhi r11, [r2, #-2]! @ (hi) take the halfword at r2 - 2 instead; r2 -= 2 - NOTE(review): relies on C staying set across the tst above
++ vrshr.u16 d19, d4, #5 @ newest result = rounded (d4 >> 5)
++ itt cs
++ orrcs r8, r11, r8, lsl #16 @ (cs) new sample enters the low end of the window
++ vmovcs d1, r8, r9
++ vmul.u16 d4, d0, d2 @ start the next result: d4 = d0 * d2
++ subs r5, #1
++ vmla.u16 d4, d1, d3 @ d4 += d1 * d3
++ bne 1b
++
++ b store_tran_4x4_10
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_10 - 4x4 block of 16-bit samples (8 bytes per row)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3]
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode (read before the push moves sp)
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2
++ ADRT r7, inv_angle - 11*2
++ add r7, r7, r12, lsl #1 @ r7 = inv_angle + (mode - 11) * 2
++ lsl r3, #1 @ r3 = stride * 2 - presumably stride is passed in 2-byte samples; confirm against caller
++ ldrsb r6, [r4, r12] @ r6 = angle_2[mode - 2]
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = the same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_4x4_10 @ a single 4x4 patch covers the whole block
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ mov r10, #-128 @ initial inverse-angle accumulator for the patch routine
++ bl patch_h_up_4x4_10
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ ldrh lr, [r2, #-2] @ Top-left
++ ldrh r7, [r7] @ r7 = inv_angle[mode - 11]
++ vmov d0, r8, r9 @ d0 = top samples 0..3
++ lsl r9, r9, #16
++ vdup.16 d2, r12
++ orr r9, r9, r8, lsr #16
++ orr r8, lr, r8, lsl #16 @ r8:r9 = top-left followed by top samples 0..2
++ vmov d1, r8, r9
++ sub r1, r7, #128 @ r1 is reused as the inverse-angle accumulator (top pointer no longer needed)
++ mov r5, #3 @ 3 rows in the loop; the 4th is stored after it
++1:
++ sel lr, lr, lr @ force pipeline 0 on Cortex-A53
++ vdup.16 d3, r6
++ vmul.u16 d4, d0, d2
++ subs r12, r12, r4 @ N set => reference window must advance
++ vmla.u16 d4, d1, d3
++ itttt mi
++ addmi lr, r2, r1, asr #7 @ (mi) lr = r2 + (r1 >> 7) ...
++ bicmi lr, #1 @ ... with bit 0 cleared
++ addmi r12, r12, #32
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ rsb r6, r12, #32
++ itt mi
++ lslmi r9, r9, #16
++ ldrhmi lr, [lr]
++ vdup.16 d2, r12
++ vrshr.u16 d4, d4, #5
++ itttt mi
++ orrmi r9, r9, r8, lsr #16
++ orrmi r8, lr, r8, lsl #16 @ (mi) new sample enters the low end of the window
++ vmovmi d1, r8, r9
++ addmi r1, r1, r7 @ (mi) step the inverse-angle accumulator
++ subs r5, r5, #1
++ vst1.16 {d4}, [r0], r3 @ store one 8-byte row; r0 += r3
++ bne 1b
++
++ vdup.16 d3, r6
++ nop @ force next insn into pipeline 0 to enable
++ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53
++ vmla.u16 d4, d1, d3
++ vrshr.u16 d4, d4, #5
++ vst1.16 {d4}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ ldrd r8, r9, [r1] @ Top
++ rsb r12, r6, #32
++ vmov d0, r8, r9 @ d0 = top samples 0..3
++ vdup.16 d3, r6
++ lsr r8, #16
++ vdup.16 d2, r12
++ orr r8, r8, r9, lsl #16
++ ldr r9, [r1, #6]! @ r9 = top samples 3..4; r1 += 6 (writeback)
++ vmov d1, r8, r9 @ d1 = top samples 1..4
++ mov r5, #3 @ 3 rows in the loop; the 4th is stored after it
++1:
++ vmul.u16 d4, d0, d2
++ subs r12, r4 @ N set => reference window must advance
++ vmla.u16 d4, d1, d3
++ it mi
++ addmi r12, #32
++ rsb r6, r12, #32
++ itt mi
++ vmovmi d0, r8, r9 @ (mi) d0 = previous d1 window
++ lsrmi r8, #16
++ vdup.16 d2, r12
++ itt mi
++ orrmi r8, r8, r9, lsl #16
++ ldrmi r9, [r1, #2]! @ (mi) high word reloaded one sample further on; r1 += 2
++ vrshr.u16 d4, d4, #5
++ it mi
++ vmovmi d1, r8, r9
++ vdup.16 d3, r6
++ subs r5, #1
++ vst1.16 {d4}, [r0], r3 @ store one 8-byte row; r0 += r3
++ bne 1b
++
++ vmul.u16 d4, d0, d2
++ vmla.u16 d4, d1, d3
++ vrshr.u16 d4, d4, #5
++ vst1.16 {d4}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_10 - angular prediction, writes 8 rows of 8 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (doubled on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the push moves sp
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ r3 *= 2
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry (r6 is updated per row, r4 stays constant)
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ sub r0, #16 @ presumably undoes the r0 advance made by the two patch calls - helper not visible here
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10_continue
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r10, #-128
++
++ push {r2} @ keep the left pointer across the patch calls
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ pop {r2}
++
++ sub r0, #16
++ mov r10, #-128
++ add r2, #8 @ left pointer 4 samples (8 bytes) further on
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ sub r10, r10, r7, lsl #2 @ r10 = -128 - 4 * r7
++
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q9}, [r1] @ q9 = 8 samples from top
++ sub r1, r2, #2 @ r1 = left - one sample
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ vdup.16 q2, r6 @ q2 = weight r6 in every lane
++ vext.16 q8, q9, q9, #7 @ q8 = q9 moved up one lane (lane 0 replaced below)
++ sub r8, r7, #128 @ r8 = inv_angle accumulator
++ vld1.16 {d16[0]}, [r1] @ lane 0 of q8 = sample at left[-1]
++ vdup.16 q3, r12 @ q3 = weight r12 in every lane
++ mov r5, #7 @ 7 rows in the loops, the 8th is written after them
++1:
++ vmul.u16 q0, q9, q3
++ subs r12, r4 @ step the weight; C clear (borrow) selects the "cc" updates below
++ vmla.u16 q0, q8, q2 @ q0 = q9 * q3 + q8 * q2
++ ittt cc
++ asrcc r1, r8, #8 @ r1 = accumulator >> 8 (sample index)
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #1 @ r1 = left + 2 * index
++ vext.16 q10, q8, q8, #7 @ q10/q11 = the pair moved on by one lane, for loop 2
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmov q11, q8
++ sub r5, #1
++ vrshr.u16 q0, q0, #5 @ rounding shift right by 5
++ it cc
++ addcc r8, r7 @ advance the inv_angle accumulator
++ vld1.16 {d20[0]}, [r1] @ lane 0 of q10 = sample at r1
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3 @ store one row, r0 += r3
++ bhi 1b @ C set and rows left: same q8/q9 pair again
++ beq 4f @ no rows left
++2:
++ vmul.u16 q0, q11, q3
++ subs r12, r4
++ vmla.u16 q0, q10, q2 @ q0 = q11 * q3 + q10 * q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #1
++ vext.16 q8, q10, q10, #7 @ q8/q9 = the pair moved on by one lane, for loop 1
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.16 {d16[0]}, [r1]
++ teq r5, #0
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ C set and rows left: same q10/q11 pair again
++ bne 1b @ C clear and rows left: switch to the q8/q9 pair
++ bcc 5f @ no rows left: C picks which pair the last row uses
++3:
++ vmul.u16 q0, q11, q3 @ last row from the q10/q11 pair
++ vmla.u16 q0, q10, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b
++5:
++ vmul.u16 q0, q9, q3 @ last row from the q8/q9 pair
++ vmla.u16 q0, q8, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q9}, [r1]! @ q9 = 8 samples from top, r1 += 16
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vext.16 q8, q9, q9, #1 @ q8 = q9 moved down one lane (lane 7 replaced below)
++ vld1.16 {d17[3]}, [r1]! @ lane 7 of q8 = next top sample, r1 += 2
++ mov r5, #7 @ 7 rows in the loops, the 8th is written after them
++1:
++ vmul.u16 q0, q8, q2
++ subs r12, r4 @ step the weight; C clear (borrow) selects the "cc" updates below
++ vmla.u16 q0, q9, q3 @ q0 = q8 * q2 + q9 * q3
++ it cc
++ addcc r12, #32
++ vext.16 q10, q8, q8, #1 @ q10/q11 = the pair moved on by one lane, for loop 2
++ rsb r6, r12, #32
++ vld1.16 {d21[3]}, [r1] @ lane 7 of q10 = sample at r1
++ sub r5, #1
++ vmov q11, q8
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #2 @ move on one sample along top
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3 @ store one row, r0 += r3
++ bhi 1b @ C set and rows left: same q8/q9 pair again
++ beq 4f @ no rows left
++2:
++ vmul.u16 q0, q10, q2
++ subs r12, r4
++ vmla.u16 q0, q11, q3 @ q0 = q10 * q2 + q11 * q3
++ it cc
++ addcc r12, #32
++ vext.16 q8, q10, q10, #1 @ q8/q9 = the pair moved on by one lane, for loop 1
++ rsb r6, r12, #32
++ vld1.16 {d17[3]}, [r1]
++ sub r5, #1
++ vmov q9, q10
++ teq r5, #0
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #2
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ C set and rows left: same q10/q11 pair again
++ bne 1b @ C clear and rows left: switch to the q8/q9 pair
++ bcc 5f @ no rows left: C picks which pair the last row uses
++3:
++ vmul.u16 q0, q10, q2 @ last row from the q10/q11 pair
++ vmla.u16 q0, q11, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++4:
++ bcc 3b
++5:
++ vmul.u16 q0, q8, q2 @ last row from the q8/q9 pair
++ vmla.u16 q0, q9, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_10 - angular prediction, writes 16 rows of 16 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (doubled on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the push moves sp
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ r3 *= 2
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry, kept constant
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r10, #4 @ 4 bands of 4 rows
++ mov r1, r2 @ r1 keeps the left pointer for the current band
++1:
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2 @ and remember it for the next band
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ sub r0, #32 @ presumably undoes the r0 advance made by the four patch calls - helper not visible here
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r10, #-128
++ vmov.i8 d6, #1<<2 @ band counter kept in d6: shifted right once per band
++1:
++ push {r2, r10} @ keep left pointer and accumulator start across the patch calls
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ r8 = counter value before this band's shift
++ sub r0, #32
++ add r2, #8 @ left pointer 4 samples (8 bytes) further on
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ sub r10, r10, r7, lsl #2 @ r10 -= 4 * r7
++ vshr.u8 d6, #1
++ teq r8, #0 @ loop ends once the counter read back is zero
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q0-q1}, [r1] @ q0,q1 = 16 samples from top
++ sub r9, r2, #2 @ r9 = left - one sample
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128 @ r8 = inv_angle accumulator
++ vdup.16 q9, r6 @ q9 = weight r6 in every lane
++ vdup.16 q10, r12 @ q10 = weight r12 in every lane
++ mov r5, #16 @ row counter
++1:
++ vld1.16 {d17[3]}, [r9] @ top lane of q8 = sample at r9
++ add r8, r7 @ advance the inv_angle accumulator
++ vmov q2, q0 @ q2,q3 = previous line
++ vmov q3, q1
++ asr r9, r8, #8 @ r9 = accumulator >> 8 (sample index)
++ vext.16 q1, q0, q1, #7 @ q0,q1 = line moved up one lane ...
++ add r9, r2, r9, lsl #1 @ r9 = left + 2 * index, for the next pass
++ vext.16 q0, q8, q0, #7 @ ... with the sample just loaded entering lane 0
++2:
++ vmul.u16 q11, q2, q10
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmla.u16 q11, q0, q9 @ q11 = q2 * q10 + q0 * q9
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q3, q10
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmla.u16 q12, q1, q9 @ q12 = q3 * q10 + q1 * q9
++ sub r5, #1
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5 @ rounding shift right by 5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3 @ store one row, r0 += r3
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new sample
++
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 = address just past the 16 samples loaded below
++ vld1.16 {q0-q1}, [r1]! @ q0,q1 = 16 samples from top, r1 += 32
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vld1.16 {d16[0]}, [r5] @ lane 0 of q8 = the 17th sample
++ mov r5, #16 @ row counter
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++1:
++ vmov q2, q0 @ q2,q3 = previous line
++ add r1, #2 @ r1 = next sample to shift in
++ vmov q3, q1
++ vext.16 q0, q0, q1, #1 @ q0,q1 = line moved down one lane ...
++ vext.16 q1, q1, q8, #1 @ ... with lane 0 of q8 entering at the top
++2:
++ vmul.u16 q11, q0, q9
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmla.u16 q11, q2, q10 @ q11 = q0 * q9 + q2 * q10
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q1, q9
++ rsb r6, r12, #32
++ vmla.u16 q12, q3, q10 @ q12 = q1 * q9 + q3 * q10
++ sub r5, #1
++ vld1.16 {d16[0]}, [r1] @ fetch the sample the next shift would bring in
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3 @ store one row, r0 += r3
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new sample
++
++ pop {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_10 - angular prediction, writes 32 rows of 32 16-bit samples
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (doubled on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the pushes move sp
++ push {r4-r11, lr}
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #1 @ r3 *= 2
++ vpush {d8} @ q4 (d8/d9) is used by the 18: and 26: paths
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ add sp, #8 @ drop the saved d8 slot without reloading it
++ mov r10, #8 @ 8 bands of 4 rows
++ mov r1, r2 @ r1 keeps the left pointer for the current band
++1:
++ bl patch_h_down_4x4_10
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++ bl patch_h_down_4x4_10_continue
++
++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*2 @ and remember it for the next band
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ sub r0, #64 @ presumably undoes the r0 advance made by the eight patch calls - helper not visible here
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++ add sp, #8 @ drop the saved d8 slot without reloading it
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r10, #-128
++ vmov.i8 d6, #1<<6 @ band counter kept in d6: shifted right once per band
++1:
++ push {r2, r10} @ keep left pointer and accumulator start across the patch calls
++ bl patch_h_up_4x4_10
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ bl patch_h_up_4x4_10_continue
++ pop {r2, r10}
++
++ vmov r8, s12 @ r8 = counter value before this band's shift
++ sub r0, #64
++ add r2, #8 @ left pointer 4 samples (8 bytes) further on
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ sub r10, r10, r7, lsl #2 @ r10 -= 4 * r7
++ vshr.u8 d6, #1
++ teq r8, #0 @ loop ends once the counter read back is zero
++ bne 1b
++
++ pop {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++ add r5, r1, #32
++ vld1.16 {q1-q2}, [r1] @ q1..q4 = 32 samples from top
++ rsb r12, r6, r6, lsl #16 @ r12 = (r6 << 16) - r6
++ vld1.16 {q3-q4}, [r5]
++ sub r9, r2, #2 @ r9 = left - one sample
++ rsb r4, r12, #0 @ r4 = -r12: packed per-row step for both halfwords
++ rsb r12, r12, #32 << 16 @ r12 = (32 << 16) - r12: two weights packed in one register
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128 @ r8 = inv_angle accumulator
++ vmov d0, d9 @ park d9 in d0 while q4 is in use (d8 was pushed)
++ vmov s2, r12 @ d1[0] = low halfword of r12, d1[1] = high halfword
++ add r10, r0, #32 @ r10 = output pointer for the right half of each row
++ mov r5, #32 @ row counter
++1:
++ vld1.16 {d1[3]}, [r9] @ top lane of q0 = sample at r9
++ add r8, r7 @ advance the inv_angle accumulator
++ vmov q11, q4 @ q8..q11 = previous line
++ vmov q10, q3
++ asr r9, r8, #8 @ r9 = accumulator >> 8 (sample index)
++ vmov q9, q2
++ add r9, r2, r9, lsl #1 @ r9 = left + 2 * index, for the next pass
++ vmov q8, q1
++ vext.16 q4, q3, q4, #7 @ q1..q4 = line moved up one lane ...
++ vext.16 q3, q2, q3, #7
++ vext.16 q2, q1, q2, #7
++ vext.16 q1, q0, q1, #7 @ ... with the sample just loaded entering lane 0
++2:
++ vmul.u16 q12, q8, d1[1]
++ adds r12, r4 @ step both packed weights at once
++ vmla.u16 q12, q1, d1[0] @ q12 = q8 * d1[1] + q1 * d1[0]
++ it cc
++ addcc r12, #32 << 16 @ on C clear: wrap the high halfword by +32 ...
++ vmul.u16 q13, q9, d1[1]
++ it cc
++ subcc r12, #32 @ ... and the low halfword by -32
++ vmla.u16 q13, q2, d1[0]
++ sub r5, #1
++ vmul.u16 q14, q10, d1[1]
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the adds
++ vmla.u16 q14, q3, d1[0]
++ vmul.u16 q15, q11, d1[1]
++ vmla.u16 q15, q4, d1[0]
++ vmov s2, r12 @ refresh the packed weights in d1[0]/d1[1]
++ vrshr.u16 q12, q12, #5 @ rounding shift right by 5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3 @ left half of the row
++ vst1.16 {q14-q15}, [r10], r3 @ right half of the row
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new sample
++
++ vpop {d8} @ restore d8 from the stack
++ vmov d9, d0 @ restore d9 from its parking place
++ pop {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.16 {q1-q2}, [r1] @ q1..q4 = 32 samples from top
++ rsb r12, r6, r6, lsl #16 @ r12 = (r6 << 16) - r6
++ vld1.16 {q3-q4}, [r5]
++ add r1, r1, #64 @ r1 = first sample after the 32 just loaded
++ rsb r4, r12, #0 @ r4 = -r12: packed per-row step for both halfwords
++ rsb r12, r12, #32 << 16 @ r12 = (32 << 16) - r12: two weights packed in one register
++ vmov d1, d9 @ park d9 in d1 while q4 is in use (d8 was pushed)
++ vmov s1, r12 @ d0[2] = low halfword of r12, d0[3] = high halfword
++ add r10, r0, #32 @ r10 = output pointer for the right half of each row
++ mov r5, #32 @ row counter
++1:
++ vld1.16 {d0[0]}, [r1]! @ lane 0 of q0 = next top sample, r1 += 2
++ vmov q8, q1 @ q8..q11 = previous line
++ vmov q9, q2
++ vmov q10, q3
++ vmov q11, q4
++ vext.16 q1, q1, q2, #1 @ q1..q4 = line moved down one lane ...
++ vext.16 q2, q2, q3, #1
++ vext.16 q3, q3, q4, #1
++ vext.16 q4, q4, q0, #1 @ ... with the sample just loaded entering at the top
++2:
++ vmul.u16 q12, q1, d0[2]
++ adds r12, r4 @ step both packed weights at once
++ vmla.u16 q12, q8, d0[3] @ q12 = q1 * d0[2] + q8 * d0[3]
++ it cc
++ addcc r12, #32 << 16 @ on C clear: wrap the high halfword by +32 ...
++ vmul.u16 q13, q2, d0[2]
++ it cc
++ subcc r12, #32 @ ... and the low halfword by -32
++ vmla.u16 q13, q9, d0[3]
++ sub r5, #1
++ vmul.u16 q14, q3, d0[2]
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the adds
++ vmla.u16 q14, q10, d0[3]
++ vmul.u16 q15, q4, d0[2]
++ vmla.u16 q15, q11, d0[3]
++ vmov s1, r12 @ refresh the packed weights in d0[2]/d0[3]
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3 @ left half of the row
++ vst1.16 {q14-q15}, [r10], r3 @ right half of the row
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new sample
++
++ vpop {d8} @ restore d8 from the stack
++ vmov d9, d1 @ restore d9 from its parking place
++ pop {r4-r11, pc}
++
++endfunc
++
++
++
++@ Generate 4x4 chroma patch
++@
++@ In (const)
++@ r1 Up ptr (_up only)
++@ r3 Out stride
++@ r4 Angle add
++@ r7 Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
++@ r2 Left ptr - updated
++@ r6 Angle frac (init to r4 + 32)
++@ r8 Inv angle accumulator
++@ q2 Cur Line - load before 1st call for down - set by _up
++@ q8 Cur Line - load before 1st call for up - set by _down
++@ NOTE(review): in the code below q2/q3 hold the weights, q12/q13 (_down) or q1/q12 (_up) the lines and q8-q11 the results - confirm the q2/q8/temps entries
++@ Temps
++@ r5 Loop counter
++@ r12
++@ d0, q1, q12-q15
++
++patch_h_down_c_4x4_10:
++ vld1.16 {q12}, [r2]! @ q12 = 4 sample pairs from left, r2 += 16
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vdup.16 q2, r6 @ q2 = weight r6 in every lane
++ vdup.16 q3, r12 @ q3 = weight r12 in every lane
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ vmov q13, q12 @ q13 = current line
++ vext.16 q12, q12, q12, #2 @ q12 = line moved on by one pair ...
++ vld1.32 {d25[1]}, [r2]! @ ... with the next pair from left in the top slot, r2 += 4
++patch_h_down_c_4x4_10_continue:
++2:
++ vmov q8, q9 @ results move along q8 <- q9 <- q10 <- q11
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmul.u16 q0, q13, q3
++ it cc
++ addcc r12, #32
++ vmla.u16 q0, q12, q2 @ q0 = q13 * q3 + q12 * q2
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmov q9, q10
++ sub r5, #1
++ vmov q10, q11
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vrshr.u16 q11, q0, #5 @ newest result, rounding shift right by 5
++ bhi 2b @ C set and more to do: same lines again
++ bne 1b @ C clear and more to do: move on one pair first
++
++ bcs 3f @ patch done: skip the move-on if C is set
++ vmov q13, q12 @ otherwise leave the lines moved on for a following _continue call
++ vext.16 q12, q12, q12, #2
++ vld1.32 {d25[1]}, [r2]!
++3:
++
++store_tran_c_4x4_10: @ write q8-q11 out as 4 rows of 16 bytes; entered by fall-through and by branch
++T add r6, r0, r3
++ vzip.32 q8, q10 @ interleave the 32-bit pairs of q8 with q10 ...
++A add r6, r0, r3
++T lsl r3, #1
++ vzip.32 q9, q11 @ ... and of q9 with q11
++A add r5, r0, r3, lsl #1
++T add r5, r0, r3
++ vst2.32 {d16,d18}, [r0]! @ row at r0, r0 += 16 (start of the next patch)
++A lsl r3, #1
++ vst2.32 {d17,d19}, [r6], r3 @ row at old r0 + r3, then r6 += 2 * original r3
++ asr r3, #1 @ undo the temporary doubling of r3
++ vst2.32 {d20,d22}, [r5] @ row at old r0 + 2 * original r3
++ mov r5, #4 @ reset the loop counter for a following _continue call
++ vst2.32 {d21,d23}, [r6] @ row at old r0 + 3 * original r3
++ bx lr
++
++patch_h_up_c_4x4_10: @ 4x4 chroma patch; new sample pairs come from r2 or r1 depending on the sign of r8
++ vld1.16 {q1}, [r2] @ q1 = 4 sample pairs from left
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vdup.16 q2, r6 @ q2 = weight r6 in every lane
++ vdup.16 q3, r12 @ q3 = weight r12 in every lane
++ mov r5, #4 @ 4 result vectors per patch
++1:
++ adds r8, r7 @ advance the inv_angle accumulator, N set if still negative
++ vmov q12, q1 @ q12 = current line
++ it mi
++ ldrmi r6, [r2, #-4]! @ negative: step r2 back one pair and load it
++ vext.16 q1, q1, q1, #6 @ q1 = line moved up one pair (low pair replaced below)
++ itt pl
++ asrpl r6, r8, #8 @ non-negative: index = accumulator >> 8 ...
++ ldrpl r6, [r1, r6, lsl #2] @ ... and load that pair via r1
++ vmov s4, r6 @ low pair of q1 = the pair just loaded
++patch_h_up_c_4x4_10_continue:
++2:
++ vmov q8, q9 @ results move along q8 <- q9 <- q10 <- q11
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmul.u16 q0, q12, q3
++ it cc
++ addcc r12, #32
++ vmla.u16 q0, q1, q2 @ q0 = q12 * q3 + q1 * q2
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmov q9, q10
++ sub r5, #1
++ vmov q10, q11
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vrshr.u16 q11, q0, #5 @ newest result, rounding shift right by 5
++ bhi 2b @ C set and more to do: same lines again
++ bne 1b @ C clear and more to do: fetch a new pair first
++
++ bcs store_tran_c_4x4_10 @ patch done with C set: store straight away
++ adds r8, r7 @ otherwise fetch the next pair before storing, as at 1: above
++ vmov q12, q1
++ it mi
++ ldrmi r6, [r2, #-4]!
++ vext.16 q1, q1, q1, #6
++ itt pl
++ asrpl r6, r8, #8
++ ldrpl r6, [r1, r6, lsl #2]
++ vmov s4, r6
++ b store_tran_c_4x4_10
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_10 - chroma angular prediction, writes 4 rows of 16 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (multiplied by 4 on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the push moves sp
++ push {r4-r8, lr}
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #2 @ r3 *= 4
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry, kept constant
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ bl patch_h_down_c_4x4_10 @ a single 4x4 patch covers the whole block
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - r7: accumulator start for the patch
++ bl patch_h_up_c_4x4_10
++ pop {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q9}, [r1] @ q9 = 4 sample pairs from top
++ sub r1, r2, #4 @ r1 = left - one pair
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ vdup.16 q2, r6 @ q2 = weight r6 in every lane
++ vext.16 q8, q9, q9, #6 @ q8 = q9 moved up one pair (low pair replaced below)
++ sub r8, r7, #128 @ r8 = inv_angle accumulator
++ vld1.32 {d16[0]}, [r1] @ low pair of q8 = pair at left[-1]
++ vdup.16 q3, r12 @ q3 = weight r12 in every lane
++ mov r5, #3 @ 3 rows in the loops, the 4th is written after them
++1:
++ vmul.u16 q0, q9, q3
++ subs r12, r4 @ step the weight; C clear (borrow) selects the "cc" updates below
++ vmla.u16 q0, q8, q2 @ q0 = q9 * q3 + q8 * q2
++ ittt cc
++ asrcc r1, r8, #8 @ r1 = accumulator >> 8 (pair index)
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #2 @ r1 = left + 4 * index
++ vext.16 q10, q8, q8, #6 @ q10/q11 = the lines moved on by one pair, for loop 2
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmov q11, q8
++ sub r5, #1
++ vrshr.u16 q0, q0, #5 @ rounding shift right by 5
++ it cc
++ addcc r8, r7 @ advance the inv_angle accumulator
++ vld1.32 {d20[0]}, [r1] @ low pair of q10 = pair at r1
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3 @ store one row, r0 += r3
++ bhi 1b @ C set and rows left: same q8/q9 lines again
++ beq 4f @ no rows left
++2:
++ vmul.u16 q0, q11, q3
++ subs r12, r4
++ vmla.u16 q0, q10, q2 @ q0 = q11 * q3 + q10 * q2
++ ittt cc
++ asrcc r1, r8, #8
++ addcc r12, #32
++ addcc r1, r2, r1, lsl #2
++ vext.16 q8, q10, q10, #6 @ q8/q9 = the lines moved on by one pair, for loop 1
++ rsb r6, r12, #32
++ vmov q9, q10
++ sub r5, #1
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r8, r7
++ vld1.32 {d16[0]}, [r1]
++ teq r5, #0
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ C set and rows left: same q10/q11 lines again
++ bne 1b @ C clear and rows left: switch to the q8/q9 lines
++ bcc 5f @ no rows left: C picks which lines the last row uses
++3:
++ vmul.u16 q0, q11, q3 @ last row from the q10/q11 lines
++ vmla.u16 q0, q10, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++4:
++ bcc 3b
++5:
++ vmul.u16 q0, q9, q3 @ last row from the q8/q9 lines
++ vmla.u16 q0, q8, q2
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ vld1.16 {q9}, [r1]! @ q9 = 4 sample pairs from top, r1 += 16
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vext.16 q8, q9, q9, #2 @ q8 = q9 moved down one pair (top pair replaced below)
++ vld1.32 {d17[1]}, [r1]! @ top pair of q8 = next top pair, r1 += 4
++ mov r5, #3 @ 3 rows in the loops, the 4th is written after them
++1:
++ vmul.u16 q0, q8, q2
++ subs r12, r4 @ step the weight; C clear (borrow) selects the "cc" updates below
++ vmla.u16 q0, q9, q3 @ q0 = q8 * q2 + q9 * q3
++ it cc
++ addcc r12, #32
++ vext.16 q10, q8, q8, #2 @ q10/q11 = the lines moved on by one pair, for loop 2
++ rsb r6, r12, #32
++ vld1.32 {d21[1]}, [r1] @ top pair of q10 = pair at r1
++ sub r5, #1
++ vmov q11, q8
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #4 @ move on one pair along top
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3 @ store one row, r0 += r3
++ bhi 1b @ C set and rows left: same q8/q9 lines again
++ beq 4f @ no rows left
++2:
++ vmul.u16 q0, q10, q2
++ subs r12, r4
++ vmla.u16 q0, q11, q3 @ q0 = q10 * q2 + q11 * q3
++ it cc
++ addcc r12, #32
++ vext.16 q8, q10, q10, #2 @ q8/q9 = the lines moved on by one pair, for loop 1
++ rsb r6, r12, #32
++ vld1.32 {d17[1]}, [r1]
++ sub r5, #1
++ vmov q9, q10
++ teq r5, #0
++ vrshr.u16 q0, q0, #5
++ it cc
++ addcc r1, #4
++ vdup.16 q2, r6
++ vdup.16 q3, r12
++ vst1.16 {q0}, [r0], r3
++ bhi 2b @ C set and rows left: same q10/q11 lines again
++ bne 1b @ C clear and rows left: switch to the q8/q9 lines
++ bcc 5f @ no rows left: C picks which lines the last row uses
++3:
++ vmul.u16 q0, q10, q2 @ last row from the q10/q11 lines
++ vmla.u16 q0, q11, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++4:
++ bcc 3b
++5:
++ vmul.u16 q0, q8, q2 @ last row from the q8/q9 lines
++ vmla.u16 q0, q9, q3
++ vrshr.u16 q0, q0, #5
++ vst1.16 {q0}, [r0]
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_10 - chroma angular prediction, writes 8 rows of 32 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (multiplied by 4 on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the push moves sp
++ push {r4-r8, lr} @ only r4-r8 are saved: nothing below may write r9-r11
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #2 @ r3 *= 4
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry, kept constant
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ mov r1, r2 @ save r2 - r1 unused by patch_down
++
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10_continue
++
++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
++ sub r0, #32 @ undo the two 16-byte advances of r0 made by the patch stores
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10_continue
++
++ pop {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - r7: accumulator start for the first band
++
++ push {r2, r8} @ keep left pointer and accumulator start across the patch calls
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++ pop {r2, r8}
++
++ sub r0, #32 @ undo the two 16-byte advances of r0 made by the patch stores
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ add r2, #16 @ left pointer 4 pairs (16 bytes) further on
++ sub r8, r8, r7, lsl #2 @ r8 -= 4 * r7
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++
++ pop {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++ vld1.16 {q0-q1}, [r1] @ q0,q1 = 8 sample pairs from top; r1 is free after this
++ sub r1, r2, #4 @ r1 = left - one pair (r1, not r9: r9 is callee-saved and not pushed)
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128 @ r8 = inv_angle accumulator
++ vdup.16 q9, r6 @ q9 = weight r6 in every lane
++ vdup.16 q10, r12 @ q10 = weight r12 in every lane
++ mov r5, #8 @ row counter
++1:
++ vld1.32 {d17[1]}, [r1] @ top pair of q8 = pair at r1
++ add r8, r7 @ advance the inv_angle accumulator
++ vmov q2, q0 @ q2,q3 = previous line
++ vmov q3, q1
++ asr r1, r8, #8 @ r1 = accumulator >> 8 (pair index)
++ vext.16 q1, q0, q1, #6 @ q0,q1 = line moved up one pair ...
++ add r1, r2, r1, lsl #2 @ r1 = left + 4 * index, for the next pass
++ vext.16 q0, q8, q0, #6 @ ... with the pair just loaded entering at the bottom
++2:
++ vmul.u16 q11, q2, q10
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmla.u16 q11, q0, q9 @ q11 = q2 * q10 + q0 * q9
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q3, q10
++ rsb r6, r12, #32 @ r6 = 32 - r12
++ vmla.u16 q12, q1, q9 @ q12 = q3 * q10 + q1 * q9
++ sub r5, #1
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5 @ rounding shift right by 5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3 @ store one row, r0 += r3
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new pair
++
++ pop {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32 @ r5 = address just past the 8 pairs loaded below
++ vld1.16 {q0-q1}, [r1]! @ q0,q1 = 8 sample pairs from top, r1 += 32
++ rsb r12, r6, #32 @ r12 = 32 - r6
++ vld1.32 {d16[0]}, [r5] @ low pair of q8 = the 9th pair
++ mov r5, #8 @ row counter
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++1:
++ vmov q2, q0 @ q2,q3 = previous line
++ add r1, #4 @ r1 = next pair to shift in
++ vmov q3, q1
++ vext.16 q0, q0, q1, #2 @ q0,q1 = line moved down one pair ...
++ vext.16 q1, q1, q8, #2 @ ... with the low pair of q8 entering at the top
++2:
++ vmul.u16 q11, q0, q9
++ subs r12, r4 @ step the weight; C clear (borrow) means wrap
++ vmla.u16 q11, q2, q10 @ q11 = q0 * q9 + q2 * q10
++ it cc
++ addcc r12, #32
++ vmul.u16 q12, q1, q9
++ rsb r6, r12, #32
++ vmla.u16 q12, q3, q10 @ q12 = q1 * q9 + q3 * q10
++ sub r5, #1
++ vld1.32 {d16[0]}, [r1] @ fetch the pair the next shift would bring in
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the subs
++ vdup.16 q9, r6
++ vdup.16 q10, r12
++ vrshr.u16 q11, q11, #5
++ vrshr.u16 q12, q12, #5
++ vst1.16 {q11-q12}, [r0], r3 @ store one row, r0 += r3
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new pair
++
++ pop {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_10 - chroma angular prediction, writes 16 rows of 64 bytes
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride [r3] (multiplied by 4 on entry before use as the row step)
++@ unsigned int mode [sp, #0] 2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
++ ldr r12, [sp] @ r12 = mode, read before the pushes move sp
++ push {r4-r10, lr}
++ ADRT r4, angle_2 - 2 @ byte table, biased so it can be indexed by mode directly
++ ADRT r7, inv_angle - 11*2 @ halfword table, biased by 11 entries
++ add r7, r7, r12, lsl #1 @ r7 = address of the inv_angle entry for this mode
++ lsl r3, #2 @ r3 *= 4
++ vpush {d8} @ q4 (d8/d9) is used by the 18: and 26: paths
++ ldrsb r6, [r4, r12] @ r6 = signed angle table entry for this mode
++ cmp r12, #26
++ ldrsb r4, [r4, r12] @ r4 = same table entry
++ bge 26f @ mode >= 26
++ cmp r12, #18
++ bge 18f @ 18 <= mode < 26
++ cmp r12, #10
++ bge 10f @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left
++ add sp, #8 @ drop the saved d8 slot without reloading it
++ mov r10, #4 @ 4 bands of 4 rows
++ mov r1, r2 @ r1 keeps the left pointer for the current band
++1:
++ bl patch_h_down_c_4x4_10
++ bl patch_h_down_c_4x4_10_continue
++ bl patch_h_down_c_4x4_10_continue
++ bl patch_h_down_c_4x4_10_continue
++
++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
++ add r1, r1, #4*4 @ and remember it for the next band
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ sub r0, #64 @ undo the four 16-byte advances of r0 made by the patch stores
++ subs r10, #1
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ bne 1b
++
++ pop {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++ add sp, #8 @ drop the saved d8 slot without reloading it
++ mov r10, #4 @ 4 bands of 4 rows
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128
++ sub r8, r7 @ r8 = -128 - r7: accumulator start for the first band
++2:
++ push {r2, r8} @ keep left pointer and accumulator start across the patch calls
++ bl patch_h_up_c_4x4_10
++ bl patch_h_up_c_4x4_10_continue
++ bl patch_h_up_c_4x4_10_continue
++ bl patch_h_up_c_4x4_10_continue
++ pop {r2, r8}
++
++ sub r0, #64 @ undo the four 16-byte advances of r0 made by the patch stores
++ mov r6, r4 @ reload r6 from the constant copy in r4
++ add r2, #16 @ left pointer 4 pairs (16 bytes) further on
++ sub r8, r8, r7, lsl #2 @ r8 -= 4 * r7
++ add r0, r0, r3, lsl #2 @ r0 += 4 * r3 (four rows down)
++ subs r10, #1
++ bne 2b
++
++ pop {r4-r10, pc}
++
++@ Left of vertical - works down left
++18:
++ add r5, r1, #32
++ vld1.16 {q1-q2}, [r1] @ q1..q4 = 16 sample pairs from top
++ rsb r12, r6, r6, lsl #16 @ r12 = (r6 << 16) - r6
++ vld1.16 {q3-q4}, [r5]
++ sub r9, r2, #4 @ r9 = left - one pair
++ rsb r4, r12, #0 @ r4 = -r12: packed per-row step for both halfwords
++ rsb r12, r12, #32 << 16 @ r12 = (32 << 16) - r12: two weights packed in one register
++ ldrh r7, [r7] @ r7 = inv_angle table entry
++ mov r8, #-128 @ r8 = inv_angle accumulator
++ vmov d0, d9 @ park d9 in d0 while q4 is in use (d8 was pushed)
++ vmov s2, r12 @ d1[0] = low halfword of r12, d1[1] = high halfword
++ add r10, r0, #32 @ r10 = output pointer for the right half of each row
++ mov r5, #16 @ row counter
++1:
++ vld1.32 {d1[1]}, [r9] @ top pair of q0 = pair at r9
++ add r8, r7 @ advance the inv_angle accumulator
++ vmov q11, q4 @ q8..q11 = previous line
++ vmov q10, q3
++ asr r9, r8, #8 @ r9 = accumulator >> 8 (pair index)
++ vmov q9, q2
++ add r9, r2, r9, lsl #2 @ r9 = left + 4 * index, for the next pass
++ vmov q8, q1
++ vext.16 q4, q3, q4, #6 @ q1..q4 = line moved up one pair ...
++ vext.16 q3, q2, q3, #6
++ vext.16 q2, q1, q2, #6
++ vext.16 q1, q0, q1, #6 @ ... with the top pair of q0 entering at the bottom
++2:
++ vmul.u16 q12, q8, d1[1]
++ adds r12, r4 @ step both packed weights at once
++ vmla.u16 q12, q1, d1[0] @ q12 = q8 * d1[1] + q1 * d1[0]
++ it cc
++ addcc r12, #32 << 16 @ on C clear: wrap the high halfword by +32 ...
++ vmul.u16 q13, q9, d1[1]
++ it cc
++ subcc r12, #32 @ ... and the low halfword by -32
++ vmla.u16 q13, q2, d1[0]
++ sub r5, #1
++ vmul.u16 q14, q10, d1[1]
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the adds
++ vmla.u16 q14, q3, d1[0]
++ vmul.u16 q15, q11, d1[1]
++ vmla.u16 q15, q4, d1[0]
++ vmov s2, r12 @ NOTE(review): this also overwrites d1[1], the pair loaded at 1: - confirm intended
++ vrshr.u16 q12, q12, #5 @ rounding shift right by 5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3 @ left half of the row
++ vst1.16 {q14-q15}, [r10], r3 @ right half of the row
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new pair
++
++ vpop {d8} @ restore d8 from the stack
++ vmov d9, d0 @ restore d9 from its parking place
++ pop {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++ add r5, r1, #32
++ vld1.16 {q1-q2}, [r1] @ q1..q4 = 16 sample pairs from top
++ rsb r12, r6, r6, lsl #16 @ r12 = (r6 << 16) - r6
++ vld1.16 {q3-q4}, [r5]
++ add r1, r1, #64 @ r1 = first pair after the 16 just loaded
++ rsb r4, r12, #0 @ r4 = -r12: packed per-row step for both halfwords
++ rsb r12, r12, #32 << 16 @ r12 = (32 << 16) - r12: two weights packed in one register
++ vmov d1, d9 @ park d9 in d1 while q4 is in use (d8 was pushed)
++ vmov s1, r12 @ d0[2] = low halfword of r12, d0[3] = high halfword
++ add r10, r0, #32 @ r10 = output pointer for the right half of each row
++ mov r5, #16 @ row counter
++1:
++ vld1.32 {d0[0]}, [r1]! @ low pair of q0 = next top pair, r1 += 4
++ vmov q8, q1 @ q8..q11 = previous line
++ vmov q9, q2
++ vmov q10, q3
++ vmov q11, q4
++ vext.16 q1, q1, q2, #2 @ q1..q4 = line moved down one pair ...
++ vext.16 q2, q2, q3, #2
++ vext.16 q3, q3, q4, #2
++ vext.16 q4, q4, q0, #2 @ ... with the pair just loaded entering at the top
++2:
++ vmul.u16 q12, q1, d0[2]
++ adds r12, r4 @ step both packed weights at once
++ vmla.u16 q12, q8, d0[3] @ q12 = q1 * d0[2] + q8 * d0[3]
++ it cc
++ addcc r12, #32 << 16 @ on C clear: wrap the high halfword by +32 ...
++ vmul.u16 q13, q2, d0[2]
++ it cc
++ subcc r12, #32 @ ... and the low halfword by -32
++ vmla.u16 q13, q9, d0[3]
++ sub r5, #1
++ vmul.u16 q14, q3, d0[2]
++ teq r5, #0 @ Z = (r5 == 0); the branches below still use C from the adds
++ vmla.u16 q14, q10, d0[3]
++ vmul.u16 q15, q4, d0[2]
++ vmla.u16 q15, q11, d0[3]
++ vmov s1, r12 @ refresh the packed weights in d0[2]/d0[3]
++ vrshr.u16 q12, q12, #5
++ vrshr.u16 q13, q13, #5
++ vrshr.u16 q14, q14, #5
++ vrshr.u16 q15, q15, #5
++ vst1.16 {q12-q13}, [r0], r3 @ left half of the row
++ vst1.16 {q14-q15}, [r10], r3 @ right half of the row
++ bhi 2b @ C set and rows left: same lines again
++ bne 1b @ C clear and rows left: shift in a new pair
++
++ vpop {d8} @ restore d8 from the stack
++ vmov d9, d1 @ restore d9 from its parking place
++ pop {r4-r10, pc}
++
++endfunc
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+@@ -0,0 +1,705 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_8 - 4x4 DC prediction of bytes with smoothed top row and left column
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_8, export=1
++
++ @ Average the els of top & left
++ ldr r2, [r2] @ r2 = left[0..3] as one word
++ vld1.32 {d0[0]}, [r1] @ low half of d0 = top[0..3]
++ mov r1, #2
++ vmov s1, r2 @ high half of d0 = left[0..3]
++ vmov s2, r2 @ low half of d1 = left[0..3]
++ vmov.i16 q2, #3
++ add r2, r0, r3 @ r2 = second output row; r0/r2 then step two rows at a time
++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
++ lsl r3, #1
++ vmovl.u8 q0, d0 @ q0 = top[0..3], left[0..3] widened to 16 bits
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3 @ dc = (sum of 8 samples + 4) >> 3
++ vmla.i16 q0, q2, d6[0] @ add 2*dc to lane 0 and 3*dc to the others
++ vdup.8 d6, d6[0] @ d6 = dc in every byte
++ vrshrn.i16 d0, q0, #2 @ d0 = smoothed top (bytes 0..3) and left (bytes 4..7)
++
++ @ Store top line
++ vst1.32 {d0[0]}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d1, d0, #5*8 @ smoothed left[1] into byte 0
++ vshr.u64 d2, d0, #6*8 @ smoothed left[2] into byte 0
++ vshr.u64 d3, d0, #7*8 @ smoothed left[3] into byte 0
++ vbif d1, d6, d7 @ every byte except byte 0 becomes dc
++ vbif d2, d6, d7
++ vst1.32 {d1[0]}, [r2], r3
++ vbif d3, d6, d7
++ vst1.32 {d2[0]}, [r0]
++ vst1.32 {d3[0]}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_8 - chroma DC prediction, fills 4 rows of 8 bytes (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1] @ d0 = 8 bytes from top
++ vld1.8 {d1}, [r2] @ d1 = 8 bytes from left
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T lsl r3, #1
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1 @ q0 = top + left, widened to 16 bits
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0
++ vrshrn.u16 d0, q1, #3 @ (sum of 8 samples + 4) >> 3, narrowed to bytes
++
++ @ Store
++ vst1.8 {d0}, [r0], r3 @ r0 and r2 each step two rows at a time
++ vst1.8 {d0}, [r2], r3
++ vst1.8 {d0}, [r0]
++ vst1.8 {d0}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_8 - 8x8 DC prediction of bytes with smoothed top row and left column
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {d0}, [r1] @ d0 = top[0..7]
++ mov r1, #2
++ vld1.8 {d16}, [r2] @ d16 = left[0..7]
++ vmov.i16 q2, #3
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
++ vmovl.u8 q0, d0 @ q0 = top widened to 16 bits
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vmovl.u8 q1, d16 @ q1 = left widened to 16 bits
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4 @ dc = (sum of 16 samples + 8) >> 4
++ vmla.i16 q1, q2, d6[0] @ left + 3*dc (lane 0 gets 2*dc but is shifted out below)
++ vmla.i16 q0, q2, d6[0] @ top + 3*dc, lane 0 = top[0] + left[0] + 2*dc
++ vdup.8 d6, d6[0] @ d6 = dc in every byte
++ vrshrn.i16 d2, q1, #2 @ d2 = smoothed left column
++ vrshrn.i16 d0, q0, #2 @ d0 = smoothed top row
++
++ @ Store top line
++ vst1.8 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d2, #8 @ smoothed left[1] into byte 0
++ vbit d6, d2, d7 @ byte 0 of the dc row = smoothed left value
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ mov r1, #6 @ six more rows, two per loop pass
++1:
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ subs r1, #2
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8 - chroma DC prediction, fills 8 rows of 16 bytes (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1] @ q0 = 16 bytes from top
++ mov r1, #8 @ row counter
++ vld1.8 {q1}, [r2] @ q1 = 16 bytes from left
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1 @ fold top in half, widened to 16 bits
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q1, d2, d3 @ fold left in half, widened to 16 bits
++ vadd.i16 q1, q0
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This adds U & V separately
++ vpadd.i32 d3, d3, d3
++ vrshrn.u16 d0, q1, #4 @ (sum of 16 samples + 8) >> 4, narrowed to bytes
++ vrshrn.u16 d1, q1, #4
++
++ @ Store
++1:
++ vst1.8 {q0}, [r0], r3 @ r0 and r2 each step two rows at a time
++ subs r1, #4 @ four rows per loop pass
++ vst1.8 {q0}, [r2], r3
++ vst1.8 {q0}, [r0], r3
++ vst1.8 {q0}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8 - 16x16 DC prediction of bytes with smoothed top row and left column
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8}, [r1] @ q8 = top[0..15]
++ mov r1, #2
++ vld1.8 {q9}, [r2] @ q9 = left[0..15]
++ vaddl.u8 q10, d16, d17 @ fold top in half, widened to 16 bits
++ vaddl.u8 q11, d16, d18 @ d22[0] = top[0] + left[0]
++ vaddl.u8 q0, d18, d19 @ fold left in half, widened to 16 bits
++ vmov.i16 q1, #3
++ vadd.i16 q10, q0
++ vmovl.u8 q0, d18 @ q0 = left[0..7] widened
++ vadd.i16 d20, d21
++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmovl.u8 q2, d16 @ q2 = top[0..7] widened
++ vmovl.u8 q9, d19 @ q9 = left[8..15] widened
++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vmovl.u8 q8, d17 @ q8 = top[8..15] widened
++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
++ vmov.i64 d7, #0xff @ mask selecting byte 0
++ vpadd.i16 d20, d20 @ 1 (all the same)
++ vrshr.u16 d21, d20, #5 @ dc = (sum of 32 samples + 16) >> 5, in both halves of q10
++ vrshr.u16 d20, d20, #5
++ vmla.i16 q0, q10, d2[1] @ left + 3*dc
++ vmla.i16 q9, q10, d2[1]
++ vmla.i16 q2, q10, q1 @ top + 3*dc, lane 0 = top[0] + left[0] + 2*dc
++ vmla.i16 q8, q10, d2[1]
++ vdup.8 q1, d20[0] @ q1 = dc in every byte
++ vrshrn.i16 d0, q0, #2 @ q0 = smoothed left column
++ vrshrn.i16 d1, q9, #2
++ vrshrn.i16 d4, q2, #2 @ q2 = smoothed top row
++ vrshrn.i16 d5, q8, #2
++ vext.8 q0, q0, q0, #1 @ rotate so left[1] sits in byte 0
++
++ @ Store top line
++ vst1.8 {q2}, [r0], r3
++
++ @ Store the rest
++ mov r1, #15 @ remaining rows
++1:
++ vbit d2, d0, d7 @ byte 0 of the dc row = smoothed left value
++ vext.8 q0, q0, q0, #1 @ next left value into byte 0
++ subs r1, #1
++ vst1.8 {q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8 - chroma DC prediction, fills 16 rows of 32 bytes (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1] @ q0,q1 = 32 bytes from top
++ mov r1, #16 @ row counter
++ vld1.8 {q2-q3}, [r2] @ q2,q3 = 32 bytes from left
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1 @ fold each 16 bytes in half, widened to 16 bits
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vaddl.u8 q1, d2, d3
++A lsl r3, #2
++T lsl r3, #1
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0
++ vrshrn.u16 d0, q2, #5 @ (sum of 32 samples + 16) >> 5, narrowed to bytes
++ vrshrn.u16 d1, q2, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q2, #5
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3 @ r0 and r2 each step two rows at a time
++ subs r1, #2 @ two rows per loop pass
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8 - 32x32 DC prediction of bytes (no edge smoothing)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1] @ q0,q1 = top[0..31]
++ mov r1, #32 @ row counter
++ vld1.8 {q2-q3}, [r2] @ q2,q3 = left[0..31]
++ add r2, r0, r3 @ r2 = second output row; r0/r2 then step two rows at a time
++ vaddl.u8 q0, d0, d1 @ fold each 16 bytes in half, widened to 16 bits
++ lsl r3, #1
++ vaddl.u8 q1, d2, d3
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0
++ vrshrn.u16 d0, q2, #6 @ (sum of 64 samples + 32) >> 6, narrowed to bytes
++ vrshrn.u16 d1, q2, #6
++ vrshrn.u16 d2, q2, #6
++ vrshrn.u16 d3, q2, #6
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2 @ two rows per loop pass
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that our
++@ intermediate results must not overflow the 16 bits they are stored in.
++@ All these functions are good to 10 bits - with the worst case being
++@ in dc_32 where we use all 16 bits.
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10 - 4x4 DC prediction of 16-bit samples with smoothed top row and left column
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {d0}, [r1] @ d0 = top[0..3]
++ mov r1, #2
++ vld1.16 {d1}, [r2] @ d1 = left[0..3]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #3 @ dc = (sum of 8 samples + 4) >> 3, in every lane of d6
++ vmla.i16 q0, q2, d6[0] @ add 2*dc to lane 0 and 3*dc to the others
++ vrshr.u16 q0, #2 @ d0 = smoothed top row, d1 = smoothed left column
++
++ @ Store top line
++ vst1.16 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d3, d1, #1*16 @ smoothed left[1] into lane 0
++ vshr.u64 d4, d1, #2*16 @ smoothed left[2] into lane 0
++ vshr.u64 d5, d1, #3*16 @ smoothed left[3] into lane 0
++ vbif d3, d6, d7 @ every lane except lane 0 becomes dc
++ vbif d4, d6, d7
++ vst1.16 {d3}, [r2], r3
++ vbif d5, d6, d7
++ vst1.16 {d4}, [r0]
++ vst1.16 {d5}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
++ @ Chroma DC prediction, 4x4 block of interleaved 16-bit U,V pairs; U and V are averaged independently
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1] @ 4 top U,V pairs (16 bytes)
++ vld1.8 {q1}, [r2] @ 4 left U,V pairs
++A add r2, r0, r3, lsl #2 @ A/T lines are alternative ARM / Thumb forms: r2 -> row 1
++A lsl r3, #3 @ r3 = two rows in bytes
++T lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q0, q1
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d2, d0, d0 @ This adds U & V separately
++ vpadd.i32 d3, d0, d0 @ q1 = (sum U, sum V) repeated
++ vrshr.u16 q0, q1, #3 @ 8 pels per component: (sum + 4) >> 3
++
++ vst1.16 {q0}, [r0], r3 @ row 0
++ vst1.16 {q0}, [r2], r3 @ row 1
++ vst1.16 {q0}, [r0] @ row 2
++ vst1.16 {q0}, [r2] @ row 3
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_10, export=1
++ @ DC prediction, 8x8 block of 16-bit pels, with smoothing of the first row and first column
++ @ Average the els of top & left
++ vld1.16 {q0}, [r1] @ 8 top pels
++ mov r1, #2 @ weight of dc for the corner pel
++ vld1.16 {q8}, [r2] @ 8 left pels
++T lsl r3, #1
++ vmov.i16 q2, #3 @ weight of dc for the other edge pels
++A add r2, r0, r3, lsl #1 @ A/T lines are alternative ARM / Thumb forms: r2 -> row 1
++T add r2, r0, r3
++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
++A lsl r3, #2 @ r3 = two rows in bytes
++T lsl r3, #1
++ vmov.i64 d7, #0xffff @ mask selecting lane 0 only
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4 @ dc = (sum of 16 pels + 8) >> 4
++ vmla.i16 q8, q2, d6[0] @ left + weight * dc (lane 0 is not used below)
++ vmla.i16 q0, q2, d6[0] @ top + weight * dc
++ vdup.16 q2, d6[0] @ q2, q9 = rows of plain dc for the even / odd rows
++ vdup.16 q9, d6[0]
++ vrshr.u16 q8, q8, #2
++ vrshr.u16 q0, q0, #2
++ vext.16 q1, q8, q8, #1 @ q1[0] = smoothed left[1]
++
++ @ Store top line
++ vst1.16 {q0}, [r0], r3
++
++ @ Store the rest
++ vbit d18, d2, d7 @ row 1: lane 0 = smoothed left[1]
++ vst1.16 {q9}, [r2], r3
++ mov r1, #6 @ six rows remain
++1:
++ vext.16 q8, q8, q8, #2 @ rotate so lane 0 = next even-row left pel
++ subs r1, #2
++ vext.16 q1, q1, q1, #2 @ rotate so lane 0 = next odd-row left pel
++ vbit d4, d16, d7
++ vst1.16 {q2}, [r0], r3 @ even row
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3 @ odd row
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++ @ Chroma DC prediction, 8x8 block of interleaved 16-bit U,V pairs; U and V are averaged independently
++ @ Average the els of top & left
++ vld1.16 {q0-q1}, [r1] @ 8 top U,V pairs
++ mov r1, #8 @ r1 = rows still to store
++ vld1.16 {q2-q3}, [r2] @ 8 left U,V pairs
++T lsl r3, #2
++ vadd.i16 q1, q0
++A add r2, r0, r3, lsl #2 @ A/T lines are alternative ARM / Thumb forms: r2 -> row 1
++A lsl r3, #3 @ r3 = two rows in bytes
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q2, q3
++ vadd.i16 q1, q2
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This add U & V separately
++ vpadd.i32 d3, d3, d3 @ q1 = (sum U, sum V) repeated
++ vrshr.u16 q0, q1, #4 @ 16 pels per component: (sum + 8) >> 4
++ vrshr.u16 q1, q1, #4 @ q0-q1 = one full 32-byte row
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3 @ even row
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3 @ odd row
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++ @ DC prediction, 16x16 block of 16-bit pels, with smoothing of the first row and first column
++ @ Average the els of top & left
++ vld1.16 {q8-q9}, [r1] @ 16 top pels
++ mov r1, #2 @ weight of dc for the corner pel
++ vld1.16 {q10-q11}, [r2] @ 16 left pels
++ lsl r3, #1 @ stride given in pels
++ vadd.i16 q0, q8, q9
++ vadd.i16 q1, q10, q11
++ vmov.i16 q3, #3 @ weight of dc for the other edge pels
++ vadd.i16 q1, q0 @ q1 = 8 partial sums covering all 32 pels
++ vadd.i16 d0, d16, d20 @ d0[0] = top[0] + left[0]
++ vmov.i64 d31, #0xffff @ mask selecting lane 0 only
++ vadd.i16 d3, d2
++ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ topline[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7]
++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d3, d3 @ 1 (all the same)
++ vrshr.u16 d2, d3, #5 @ dc = (sum of 32 pels + 16) >> 5
++ vrshr.u16 d3, d3, #5 @ q1 = dc in every lane
++ vmov q0, q1 @ q0-q1 = a row of plain dc
++ vmla.i16 q10, q1, d6[1] @ left[0..7] + 3*dc (d6[1] holds 3)
++ vmla.i16 q11, q1, d6[1] @ left[8..15] + 3*dc
++ vmla.i16 q8, q1, q3 @ top[0..7] + dc * (2, 3, 3, ...)
++ vmla.i16 q9, q1, d6[1] @ top[8..15] + 3*dc
++ vrshr.u16 q2, q10, #2 @ q2 = smoothed left[0..7]
++ vrshr.u16 q3, q11, #2 @ q3 = smoothed left[8..15]
++ vrshr.u16 q8, #2
++ vrshr.u16 q9, #2
++ vext.16 q2, q2, q2, #1 @ q2[0] = smoothed left[1]
++ mov r1, #7<<29 @ 3-bit counter in the top bits: 7 rows in loop one, wraps to give 8 in loop two
++
++ @ Store top line
++ vst1.16 {q8-q9}, [r0], r3
++
++ @ Store the rest
++1:
++ vbit d0, d4, d31 @ lane 0 = smoothed left pel for this row (rows 1..7)
++ vext.16 q2, q2, q2, #1
++ subs r1, #1<<29
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++1:
++ vbit d0, d6, d31 @ lane 0 = smoothed left pel for this row (rows 8..15)
++ vext.16 q3, q3, q3, #1
++ subs r1, #1<<29 @ first pass wraps 0 to 7<<29, so this loop runs 8 times
++ vst1.16 {q0-q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
++ @ Chroma DC prediction, 16x16 block of interleaved 16-bit U,V pairs; U and V are averaged independently
++ @ Average the els of top & left
++ vldm r1, {q0-q3} @ 16 top U,V pairs
++ vldm r2, {q8-q11} @ 16 left U,V pairs
++ vadd.i16 q0, q1
++ mov r1, #16 @ r1 = rows still to store
++ vadd.i16 q2, q3
++ add r2, r0, #32 @ r2 -> second half of the same 64-byte row
++ vadd.i16 q8, q9
++ lsl r3, #2 @ r3 = one row in bytes
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0 @ q2 = (sum U, sum V) repeated
++ vrshr.u16 q0, q2, #5 @ 32 pels per component: (sum + 16) >> 5
++ vrshr.u16 q1, q2, #5 @ q0-q1 = half a row
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3 @ left half of the row
++ subs r1, #1 @ one row per iteration
++ vst1.16 {q0-q1}, [r2], r3 @ right half of the row
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels)
++
++function ff_hevc_rpi_pred_dc_32_neon_10, export=1
++ @ DC prediction, 32x32 block of 16-bit pels: every pel = (sum of 32 top + 32 left + 32) >> 6
++ @ Average the els of top & left
++ @ With 10 bits we are (just) safe from overflow in i16
++ vldm r1, {q0-q3} @ 32 top pels
++ vldm r2, {q8-q11} @ 32 left pels
++ vadd.i16 q0, q1
++ mov r1, #32 @ r1 = rows still to store
++ vadd.i16 q2, q3
++ add r2, r0, #32 @ r2 -> second half of the same 64-byte row
++ vadd.i16 q8, q9
++ lsl r3, #1 @ r3 = one row in bytes
++ vadd.i16 q10, q11
++ vadd.u16 q0, q2
++ vadd.u16 q8, q10
++ vadd.i16 q0, q8 @ q0 = 8 partial sums covering all 64 pels
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0 @ q2 = total in every lane
++ vrshr.u16 q0, q2, #6 @ round and divide by 64
++ vrshr.u16 q1, q2, #6 @ q0-q1 = half a row of dc
++
++ @ Store
++1:
++ vst1.16 {q0-q1}, [r0], r3 @ left half of the row
++ subs r1, #1 @ one row per iteration
++ vst1.16 {q0-q1}, [r2], r3 @ right half of the row
++ bne 1b
++
++ bx lr
++endfunc
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+@@ -0,0 +1,881 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ All functions have the call
++@
++@ int ff_hevc_rpi_intra_filter_N_neon_PW(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++@
++@ Assumptions:
++@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
++@ if reusing this code)
++@
++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
++@
++@ We always have at least 64 pixel H frame width rounding - this lets us
++@ load UR without having to worry about exactly how many pixels are actually
++@ within the frame. As partial loads will only occur very occasionally this
++@ should be a win in nearly all cases.
++@
++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
++@ so we do no maths on the contents
++@
++@ No filtering in 32bit fns as they are chroma only
++
++
++.equ AVAIL_UR, 1 @ req / avail bit 0: up-right
++.equ AVAIL_U, 2 @ req / avail bit 1: up
++.equ AVAIL_UL, 4 @ req / avail bit 2: up-left
++.equ AVAIL_L, 8 @ req / avail bit 3: left
++.equ AVAIL_DL, 16 @ req / avail bit 4: down-left
++
++.equ FILTER_LIGHT, 0x40 @ req bit tested by the 8 & 16 pel luma functions
++.equ FILTER_STRONG, 0x80 @ defined but not referenced by the functions in this file
++
++.equ AVAIL_S_UR_N_U_C, 32 - 1 @ lsls by this: UR bit -> N, U bit -> C
++.equ AVAIL_S_U_N_UL_C, 32 - 2 @ lsls by this: U bit -> N, UL bit -> C
++.equ AVAIL_S_UL_N_L_C, 32 - 3 @ lsls by this: UL bit -> N, L bit -> C
++.equ AVAIL_S_L_N_DL_C, 32 - 4 @ lsls by this: L bit -> N, DL bit -> C
++
++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
++
++@ On entry
++@ r2 req
++@ r3 avail
++@ [sp, #sp_offset...] args
++@
++@ On Exit:
++@
++@ Extend values:
++@ d_l scalar contains value for L & DL
++@ if DL avail then this is DL[0] so we don't need to load that
++@ d_ul scalar containing value for UL
++@ d_u scalar containing value for U
++@ d_ur scalar containing value for UR
++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
++@ This means that L-light-filter works even if nreq DL (we never filter
++@ req-DL without req-L, but we do filter req-L without req-DL)
++@ If UR avail then d_ur == a_ur so U-filter good too
++@
++@ Data load pointers (only load if req & avail):
++@ r4 DL + stride
++@ r10 L
++@ r6 U
++@ r5 UR
++@
++@ Others:
++@ r2 req
++@ r7 req & avail
++@ r3 L + stride
++@ r8 DL + stride * 2
++@ r9 stride * 2
++@ cs Load U
++@ mi Load UR
++@
++@ Clobbered:
++@ r12
++
++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
++ @ Loads the stack args, derives a load address for each neighbour (substituting the nearest available one) and loads the four extend values
++.equ src_l\@, \sp_offset + 0
++.equ src_u\@, \sp_offset + 4
++.equ src_ur\@, \sp_offset + 8
++.equ stride\@, \sp_offset + 12
++.equ pw\@, (1 << \pw_s) @ pel width in bytes
++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes
++
++@ r9 stride
++@ r7 = ab_ul, r6 = a_u, r5 = a_ur
++@ r4 = b_dl, r10 = b_l, r8 = b_u
++
++ ldr r5, [sp, #src_ur\@]
++ lsl r12, r3, #AVAIL_S_U_DL_CPSR @ move avail bits 4..0 up to bits 31..27
++ ldr r10, [sp, #src_l\@]
++ ldr r9, [sp, #stride\@] @ used below directly as a byte offset between rows
++ ldr r6, [sp, #src_u\@]
++
++ @ This is quite a slow instruction but it replaces
++ @ a decent number of tests that yield a max of 2 flags/op
++ @ It is annoying we can't branch on Q!
++ @ If L navail (ne) then DL must be navail (pl)
++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur
++
++ mov r4, r5 @ r4 = substitute source, starting with UR
++ sub r7, r10, r9 @ r7 = UL address: one row above L
++ it vs
++ movvs r4, r6 @ U avail: prefer U as substitute
++ add r8, r6, #b_size\@ - pw\@ @ r8 = last pel of U
++ it cs
++ movcs r4, r7 @ UL avail: prefer UL as substitute
++ ite ne
++ movne r10, r4 @ L not avail: read L from the substitute
++ addeq r4, r7, r9, lsl #\log2_s @ L avail: r4 = last row of L
++ it cc
++ movcc r7, r10 @ UL not avail: read UL from L (or its substitute)
++ it mi
++ addmi r4, r10, r9, lsl #\log2_s @ DL avail: r4 = first row of DL
++ vld1.\d_type {\d_ul}, [r7]
++ itt vc
++ movvc r8, r7 @ U not avail: read U (and last-of-U) from UL (or its substitute)
++ movvc r6, r7
++ vld1.\d_type {\d_l }, [r4], r9 @ r4 advances one row
++ tst r3, #AVAIL_UR
++ vld1.\d_type {\d_u }, [r6]
++ it eq
++ moveq r5, r8 @ UR not avail: read UR from last pel of U (or its substitute)
++ and r7, r2, r3 @ r7 = req & avail
++ add r8, r4, r9 @ r8 = r4 + one row, for alternating-row loads
++ vld1.\d_type {\d_ur}, [r5]
++ lsls r12, r7, #AVAIL_S_UR_N_U_C @ exit flags: mi = load UR, cs = load U
++ add r3, r10, r9 @ r3 = L + one row
++ lsl r9, #1 @ r9 = two rows
++.endm
++
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 0 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++ @ Gathers neighbours of a 4-pel block of 8-bit pels: 8 bytes left+down-left, 8 bytes up+up-right, 1 byte up-left; no filtering
++function ff_hevc_rpi_intra_filter_4_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
++
++ it cs
++ vldrcs s2, [r6] @ U wanted & available: low half of d1 = 4 up pels
++ ite pl
++ vmovpl s3, s4 @ no UR: high half of d1 = UR extend value from d2
++ vldrmi s3, [r5] @ UR wanted & available: high half of d1 = 4 up-right pels
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ bpl 1f
++
++ vld1.8 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10]
++ vld1.8 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d0[5]}, [r4], r9 @ d0[4] already holds the first DL pel from load_pointers
++ vld1.8 {d0[6]}, [r8]
++ vld1.8 {d0[7]}, [r4]
++1:
++ vstr d1, [r1] @ Up
++ vst1.8 {d31[7]}, [r12] @ Up-left
++ vstr d0, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++ @ Gathers neighbours of a 4-pel block of 16-bit pels: 16 bytes left+down-left, 16 bytes up+up-right, one up-left pel; no filtering
++function ff_hevc_rpi_intra_filter_4_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
++
++ it cs
++ vldrcs d2, [r6] @ U wanted & available: 4 up pels
++ it mi
++ vldrmi d3, [r5] @ UR wanted & available: 4 up-right pels
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10]
++ vld1.16 {d0[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d1[1]}, [r4], r9 @ d1[0] already holds the first DL pel from load_pointers
++ vld1.16 {d1[2]}, [r8]
++ vld1.16 {d1[3]}, [r4]
++1:
++ vst1.16 {q1}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vst1.16 {q0}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_8(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 0 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++ @ Gathers neighbours of an 8-pel block of 8-bit pels and, if req has FILTER_LIGHT, applies the (1,2,1)/4 smoothing
++function ff_hevc_rpi_intra_filter_8_neon_8, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
++
++ it cs
++ vldrcs d4, [r6] @ U wanted & available: 8 up pels
++ it mi
++ vldrmi d5, [r5] @ UR wanted & available: 8 up-right pels
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ bpl 1f
++ vld1.8 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.8 {d0[1]}, [r3], r9
++ vld1.8 {d0[2]}, [r10], r9
++ vld1.8 {d0[3]}, [r3], r9
++ vld1.8 {d0[4]}, [r10], r9
++ vld1.8 {d0[5]}, [r3], r9
++ vld1.8 {d0[6]}, [r10]
++ vld1.8 {d0[7]}, [r3]
++1:
++ bcc 1f
++ vld1.8 {d1[1]}, [r4], r9 @ d1[0] already holds the first DL pel from load_pointers
++ vld1.8 {d1[2]}, [r8], r9
++ vld1.8 {d1[3]}, [r4], r9
++ vld1.8 {d1[4]}, [r8], r9
++ vld1.8 {d1[5]}, [r4], r9
++ vld1.8 {d1[6]}, [r8]
++ vld1.8 {d1[7]}, [r4]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ beq 10f
++
++ @ Luma light filter
++ vext.8 q8, q15, q2, #15 @ q8 = ul, up[0..14] (each pel preceded by its neighbour)
++ vext.8 q12, q15, q0, #15 @ q12 = ul, left[0..14]
++ vaddl.u8 q9, d17, d5 @ p[i-1] + p[i], widened to 16 bits
++ vaddl.u8 q8, d16, d4
++ vaddl.u8 q13, d25, d1
++ vaddl.u8 q12, d24, d0
++ vmov.u8 r3, d5[7] @ Save final pel
++ vmov.u8 r2, d1[7] @ Save final pel
++
++ vext.16 q2, q8, q9, #1 @ p[i] + p[i+1]
++ vext.16 q3, q9, q9, #1 @ last lane wraps; that pel is restored unfiltered below
++ vext.16 q0, q12, q13, #1
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8 @ p[i-1] + 2*p[i] + p[i+1]
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12
++ vadd.u16 q1, q13
++
++ vrshrn.u16 d4, q2, #2 @ round, divide by 4, narrow back to 8 bits
++ vrshrn.u16 d5, q3, #2
++ vrshrn.u16 d0, q0, #2
++ vrshrn.u16 d1, q1, #2
++ vrshr.u16 d30, #2
++ vmov.u8 d5[7], r3 @ Restore final pel
++ vmov.u8 d1[7], r2 @ Restore final pel
++ vdup.u8 d31, d30[0] @ d31[7] = d30[0] (filtered up-left in every byte lane)
++
++10:
++ vst1.8 {q2 }, [r1] @ Up
++ vst1.8 {d31[7]}, [r12] @ Up-left
++ vst1.8 {q0 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++ @ Gathers neighbours of an 8-pel block of 16-bit pels and, if req has FILTER_LIGHT, applies the (1,2,1)/4 smoothing
++function ff_hevc_rpi_intra_filter_8_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5} @ U wanted & available: 8 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ UR not wanted or not available: keep the extend value
++ cmp r12, #4
++ vldm r5, {d6, d7} @ 8 up-right pels
++ bgt 1f
++ vdup.16 d7, d6[3] @ only 4 valid: extend the last valid pel
++1:
++ lsls r12, r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ vdup.16 q1, d0[0] @ q1 (down-left) defaults to the L/DL extend value
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10]
++ vld1.16 {d1[3]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.16 {d2[1]}, [r4], r9 @ d2[0] already holds the first DL pel
++ cmp r12, #p_size
++ vld1.16 {d2[2]}, [r8], r9
++ vld1.16 {d2[3]}, [r4], r9
++ blt 2f @ fewer than 8 DL pels valid
++ vld1.16 {d3[0]}, [r8], r9
++ vld1.16 {d3[1]}, [r4], r9
++ vld1.16 {d3[2]}, [r8]
++ vld1.16 {d3[3]}, [r4]
++ b 1f
++2:
++ vdup.16 d3, d2[3] @ extend the last valid DL pel
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ beq 10f
++
++ @ Luma light filter
++ vext.16 q9, q2, q3, #7 @ each vector becomes the same pels preceded by their neighbour
++ vext.16 q8, q15, q2, #7 @ q8 = ul, up[0..6]
++ vext.16 q13, q0, q1, #7
++ vext.16 q12, q15, q0, #7 @ q12 = ul, left[0..6]
++ vadd.u16 q9, q3 @ p[i-1] + p[i]
++ vadd.u16 q8, q2
++ vadd.u16 q13, q1
++ vadd.u16 q12, q0
++ vmov.u16 r3, d7[3] @ Save final pel
++ vmov.u16 r2, d3[3] @ Save final pel
++
++ vext.16 q2, q8, q9, #1 @ p[i] + p[i+1]
++ vext.16 q3, q9, q9, #1 @ last lane wraps; that pel is restored unfiltered below
++ vext.16 q0, q12, q13, #1
++ vext.16 q1, q13, q13, #1
++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q2, q8 @ p[i-1] + 2*p[i] + p[i+1]
++ vadd.u16 q3, q9
++ vadd.u16 q0, q12
++ vadd.u16 q1, q13
++
++ vrshr.u16 q2, #2 @ round and divide by 4
++ vrshr.u16 q3, #2
++ vrshr.u16 q0, #2
++ vrshr.u16 q1, #2
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r3 @ Restore final pel
++ vmov.u16 d3[3], r2 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++
++10:
++ vst1.16 {q2, q3}, [r1] @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vst1.16 {q0, q1}, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_16_neon_16(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 1 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 4 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++ @ Gathers neighbours of a 16-pel block of 16-bit pels and, if req has FILTER_LIGHT, applies the (1,2,1)/4 smoothing
++function ff_hevc_rpi_intra_filter_16_neon_16, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.16 q9, d16[0] @ q8-q9 (up) default to the U extend value
++ vdup.16 q11, d20[0] @ q10-q11 (up-right) default to the UR extend value
++
++ it cs
++ vldmcs r6, {d16-d19} @ U wanted & available: 16 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ UR not wanted or not available
++ cmp r12, #12
++ @ Given chroma frame layout, if UR exists then it is always legit to
++ @ load all of it even if most of it is outside the frame.
++ vldm r5, {d20-d23}
++ bgt 1f @ more than 12 valid: nothing to extend
++ bge 4f @ exactly 12 valid: extend into d23 only
++ cmp r12, #8
++ bge 3f @ 8 valid: extend into d22, d23
++ vdup.16 d21, d20[3] @ fewer than 8 valid: extend into d21..d23
++3: vdup.16 d22, d21[3]
++4: vdup.16 d23, d22[3]
++
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ ldr r12, [sp, #dl_size]
++ vdup.16 q1, d0[0] @ rest of left and down-left default to the L/DL extend value
++ vdup.16 q2, d0[0]
++ vdup.16 q3, d0[0]
++ bpl 1f
++ vld1.16 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.16 {d0[1]}, [r3], r9
++ vld1.16 {d0[2]}, [r10], r9
++ vld1.16 {d0[3]}, [r3], r9
++ vld1.16 {d1[0]}, [r10], r9
++ vld1.16 {d1[1]}, [r3], r9
++ vld1.16 {d1[2]}, [r10], r9
++ vld1.16 {d1[3]}, [r3], r9
++ vld1.16 {d2[0]}, [r10], r9
++ vld1.16 {d2[1]}, [r3], r9
++ vld1.16 {d2[2]}, [r10], r9
++ vld1.16 {d2[3]}, [r3], r9
++ vld1.16 {d3[0]}, [r10], r9
++ vld1.16 {d3[1]}, [r3], r9
++ vld1.16 {d3[2]}, [r10]
++ vld1.16 {d3[3]}, [r3]
++1:
++ bcc 1f
++ vld1.16 {d4[1]}, [r4], r9 @ d4[0] already holds the first DL pel
++ cmp r12, #4
++ vld1.16 {d4[2]}, [r8], r9
++ vld1.16 {d4[3]}, [r4], r9
++ ble 2f @ 4 DL pels valid: extend into d5..d7
++ vld1.16 {d5[0]}, [r8], r9
++ vld1.16 {d5[1]}, [r4], r9
++ cmp r12, #12
++ vld1.16 {d5[2]}, [r8], r9
++ vld1.16 {d5[3]}, [r4], r9
++ blt 3f @ 8 valid: extend into d6, d7
++ vld1.16 {d6[0]}, [r8], r9
++ vld1.16 {d6[1]}, [r4], r9
++ vld1.16 {d6[2]}, [r8], r9
++ vld1.16 {d6[3]}, [r4], r9
++ ble 4f @ 12 valid (flags still from cmp with 12): extend into d7
++ vld1.16 {d7[0]}, [r8], r9
++ vld1.16 {d7[1]}, [r4], r9
++ vld1.16 {d7[2]}, [r8]
++ vld1.16 {d7[3]}, [r4]
++ b 1f
++2: vdup.16 d5, d4[3]
++3: vdup.16 d6, d5[3]
++4: vdup.16 d7, d6[3]
++1:
++ tst r2, #FILTER_LIGHT
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ beq 10f
++
++ vpush {q5} @ q5 is used as scratch below and restored by the vpop
++ @ Luma light filter
++ @ Left
++ vext.16 q5, q2, q3, #7 @ each vector becomes the same pels preceded by their neighbour
++ vext.16 q14, q1, q2, #7
++ vext.16 q13, q0, q1, #7
++ vext.16 q12, q15, q0, #7 @ q12 = ul, left[0..6]
++
++ vadd.u16 q5, q3 @ p[i-1] + p[i]
++ vadd.u16 q14, q2
++ vadd.u16 q13, q1
++ vadd.u16 q12, q0
++ vmov.u16 r2, d7[3] @ Save final pel
++
++ vext.16 q0, q12, q13, #1 @ p[i] + p[i+1]
++ vext.16 q1, q13, q14, #1
++ vext.16 q2, q14, q5, #1
++ vext.16 q3, q5, q5, #1 @ last lane wraps; that pel is restored unfiltered below
++
++ vmov d30, d24 @ d30[0] = l[0] + ul
++ vadd.u16 q0, q12 @ p[i-1] + 2*p[i] + p[i+1]
++ vadd.u16 q1, q13
++ vadd.u16 q2, q14
++ vadd.u16 q3, q5
++
++ vrshr.u16 q0, #2 @ round and divide by 4
++ vrshr.u16 q1, #2
++ vrshr.u16 q2, #2
++ vrshr.u16 q3, #2
++
++ @ Up
++ vext.16 q5, q10, q11, #7
++ vext.16 q14, q9, q10, #7
++ vext.16 q13, q8, q9, #7
++ vext.16 q12, q15, q8, #7 @ q12 = ul, up[0..6]; d31[3] still holds unfiltered ul
++
++ vadd.u16 q5, q11
++ vadd.u16 q14, q10
++ vadd.u16 q13, q9
++ vadd.u16 q12, q8
++ vmov.u16 r3, d23[3] @ Save final pel
++
++ vext.16 q8, q12, q13, #1
++ vext.16 q9, q13, q14, #1
++ vext.16 q10, q14, q5, #1
++ vext.16 q11, q5, q5, #1
++
++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0]
++ vadd.u16 q8, q12
++ vadd.u16 q9, q13
++ vadd.u16 q10, q14
++ vadd.u16 q11, q5
++
++ vrshr.u16 q8, #2
++ vrshr.u16 q9, #2
++ vrshr.u16 q10, #2
++ vrshr.u16 q11, #2
++
++ @ Misc
++ vrshr.u16 d30, #2
++ vmov.u16 d7[3], r2 @ Restore final pel
++ vmov.u16 d23[3], r3 @ Restore final pel
++ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
++ vpop {q5}
++
++10:
++ vstm r1, {d16-d23} @ Up
++ vst1.16 {d31[3]}, [r12] @ Up-left
++ vstm r0, { d0-d7 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_4_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 2 @ log2 of block size in pels
++ @ Gathers neighbours of a 4-pel block of 32-bit pels: 32 bytes left+down-left, 32 bytes up+up-right, one up-left pel; no filtering
++function ff_hevc_rpi_intra_filter_4_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
++
++ it cs
++ vldmcs r6, {d4, d5} @ U wanted & available: 4 up pels
++ it mi
++ vldmmi r5, {d6, d7} @ UR wanted & available: 4 up-right pels
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ vdup.32 q1, d0[0] @ q1 (down-left) defaults to the L/DL extend value
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10]
++ vld1.32 {d1[1]}, [r3]
++1:
++ bcc 1f
++ vld1.32 {d2[1]}, [r4], r9 @ d2[0] already holds the first DL pel
++ vld1.32 {d3[0]}, [r8]
++ vld1.32 {d3[1]}, [r4]
++1:
++ vst1.32 {q2, q3 }, [r1] @ Up
++ vst1.32 {d31[1]}, [r12] @ Up-left
++ vst1.32 {q0, q1 }, [r0] @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 3 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++ @ Gathers neighbours of an 8-pel block of 32-bit pels: 64 bytes left+down-left, 64 bytes up+up-right, one up-left pel; no filtering
++function ff_hevc_rpi_intra_filter_8_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
++
++ vdup.32 q9, d16[0] @ q8-q9 (up) default to the U extend value
++ vdup.32 q11, d20[0] @ q10-q11 (up-right) default to the UR extend value
++
++ it cs
++ vldmcs r6, {q8, q9 } @ U wanted & available: 8 up pels
++ ldr r12, [sp, #ur_size]
++ bpl 1f @ UR not wanted or not available
++ cmp r12, #p_size
++ vldm r5, {q10, q11} @ 8 up-right pels
++ bge 1f
++ vdup.32 q11, d21[1] @ fewer than 8 valid: extend the last pel of q10
++1:
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL
++ vdup.32 q1, d0[0] @ rest of left and down-left default to the L/DL extend value
++ vdup.32 q2, d0[0]
++ vdup.32 q3, d0[0]
++ bpl 1f
++ vld1.32 {d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.32 {d0[1]}, [r3], r9
++ vld1.32 {d1[0]}, [r10], r9
++ vld1.32 {d1[1]}, [r3], r9
++ vld1.32 {d2[0]}, [r10], r9
++ vld1.32 {d2[1]}, [r3], r9
++ vld1.32 {d3[0]}, [r10]
++ vld1.32 {d3[1]}, [r3]
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vld1.32 {d4[1]}, [r4], r9 @ d4[0] already holds the first DL pel
++ cmp r12, #p_size
++ vld1.32 {d5[0]}, [r8], r9
++ vld1.32 {d5[1]}, [r4], r9
++ blt 2f @ fewer than 8 DL pels valid
++ vld1.32 {d6[0]}, [r8], r9
++ vld1.32 {d6[1]}, [r4], r9
++ vld1.32 {d7[0]}, [r8]
++ vld1.32 {d7[1]}, [r4]
++ b 1f
++2:
++ vdup.32 q3, d5[1] @ extend the last valid DL pel
++1:
++ add r12, r0, #-pw @ r12 -> one pel before left[0], where up-left is stored
++ vstm r1, { q8-q11} @ Up
++ vst1.32 {d31[1]}, [r12] @ Up-left
++ vstm r0, { q0-q3 } @ Left
++ pop {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_16_neon_32(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++
++.set sp_base, 8*4 @ 8 registers are pushed, so stack args start at sp + 32
++.set ur_size, sp_base + 16 @ stack offset of top_right_size
++.set dl_size, sp_base + 20 @ stack offset of down_left_size
++.set pw_s, 2 @ log2 of pel width in bytes
++.set pw, (1 << pw_s)
++.set log2_s, 4 @ log2 of block size in pels
++.set p_size, (1 << log2_s) @ size in pels
++ @ Gathers neighbours of a 16-pel block of 32-bit pels: up at r1, up-right at r1+64, left at r0, down-left at r0+64, up-left at r0-4; no filtering
++function ff_hevc_rpi_intra_filter_16_neon_32, export=1
++ push {r4-r10, lr}
++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
++
++ @ Once we get this big we have run out of neon regs to store
++ @ everything at once so do in pieces
++ @ Up (have)
++ it cs
++ vldmcs r6, { q0-q3 } @ U wanted & available: 16 up pels
++ ldr r12, [sp, #ur_size]
++ it mi
++ vldmmi r5, { q8-q11} @ UR wanted & available: 16 up-right pels
++ it cs
++ vstmcs r1, { q0-q3 }
++ bpl 1f
++ cmp r12, #12
++ add lr, r1, #(pw << log2_s) @ lr -> up-right part of the top array
++ bgt 2f @ more than 12 valid: nothing to extend
++ bge 3f @ exactly 12 valid: extend into q11 only
++ cmp r12, #8
++ bge 4f @ 8 valid: extend into q10, q11
++ vdup.32 q9, d17[1] @ 4 valid: extend last pel of q8; pels are 32 bits wide so .32
++4: vdup.32 q10, d19[1] @ must be q10: d10 is half of q5, which this function never saves
++3: vdup.32 q11, d21[1]
++2: vstm lr, { q8-q11}
++1:
++
++ @ Left (have)
++ add lr, r0, #-pw @ lr -> one pel before left[0]
++ lsls r12, r7, #AVAIL_S_L_N_DL_C @ mi = load L, cs = load DL; r7 is kept for the fill pass
++ vst1.32 {d30[1]}, [lr] @ UL
++ bpl 1f
++ vld1.32 { d0[0]}, [r10], r9 @ r10 walks even rows, r3 odd rows, each stepping two rows
++ vld1.32 { d0[1]}, [r3], r9
++ vld1.32 { d1[0]}, [r10], r9
++ vld1.32 { d1[1]}, [r3], r9
++ vld1.32 { d2[0]}, [r10], r9
++ vld1.32 { d2[1]}, [r3], r9
++ vld1.32 { d3[0]}, [r10], r9
++ vld1.32 { d3[1]}, [r3], r9
++ vld1.32 { d4[0]}, [r10], r9
++ vld1.32 { d4[1]}, [r3], r9
++ vld1.32 { d5[0]}, [r10], r9
++ vld1.32 { d5[1]}, [r3], r9
++ vld1.32 { d6[0]}, [r10], r9
++ vld1.32 { d6[1]}, [r3], r9
++ vld1.32 { d7[0]}, [r10]
++ vld1.32 { d7[1]}, [r3]
++ vstm r0, { q0-q3 }
++1:
++ bcc 1f
++ ldr r12, [sp, #dl_size]
++ vdup.32 d16, d30[0] @ d16[0] = d30[0]
++ add lr, r0, #(pw << log2_s) @ lr -> down-left part of the left array
++ vld1.32 {d16[1]}, [r4], r9
++ cmp r12, #4
++ vld1.32 {d17[0]}, [r8], r9
++ vld1.32 {d17[1]}, [r4], r9
++ ble 2f @ 4 DL pels valid: extend into q9..q11
++ vld1.32 {d18[0]}, [r8], r9
++ vld1.32 {d18[1]}, [r4], r9
++ cmp r12, #12
++ vld1.32 {d19[0]}, [r8], r9
++ vld1.32 {d19[1]}, [r4], r9
++ blt 3f @ 8 valid: extend into q10, q11
++ vld1.32 {d20[0]}, [r8], r9
++ vld1.32 {d20[1]}, [r4], r9
++ vld1.32 {d21[0]}, [r8], r9
++ vld1.32 {d21[1]}, [r4], r9
++ ble 4f @ 12 valid (flags still from cmp with 12): extend into q11
++ vld1.32 {d22[0]}, [r8], r9
++ vld1.32 {d22[1]}, [r4], r9
++ vld1.32 {d23[0]}, [r8]
++ vld1.32 {d23[1]}, [r4]
++ b 5f
++2: vdup.32 q9, d17[1]
++3: vdup.32 q10, d19[1]
++4: vdup.32 q11, d21[1]
++5: vstm lr, { q8-q11}
++1:
++ eors r7, r2 @ r7 = bits of req that were not loaded above
++ beq 99f @ nothing left to fill
++
++ lsls r12, r7, #AVAIL_S_UR_N_U_C @ mi = fill UR, cs = fill U
++ vdup.32 q0, d31[0] @ U extend value
++ vdup.32 q1, d31[0]
++ vdup.32 q2, d31[0]
++ vdup.32 q3, d31[0]
++ add lr, r1, #(pw << log2_s)
++ vdup.32 q8, d31[1] @ UR extend value
++ vdup.32 q9, d31[1]
++ vdup.32 q10, d31[1]
++ vdup.32 q11, d31[1]
++ it cs
++ vstmcs r1, { q0-q3 }
++ it mi
++ vstmmi lr, { q8-q11}
++
++ lsls r7, #AVAIL_S_L_N_DL_C @ mi = fill L, cs = fill DL
++ vdup.32 q0, d30[0] @ L/DL extend value
++ vdup.32 q1, d30[0]
++ vdup.32 q2, d30[0]
++ vdup.32 q3, d30[0]
++ add lr, r0, #(pw << log2_s)
++ it mi
++ vstmmi r0, { q0-q3 }
++ it cs
++ vstmcs lr, { q0-q3 }
++
++99:
++ pop {r4-r10, pc}
++endfunc
++
++
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,920 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * Horizontal & Vertical special cases of angular intra pred
++ *
++ * Split out because:
++ * Vertical, at least, is relatively common
++ * Much simpler code than the general angular case
++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
++ *
++ * *** Currently luma filtering is mandatory where it occurs, but there are
++ * cases where it should be turned off (rdpcm & an extension sps flag).
++ * These don't occur in the standard conformance suite for Main Profile
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ ff_hevc_rpi_pred_vertical_4_neon_8 - vertical prediction, 4x4, 8-bit pels, with filtered first column
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.32 {d0[0]}, [r2 :32] @ Left
++ add r2, r0, r3 @ r2 -> row 1 (r0 -> row 0)
++ vld1.8 {d1[]}, [r1] @ top[0] in every lane
++ lsl r3, #1 @ each pointer steps two rows per store
++ vdup.8 d4, ip
++ vmov.i8 d2, #128
++ vhsub.u8 d4, d0, d4 @ (left[y] - top_left) >> 1
++ veor d1, d2 @ bias top[0] by 128 so the signed saturating add clips at 0 and 255
++ vld1.32 {d0[0]}, [r1 :32] @ Top
++ vqadd.s8 d1, d4 @ top[0] + ((left[y] - top_left) >> 1), saturated
++ vmov.i64 d3, #0xff @ mask selecting byte 0 only
++ vmov d4, d0 @ second copy of the top row, used for odd rows
++ veor d5, d1, d2 @ remove the bias: filtered column in d5 and d1
++ veor d1, d1, d2
++ vbit d0, d1, d3 @ row 0: byte 0 = filtered value for y = 0
++ vshr.u64 d5, #8 @ byte 0 = value for y = 1
++ vst1.32 {d0[0]}, [r0], r3
++ vshr.u64 d1, #16 @ byte 0 = value for y = 2
++ vbit d4, d5, d3
++ vshr.u64 d5, #16 @ byte 0 = value for y = 3
++ vst1.32 {d4[0]}, [r2], r3
++ vbit d0, d1, d3
++ vst1.32 {d0[0]}, [r0]
++ vbit d4, d5, d3
++ vst1.32 {d4[0]}, [r2]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {d0}, [r2 :64] @ Left
++ vmov.i8 d1, #128
++ vld1.8 {d2[]}, [r1] @ top[0] replicated to every lane
++ vld1.8 {d3}, [r1 :64] @ Top
++ vdup.8 d4, ip
++ vhsub.u8 d4, d0, d4 @ (left[y] - top-left) >> 1
++ veor d2, d1 @ bias by 0x80 so the signed saturating add clips to the u8 range
++ vmov.i64 d0, #0xff @ mask selecting byte 0 (first column)
++ mov r1, #8 @ row counter
++ vqadd.s8 d2, d4, d2
++ veor d1, d2, d1 @ unbias - d1 holds the filtered first column, one byte per row
++1:
++ vbit d3, d1, d0 @ insert the first-column value for this row
++ vshr.u64 d1, #8 @ move the value for the next row down to byte 0
++ vst1.8 {d3}, [r0 :64], r3
++ subs r1, #2 @ two rows per iteration
++ vbit d3, d1, d0
++ vshr.u64 d1, #8
++ vst1.8 {d3}, [r0 :64], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {q0}, [r2 :128] @ Left
++ vdup.8 q1, ip
++ vld1.8 {d4[],d5[]}, [r1] @ top[0] replicated to every lane
++ vhsub.u8 q0, q1 @ (left[y] - top-left) >> 1
++ vmov.i8 q1, #128
++ veor q2, q1 @ bias by 0x80 so the signed saturating add clips to the u8 range
++ vmov.i64 d16, #0xff @ mask selecting byte 0 (first column)
++ vqadd.s8 q0, q2
++ vld1.8 {q3}, [r1 :128] @ Top
++ mov r1, #16 @ row counter
++ veor q0, q1 @ unbias - q0 holds the filtered first column, one byte per row
++ vmov q1, q3 @ q1 and q3 both hold top - q1 is used for even rows, q3 for odd
++ vext.8 q2, q0, q0, #1 @ q0 rotated by one byte so that byte 0 is the row 1 value
++1:
++ vbit d2, d0, d16 @ even row - first-column value into column 0
++ vbit d6, d4, d16 @ odd row
++ vext.8 q0, q0, q0, #2 @ rotate by two bytes to reach the next even row
++ subs r1, #2 @ two rows per iteration
++ vst1.8 {q1}, [r0 :128], r3
++ vext.8 q2, q2, q2, #2 @ next odd row
++ vst1.8 {q3}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
++ vld1.8 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3 @ r2 -> second output row
++ lsl r3, #1 @ r0 and r2 each step two rows per store
++ mov r1, #16 @ 16 iterations of two rows - no edge filter is applied at this size
++1:
++ vst1.8 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.8 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++ vld1.16 {d0 }, [r1 :64] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = r0 + 2*r3
++ lsl r3, #2 @ r3 *= 4 - the four stores land at 0, 2, 4 and 6 times the incoming r3
++
++ vst1.16 {d0 }, [r0 :64], r3
++ vst1.16 {d0 }, [r2 :64], r3
++ vst1.16 {d0 }, [r0 :64]
++ vst1.16 {d0 }, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = r0 + 2*r3
++ lsl r3, #2 @ r3 *= 4 - r0 and r2 interleave, giving a store pitch of twice the incoming r3
++ mov r1, #4
++1:
++ vst1.16 {q0 }, [r0 :128], r3
++ subs r1, #2 @ two passes of four stores = 8 rows
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #1 @ r2 = r0 + 2*r3
++ lsl r3, #2 @ r3 *= 4 - r0 and r2 interleave, giving a store pitch of twice the incoming r3
++ mov r1, #8 @ 8 iterations of two 32-byte rows
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.32 {d0[0]}, [r1 :32] @ Top
++ add r1, r2, #3 @ r1 -> left[3]
++ vld1.8 {d1[]}, [r2]! @ left[0] replicated to every lane
++ vdup.8 d2, ip
++ vmov.i8 d3, #128
++ vhsub.u8 d0, d2 @ (top[x] - top-left) >> 1
++ veor d1, d3 @ bias by 0x80 so the signed saturating add clips to the u8 range
++ vld1.8 {d2[]}, [r2]! @ left[1]
++ add ip, r0, r3 @ ip -> second output row
++ vqadd.s8 d0, d0, d1
++ lsl r3, #1 @ r0 and ip each step two rows per store
++ vld1.8 {d1[]}, [r2] @ left[2]
++ vld1.8 {d4[]}, [r1] @ left[3]
++ veor d0, d3 @ unbias - d0 is the filtered first row
++ vst1.32 {d0[0]}, [r0 :32], r3
++ vst1.32 {d2[0]}, [ip :32], r3
++ vst1.32 {d1[0]}, [r0 :32]
++ vst1.32 {d4[0]}, [ip :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {d0}, [r1 :64] @ Top
++ vmov.i8 d1, #128
++ vld1.8 {d2[]}, [r2]! @ left[0] replicated to every lane
++ mov r1, #8-2 @ loop covers rows 1..6 - row 0 and row 7 are stored outside it
++ vdup.8 d3, ip
++ vhsub.u8 d0, d3 @ (top[x] - top-left) >> 1
++ veor d2, d1 @ bias by 0x80 so the signed saturating add clips to the u8 range
++ vqadd.s8 d0, d2
++ vld1.8 {d2[]}, [r2]! @ left[1]
++ veor d0, d1 @ unbias - d0 is the filtered first row
++ vst1.8 {d0}, [r0], r3
++1:
++ vld1.8 {d0[]}, [r2]! @ next even-row value, loaded one store ahead
++ subs r1, #2
++ vst1.8 {d2}, [r0 :64], r3
++ vld1.8 {d2[]}, [r2]! @ next odd-row value
++ vst1.8 {d0}, [r0 :64], r3
++ bne 1b
++
++ vst1.8 {d2}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++ ldrb ip, [r2, #-1] @ Top-left
++ vld1.8 {q0}, [r1 :64] @ Top
++ mov r1, #16-2 @ loop covers rows 1..14 - row 0 and row 15 are stored outside it
++ vld1.8 {d4[],d5[]}, [r2]! @ left[0] replicated to every lane
++ vdup.8 q3, ip
++ vhsub.u8 q0, q3 @ (top[x] - top-left) >> 1
++ vmov.i8 q1, #128
++ veor q2, q1 @ bias by 0x80 so the signed saturating add clips to the u8 range
++ vqadd.s8 q0, q2
++ vld1.8 {d4[],d5[]}, [r2]! @ left[1]
++ veor q0, q1 @ unbias - q0 is the filtered first row
++ vst1.8 {q0}, [r0], r3
++1:
++ vld1.8 {d0[],d1[]}, [r2]! @ next even-row value, loaded one store ahead
++ subs r1, #2
++ vst1.8 {q2}, [r0 :64], r3
++ vld1.8 {d4[],d5[]}, [r2]! @ next odd-row value
++ vst1.8 {q0}, [r0 :64], r3
++ bne 1b
++
++ vst1.8 {q2}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
++ vld1.8 {d0[],d1[]}, [r2]! @ left[0] replicated to every lane
++ add ip, r0, #16 @ ip -> second 16-byte half of each row
++ mov r1, #32-2 @ loop covers rows 1..30 - row 0 and row 31 are stored outside it
++ vld1.8 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.8 {q0}, [r0 :128], r3
++ vst1.8 {q0}, [ip :128], r3
++1:
++ vld1.8 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.8 {q1}, [r0 :128], r3
++ vst1.8 {q1}, [ip :128], r3
++ vld1.8 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.8 {q0}, [r0 :128], r3
++ vst1.8 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.8 {q1}, [r0 :128]
++ vst1.8 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
++ add r1, r2, #2 @ r1 -> second 16-bit left element
++ vld1.16 {d0[]}, [r2] @ left[0] (16-bit element) replicated
++ add r2, #4
++ vld1.16 {d1[]}, [r1] @ left[1]
++ add r1, #4
++ vld1.16 {d2[]}, [r2] @ left[2]
++A add r2, r0, r3, lsl #1
++T lsl r3, #1
++T add r2, r0, r3
++ vld1.16 {d3[]}, [r1] @ left[3]
++A lsl r3, #2
++T lsl r3, #1
++ vst1.16 {d0}, [r0 :64], r3 @ both the A and the T sequences leave r2 = r0 + 2*r3_in and r3 = 4*r3_in
++ vst1.16 {d1}, [r2 :64], r3
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d3}, [r2 :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] (16-bit element) replicated
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ mov r1, #8-2 @ loop covers rows 1..6 - row 0 and row 7 are stored outside it
++ vst1.16 {q0}, [r0 :64], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ next even-row value, loaded one store ahead
++ subs r1, #2
++ vst1.16 {q1}, [r0 :64], r3
++ vld1.16 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.16 {q0}, [r0 :64], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :64]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] (16-bit element) replicated
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ add ip, r0, #16 @ ip -> second 16-byte half of each row
++ mov r1, #16-2 @ loop covers rows 1..14 - row 0 and row 15 are stored outside it
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.16 {q1}, [r0 :128], r3
++ vst1.16 {q1}, [ip :128], r3
++ vld1.16 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :128]
++ vst1.16 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ 10 Bit
++@ Has clipping constants so 10-bit only but could easily be macroed up to
++@ 14-bit before we run out of bits
++
++
++@ ff_hevc_rpi_pred_vertical_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {d0}, [r2 :64] @ Left
++ vmov.i16 d2, #0 @ lower clip bound
++ vld1.16 {d1[]}, [r1] @ top[0] replicated to every lane
++T lsl r3, #1
++ vdup.16 d4, ip
++ vmov.i16 d3, #0x3ff @ upper clip bound (10-bit maximum)
++ vld1.16 {d5}, [r1 :64] @ Top
++ vhsub.u16 d4, d0, d4 @ (left[y] - top-left) >> 1
++ vmov.i64 d0, #0xffff @ mask selecting lane 0 (first column)
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.i16 d1, d1, d4
++ vmov d6, d5 @ second copy of top for the rows written via r2
++ vmax.s16 d1, d1, d2 @ clip below at 0
++ vmin.s16 d2, d1, d3 @ clip above at 0x3ff (two copies - one per output pointer)
++ vmin.s16 d1, d1, d3
++ vbit d5, d1, d0 @ row 0 - filtered value into column 0
++A lsl r3, #2
++T lsl r3, #1
++ vshr.u64 d2, #16 @ row 1 value to lane 0
++ vshr.u64 d1, #32 @ row 2 value to lane 0
++ vbit d6, d2, d0 @ row 1
++ vst1.16 {d5}, [r0], r3
++ vshr.u64 d2, #32 @ row 3 value to lane 0
++ vst1.16 {d6}, [r2], r3
++ vbit d5, d1, d0 @ row 2
++ vst1.16 {d5}, [r0]
++ vbit d6, d2, d0 @ row 3
++ vst1.16 {d6}, [r2]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0}, [r2 :128] @ Left
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vdup.16 q1, ip
++ vld1.16 {d4[],d5[]}, [r1] @ top[0] replicated to every lane
++ vhsub.u16 q0, q0, q1 @ (left[y] - top-left) >> 1
++ vmov.i16 q1, #0 @ lower clip bound
++ vadd.i16 q0, q2
++ vmov.i16 q2, #0x3ff @ upper clip bound (10-bit maximum)
++ vld1.16 {q3}, [r1 :128] @ Top
++ mov r1, #8 @ row counter
++ vmax.s16 q0, q1
++ vmov q1, q3 @ q1 and q3 both hold top - q1 is used for even rows, q3 for odd
++ vmin.s16 q0, q2 @ q0 = clipped first-column values, one lane per row
++ vmov.i64 d16, #0xffff @ mask selecting lane 0 (first column)
++ vext.16 q2, q0, q0, #1 @ q0 rotated by one lane so that lane 0 is the row 1 value
++1:
++ vbit d2, d0, d16 @ even row - first-column value into column 0
++ vbit d6, d4, d16 @ odd row
++ vext.16 q0, q0, q0, #2 @ rotate by two lanes to reach the next even row
++ subs r1, #2 @ two rows per iteration
++ vst1.16 {q1}, [r0 :128], r3
++ vext.16 q2, q2, q2, #2 @ next odd row
++ vst1.16 {q3}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0-q1}, [r2 :128] @ Left
++T lsl r3, #1
++ vdup.16 q2, ip
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vld1.16 {d6[],d7[]}, [r1] @ top[0] replicated to every lane
++A lsl r3, #2
++T lsl r3, #1
++ vhsub.u16 q0, q2 @ (left[y] - top-left) >> 1, rows 0-7
++ vhsub.u16 q1, q2 @ same for rows 8-15
++ vadd.i16 q0, q3
++ vadd.i16 q1, q3
++ vmov.i16 q2, #0 @ lower clip bound
++ vld1.16 {q8-q9}, [r1 :128] @ Top
++ mov r1, #0 @ counter for the subs #1<<30 trick below
++ vmov.i16 q3, #0x3ff @ upper clip bound (10-bit maximum)
++ vmax.s16 q0, q2
++ vmax.s16 q1, q2
++ vmin.s16 q0, q3
++ vmin.s16 q1, q3
++ vmov q10, q8 @ q10-q11 = second copy of top for the rows written via r2
++ vmov q11, q9
++ vext.16 q2, q0, q1, #1 @ values for rows 1..8 - lane 0 is row 1
++ vext.16 q3, q1, q1, #1 @ q1 rotated by one lane - lane 0 is row 9
++ vmov.i64 d24, #0xffff @ mask selecting lane 0 (first column)
++1:
++ vbit d16, d0, d24 @ even row - first-column value into column 0
++ vbit d20, d4, d24 @ odd row
++ vext.16 q0, q0, q0, #2
++ subs r1, #1<<30 @ wraps back to zero on the 4th subtraction - 4 iterations, r1 is 0 again afterwards
++ vst1.16 {q8-q9}, [r0 :128], r3
++ vext.16 q2, q2, q2, #2
++ vst1.16 {q10-q11}, [r2 :128], r3
++ bne 1b
++1:
++ vbit d16, d2, d24 @ second half - values now come from q1 and q3
++ vbit d20, d6, d24
++ vext.16 q1, q1, q1, #2
++ subs r1, #1<<30 @ another 4 iterations
++ vst1.16 {q8-q9}, [r0 :128], r3
++ vext.16 q3, q3, q3, #2
++ vst1.16 {q10-q11}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ mov r1, #32 @ one 64-byte row per iteration
++ add r2, r0, #32 @ r2 -> second 32-byte half of each row
++1:
++ vst1.16 {q0-q1}, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2-q3}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++ vld1.16 {q0 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2 @ r2 = r0 + 4*r3
++ lsl r3, #3 @ r3 *= 8 - the four stores land at 0, 4, 8 and 12 times the incoming r3
++
++ vst1.16 {q0 }, [r0 :128], r3
++ vst1.16 {q0 }, [r2 :128], r3
++ vst1.16 {q0 }, [r0 :128]
++ vst1.16 {q0 }, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++ vld1.16 {q0, q1 }, [r1 :128] @ Up
++ add r2, r0, r3, lsl #2 @ r2 = r0 + 4*r3
++ lsl r3, #3 @ r3 *= 8 - r0 and r2 interleave, giving a store pitch of 4x the incoming r3
++ mov r1, #4 @ 4 iterations of two 32-byte rows
++1:
++ vst1.16 {q0, q1 }, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q0, q1 }, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++ vldm r1, { q0-q3 } @ Up
++ lsl r3, #2 @ r3 scaled x4 before use as the store post-increment
++ mov r1, #16 @ one 64-byte row per iteration
++ add r2, r0, #32 @ r2 -> second 32-byte half of each row
++1:
++ vst1.16 {q0-q1}, [r0 :128], r3
++ subs r1, #1
++ vst1.16 {q2-q3}, [r2 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {d0}, [r1 :64] @ Top
++ vmov.i16 d1, #0 @ lower clip bound
++ vld1.16 {d2[]}, [r2]! @ left[0] replicated to every lane
++T lsl r3, #1
++ vdup.16 d3, ip
++ vmov.i16 d4, #0x3ff @ upper clip bound (10-bit maximum)
++ vhsub.u16 d0, d3 @ (top[x] - top-left) >> 1
++A add ip, r0, r3, lsl #1
++T add ip, r0, r3
++ vld1.16 {d3[]}, [r2]! @ left[1]
++A lsl r3, #2
++T lsl r3, #1
++ vadd.i16 d0, d2
++ vld1.16 {d2[]}, [r2]! @ left[2]
++ vmax.s16 d0, d1
++ vld1.16 {d1[]}, [r2] @ left[3] - d1 is free once the vmax above has used the zero bound
++ vmin.s16 d0, d4 @ d0 = filtered and clipped first row
++ vst1.16 {d0}, [r0 :64], r3
++ vst1.16 {d3}, [ip :64], r3
++ vst1.16 {d2}, [r0 :64]
++ vst1.16 {d1}, [ip :64]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0}, [r1 :128] @ Top
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vdup.16 q1, ip
++ mov r1, #8-2 @ loop covers rows 1..6 - row 0 and row 7 are stored outside it
++ vhsub.u16 q0, q1 @ (top[x] - top-left) >> 1
++ vld1.16 {d2[],d3[]}, [r2]! @ left[0] replicated to every lane
++ vmov.i16 q2, #0 @ lower clip bound
++ vadd.i16 q0, q1
++ vmov.i16 q1, #0x3ff @ upper clip bound (10-bit maximum)
++ vmax.s16 q0, q2
++ vld1.16 {d4[],d5[]}, [r2]! @ left[1]
++ vmin.s16 q0, q1 @ q0 = filtered and clipped first row
++ vst1.16 {q0}, [r0 :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ next even-row value, loaded one store ahead
++ subs r1, #2
++ vst1.16 {q2}, [r0 :128], r3
++ vld1.16 {d4[],d5[]}, [r2]! @ next odd-row value
++ vst1.16 {q0}, [r0 :128], r3
++ bne 1b
++
++ vst1.16 {q2}, [r0 :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++ ldrh ip, [r2, #-2] @ Top-left
++ vld1.16 {q0-q1}, [r1 :128] @ Top
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vdup.16 q2, ip
++ add ip, r0, r3 @ ip -> row 1 ...
++ vhsub.u16 q0, q2 @ (top[x] - top-left) >> 1
++ add ip, #16 @ ... second 16-byte half (row 0 is written in one 32-byte store via r0)
++ vhsub.u16 q1, q2
++ mov r1, #16-2 @ loop covers rows 1..14 - row 0 and row 15 are stored outside it
++ vld1.16 {d4[],d5[]}, [r2]! @ left[0] replicated to every lane
++ vmov.i16 q3, #0 @ lower clip bound
++ vadd.u16 q0, q2
++ vadd.i16 q1, q2
++ vmov.i16 q2, #0x3ff @ upper clip bound (10-bit maximum)
++ vmax.s16 q0, q3
++ vmax.s16 q1, q3
++ vld1.16 {d6[],d7[]}, [r2]! @ left[1]
++ vmin.s16 q0, q2
++ vmin.s16 q1, q2 @ q0-q1 = filtered and clipped first row
++ vst1.16 {q0-q1}, [r0 :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.16 {q3}, [r0 :128], r3
++ vst1.16 {q3}, [ip :128], r3
++ vld1.16 {d6[],d7[]}, [r2]! @ next odd-row value
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q3}, [r0 :128]
++ vst1.16 {q3}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
++ vld1.16 {d0[],d1[]}, [r2]! @ left[0] replicated to every lane
++ add ip, r0, #16 @ r0 and ip leapfrog in 16-byte steps across each 64-byte row
++ push {lr} @ lr is used as a second post-increment register
++ mov lr, #32 @ step within a row
++ vld1.16 {d2[],d3[]}, [r2]! @ left[1]
++ lsl r3, #1
++ vst1.16 {q0}, [r0 :128], lr
++ sub r3, #32 @ r3 = step to the next row after the +32 already applied
++ vst1.16 {q0}, [ip :128], lr
++ mov r1, #32-2 @ loop covers rows 1..30 - row 0 and row 31 are stored outside it
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++1:
++ vld1.16 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.16 {q1}, [r0 :128], lr
++ vst1.16 {q1}, [ip :128], lr
++ vst1.16 {q1}, [r0 :128], r3
++ vst1.16 {q1}, [ip :128], r3
++ vld1.16 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.16 {q0}, [r0 :128], lr
++ vst1.16 {q0}, [ip :128], lr
++ vst1.16 {q0}, [r0 :128], r3
++ vst1.16 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.16 {q1}, [r0 :128], lr
++ vst1.16 {q1}, [ip :128], lr
++ vst1.16 {q1}, [r0 :128]
++ vst1.16 {q1}, [ip :128]
++ pop {pc} @ return via the lr value saved on entry
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
++ add r1, r2, #4 @ r1 -> second 32-bit left element
++ vld1.32 {d0[],d1[]}, [r2] @ left[0] (32-bit element) replicated
++ add r2, #8
++ vld1.32 {d2[],d3[]}, [r1] @ left[1]
++ add r1, #8
++ vld1.32 {d4[],d5[]}, [r2] @ left[2]
++A add r2, r0, r3, lsl #2
++T lsl r3, #2
++T add r2, r0, r3
++ vld1.32 {d6[],d7[]}, [r1] @ left[3]
++A lsl r3, #3
++T lsl r3, #1
++ vst1.32 {q0}, [r0 :128], r3 @ both the A and the T sequences leave r2 = r0 + 4*r3_in and r3 = 8*r3_in
++ vst1.32 {q1}, [r2 :128], r3
++ vst1.32 {q2}, [r0 :128]
++ vst1.32 {q3}, [r2 :128]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
++ vld1.32 {d0[],d1[]}, [r2]! @ left[0] (32-bit element) replicated
++ lsl r3, #2 @ r3 scaled x4 before use as the store post-increment
++ add ip, r0, #16 @ ip -> second 16-byte half of each row
++ mov r1, #8-2 @ loop covers rows 1..6 - row 0 and row 7 are stored outside it
++ vld1.32 {d2[],d3[]}, [r2]! @ left[1]
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.32 {q1}, [r0 :128], r3
++ vst1.32 {q1}, [ip :128], r3
++ vld1.32 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.32 {q1}, [r0 :128]
++ vst1.32 {q1}, [ip :128]
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
++ vld1.32 {d0[],d1[]}, [r2]! @ left[0] (32-bit element) replicated
++ add ip, r0, #16 @ r0 and ip leapfrog in 16-byte steps across each 64-byte row
++ push {lr} @ lr is used as a second post-increment register
++ mov lr, #32 @ step within a row
++ vld1.32 {d2[],d3[]}, [r2]! @ left[1]
++ lsl r3, #2
++ vst1.32 {q0}, [r0 :128], lr
++ sub r3, #32 @ r3 = step to the next row after the +32 already applied
++ vst1.32 {q0}, [ip :128], lr
++ mov r1, #16-2 @ loop covers rows 1..14 - row 0 and row 15 are stored outside it
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ next even-row value, loaded one row ahead
++ subs r1, #2
++ vst1.32 {q1}, [r0 :128], lr
++ vst1.32 {q1}, [ip :128], lr
++ vst1.32 {q1}, [r0 :128], r3
++ vst1.32 {q1}, [ip :128], r3
++ vld1.32 {d2[],d3[]}, [r2]! @ next odd-row value
++ vst1.32 {q0}, [r0 :128], lr
++ vst1.32 {q0}, [ip :128], lr
++ vst1.32 {q0}, [r0 :128], r3
++ vst1.32 {q0}, [ip :128], r3
++ bne 1b
++
++ vst1.32 {q1}, [r0 :128], lr
++ vst1.32 {q1}, [ip :128], lr
++ vst1.32 {q1}, [r0 :128]
++ vst1.32 {q1}, [ip :128]
++ pop {pc} @ return via the lr value saved on entry
++endfunc
++
++
++
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+@@ -0,0 +1,1043 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@ ( x + 1 ) * p[ nTbS ][ -1 ] +
++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
++
++@ All 10-bit functions would work with 9
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++
++ vld1.8 {d0}, [r1] @ Top
++ adr ip, nb_3_0_1_4
++ vld1.8 {d1}, [r2] @ Left
++ vmov.i64 d2, #0xffffffff @ mask selecting the low 4 bytes (first of the two rows packed per d reg)
++ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4}
++ add r1, r0, r3 @ r1 -> second output row
++ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3}
++ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4}
++ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4}
++ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0}
++ vshll.u8 q8, d4, #2 @ 4 * top[x]
++ lsl r3, #1 @ r0 and r1 each step two rows per store
++ vsubl.u8 q2, d5, d4 @ l4 - top[x] = per-row increment
++ vmlal.u8 q8, d0, d3 @ += (x+1) * t4
++ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0}
++ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1}
++ vshl.s16 q9, q2, #1 @ 2 * increment
++ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1}
++ vadd.i16 d16, d4 @ row 0 gets one increment
++ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2}
++ vadd.i16 d17, d18 @ row 1 gets two increments
++ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3}
++ vadd.i16 q2, q8, q9 @ rows 2 and 3 = rows 0 and 1 plus two more increments
++ vmlal.u8 q8, d0, d6 @ += (3-x) * left[y] for rows 0 and 1
++ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3}
++ vmlal.u8 q2, d0, d7 @ += (3-x) * left[y] for rows 2 and 3
++ vrshrn.i16 d0, q8, #3 @ rounding shift right by 3 and narrow to bytes
++ vst1.32 d0[0], [r0 :32], r3
++ vst1.32 d0[1], [r1 :32], r3
++ vrshrn.i16 d0, q2, #3
++ vst1.32 d0[0], [r0 :32]
++ vst1.32 d0[1], [r1 :32]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ vld1.16 {q0}, [r1 :64] @ Top
++ adr ip, nbh_3_0_1_4
++ vldr d2, [r2, #8] @ Left (lower)
++ vldr d3, [ip, #8] @ {1,2,3,4}
++T lsl r3, #1
++ vshl.s16 d4, d0, #2 @ 4 * top[x]
++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4}
++ vldr d5, [r2] @ Left (upper)
++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4}
++ vldr d6, [ip] @ {3,2,1,0}
++ vmla.i16 d4, d3, d1 @ Acc set up
++ vsub.i16 d0, d2, d0 @ Add set up
++ vmov d7, d6 @ q3 = {3,2,1,0} twice - one copy per row of a q-sized pair
++ vdup.16 d2, d5[0] @ left[0]
++ vdup.16 d3, d5[1] @ left[1]
++ vdup.16 d16, d5[2] @ left[2]
++ vadd.i16 d18, d0, d4 @ row 0 = acc + 1 * add
++ vshl.s16 d0, #1 @ x2
++ vadd.i16 d19, d0, d4 @ row 1 = acc + 2 * add
++ vdup.16 d17, d5[3] @ left[3]
++ vadd.i16 d4, d0, d18 @ row 2 = row 0 + 2 * add
++A add r1, r0, r3, lsl #1
++T add r1, r0, r3
++ vadd.i16 d5, d0, d19 @ row 3 = row 1 + 2 * add
++A lsl r3, #2
++T lsl r3, #1
++ vmla.i16 q9, q1, q3 @ rows 0 and 1 += (3-x) * left[y]
++ vmla.i16 q2, q8, q3 @ rows 2 and 3 += (3-x) * left[y]
++ vrshr.u16 q0, q9, #3 @ rounding shift right by 3
++ vst1.16 {d0}, [r0], r3
++ vrshr.u16 d2, d4, #3
++ vst1.16 {d1}, [r1], r3
++ vrshr.u16 d3, d5, #3
++ vst1.16 {d2}, [r0]
++ vst1.16 {d3}, [r1]
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++
++ vld1.8 {q0}, [r1] @ Top
++ adr ip, nb_7_0_1_8
++ vldr d2, [r2, #8] @ Left (lower)
++ mov r1, #8 @ row counter
++ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8}
++ vshll.u8 q2, d0, #3 @ 8 * top[x]
++ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8}
++ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8}
++ vldr d6, [r2] @ Left (upper)
++ vmlal.u8 q2, d3, d1 @ += (x+1) * t8
++ vsubl.u8 q0, d2, d0 @ l8 - top[x] = per-row increment
++ vldr d7, [ip] @ {7,6,5,4,3,2,1,0}
++
++@ u8 7..0 [1] d7
++@ u8 left[y] [1] d6
++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1]
++
++ vdup.8 d2, d6[0]
++ vadd.i16 q2, q0
++ vdup.8 d3, d6[1]
++ vadd.i16 q8, q2, q0
++1:
++ vmlal.u8 q2, d7, d2 @ even row += (7-x) * left[y]
++ subs r1, #2 @ two rows per iteration
++ vadd.i16 q9, q8, q0 @ acc for the next even row
++ vmlal.u8 q8, d7, d3 @ odd row += (7-x) * left[y+1]
++ vdup.8 d2, d6[2] @ left values for the next pair of rows
++ vdup.8 d3, d6[3]
++ vrshrn.i16 d20, q2, #4 @ rounding shift right by 4 and narrow to bytes
++ vshr.u64 d6, #16 @ drop the two left values just consumed
++ vmov q2, q9
++ vst1.8 {d20}, [r0], r3
++ vrshrn.i16 d20, q8, #4
++ vadd.i16 q8, q2, q0 @ acc for the next odd row
++ vst1.8 {d20}, [r0], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++
++ adr ip, nb_7_0_1_8
++ vld1.16 {q0}, [r1 :128]! @ Top (left)
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
++ add ip, r2, #16 @ ip -> left[8]
++ vld1.16 {d4[],d5[]}, [r1] @ Top (right)
++ mov r1, #8-2 @ loop stores rows 0..5 - rows 6 and 7 are stored after it
++ vshl.s16 q3, q0, #3 @ 8 * top[x]
++ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8}
++ vld1.16 {d18[],d19[]}, [ip] @ Left (lower)
++ vmla.i16 q3, q8, q2 @ Acc set up
++ vsub.i16 q0, q9, q0 @ Add set up
++ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0}
++ vadd.i16 q2, q3, q0
++
++@ u16 7..0 [1] q1
++@ u16 left[y] [1] [r2]
++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1]
++
++ vld1.16 {d6[],d7[]}, [r2]! @ left[0] - q3 is reused for left values from here on
++ vadd.i16 q8, q2, q0
++ vld1.16 {d18[],d19[]}, [r2]! @ left[1]
++ vmla.i16 q2, q1, q3 @ row 0 += (7-x) * left[0]
++ vadd.i16 q3, q8, q0 @ acc for row 2
++ vmla.i16 q8, q1, q9 @ row 1 += (7-x) * left[1]
++1:
++ vrshr.u16 q9, q2, #4 @ rounding shift right by 4
++ subs r1, #2
++ vmov q2, q3
++ vrshr.u16 q10, q8, #4
++ vld1.16 {d6[],d7[]}, [r2]! @ left value for the next even row
++ vst1.16 {q9}, [r0 :128], r3
++ vadd.i16 q8, q2, q0
++ vld1.16 {d18[],d19[]}, [r2]! @ left value for the next odd row
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++ vst1.16 {q10}, [r0 :128], r3
++ bne 1b
++
++ vrshr.u16 q9, q2, #4
++ add r3, r0 @ r3 -> last row
++ vrshr.u16 q10, q8, #4
++ vst1.16 {q9}, [r0 :128]
++ vst1.16 {q10}, [r3 :128]
++
++ bx lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++ .balign 64
++
++nb_31_0_1_32:
++ .byte 31, 30, 29, 28, 27, 26, 25, 24
++ .byte 23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16:
++ .byte 15, 14, 13, 12, 11, 10, 9, 8
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++ .byte 9, 10, 11, 12, 13, 14, 15, 16
++ .byte 17, 18, 19, 20, 21, 22, 23, 24
++ .byte 25, 26, 27, 28, 29, 30, 31, 32
++
++ @ should be back on a 64-byte boundary here
++
++ @ These could be extracted from the above array, but are separated
++ @ out for better (16 byte) alignment
++nb_3_0_1_4:
++ .byte 3, 2, 1, 0, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 1, 2, 3, 4
++nb_7_0_1_8:
++ .byte 7, 6, 5, 4, 3, 2, 1, 0
++ .byte 1, 2, 3, 4, 5, 6, 7, 8
++nbh_3_0_1_4:
++ .short 3, 2, 1, 0, 1, 2, 3, 4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++
++ adr ip, nb_15_0_1_16 + 16
++ vld1.8 {q0}, [r1 :128]! @ Top (left)
++ add r2, #16 @ r2 -> left[16]
++ vld1.8 {q1}, [ip: 128] @ {1,2,3...16}
++ vld1.8 {d4[]}, [r1] @ Top (right)
++ sub ip, #16 @ ip -> {15,14,13...0}
++ vshll.u8 q3, d0, #4 @ 16 * top[x]
++ mov r1, #16 @ row counter
++ vshll.u8 q8, d1, #4
++ vld1.8 {d5[]}, [r2] @ Left (lower)
++ sub r2, #16 @ r2 back to left[0]
++ vmlal.u8 q3, d2, d4
++ vmlal.u8 q8, d3, d4 @ Acc set up
++ vsubl.u8 q1, d5, d0
++ vsubl.u8 q0, d5, d1 @ Add set up
++ vld1.8 {q2}, [ip :128] @ {15,14,13...0}
++
++@ u8 15..0 [1] q2
++@ u8 left[y] [1] [r2]
++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q3, q1
++ vadd.i16 q8, q0
++1:
++ vadd.i16 q10, q3, q1 @ acc for the odd row
++ subs r1, #2 @ two rows per iteration
++ vld1.8 {d18[]}, [r2]! @ left[y]
++ vadd.i16 q11, q8, q0
++ vld1.8 {d19[]}, [r2]! @ left[y+1]
++ vmlal.u8 q3, d4, d18 @ even row += (15-x) * left[y]
++ vmlal.u8 q8, d5, d18
++ vadd.i16 q12, q10, q1 @ acc for the next even row
++ vmlal.u8 q10, d4, d19 @ odd row += (15-x) * left[y+1]
++ vadd.i16 q13, q11, q0
++ vmlal.u8 q11, d5, d19
++ vrshrn.u16 d18, q3, #5 @ rounding shift right by 5 and narrow to bytes
++ vrshrn.u16 d19, q8, #5
++ vmov q3, q12
++ vst1.8 {q9}, [r0 :128], r3
++ vrshrn.u16 d18, q10, #5
++ vrshrn.u16 d19, q11, #5
++ vmov q8, q13
++ vst1.8 {q9}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr ip, nb_15_0_1_16 + 16
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ add r2, #32 @ r2 -> left[16]
++ vld1.8 {q2}, [ip :128] @ {1,2,3...16}
++ lsl r3, #1 @ r3 scaled x2 before use as the store post-increment
++ vld1.16 {d6[],d7[]}, [r1] @ Top (right)
++ sub ip, #16 @ ip -> {15,14,13...0}
++ vmovl.u8 q8, d4 @ widen {1..8} to u16
++ mov r1, #16 @ row counter
++ vshl.i16 q9, q0, #4 @ 16 * top[x]
++ vmovl.u8 q2, d5 @ widen {9..16} to u16
++ vshl.i16 q10, q1, #4
++ vld1.16 {d22[],d23[]}, [r2] @ Left (lower)
++ sub r2, #32 @ r2 back to left[0]
++ vld1.8 {q12}, [ip] @ {15,14,13...0}
++ vmla.i16 q9, q8, q3
++ vmla.i16 q10, q2, q3 @ Acc set up
++ vsub.i16 q0, q11, q0
++ vsub.i16 q1, q11, q1 @ Add set up
++ vadd.i16 q2, q9, q0
++ vadd.i16 q3, q10, q1
++ vmovl.u8 q8, d24
++ vmovl.u8 q9, d25
++
++@ u16 15..0 [2] q8,q9
++@ u16 left[y] [2] [r2]
++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++ vadd.i16 q10, q2, q0 @ acc for the odd row
++ subs r1, #2 @ two rows per iteration
++ vld1.16 {d24[],d25[]}, [r2]! @ left[y]
++ vadd.i16 q11, q3, q1
++ vld1.16 {d28[],d29[]}, [r2]! @ left[y+1]
++ vmla.i16 q2, q8, q12 @ even row += (15-x) * left[y]
++ vmla.i16 q3, q9, q12
++ vadd.i16 q12, q10, q0 @ acc for the next even row
++ vmla.i16 q10, q8, q14 @ odd row += (15-x) * left[y+1]
++ vadd.i16 q13, q11, q1
++ vmla.i16 q11, q9, q14
++ vrshr.u16 q14, q2, #5 @ rounding shift right by 5
++ vrshr.u16 q15, q3, #5
++ vmov q2, q12
++ vst1.16 {q14-q15}, [r0 :128], r3
++ vrshr.u16 q14, q10, #5
++ vrshr.u16 q15, q11, #5
++ vmov q3, q13
++ vst1.16 {q14-q15}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_8, export=1
++
++ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nb_31_0_1_32 + 32
++ vpush {d8-d12} @ d8-d12 are used below and restored by the vpop before returning
++ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32}
++ add r2, #32 @ r2 -> left[32]
++ vld1.8 {d8[]}, [r1] @ Top (right)
++ sub ip, #32 @ ip -> {31,30,29...0}
++ vshll.u8 q8, d0, #5 @ 32 * top[x]
++ mov r1, #32 @ row counter
++ vld1.8 {d9[]}, [r2] @ Left (lower)
++ sub r2, #32 @ r2 back to left[0]
++ vshll.u8 q9, d1, #5
++ vshll.u8 q10, d2, #5
++ vshll.u8 q11, d3, #5
++ vmlal.u8 q8, d4, d8
++ vsubl.u8 q12, d9, d0
++ vmlal.u8 q9, d5, d8
++ vsubl.u8 q13, d9, d1
++ vmlal.u8 q10, d6, d8
++ vsubl.u8 q14, d9, d2
++ vmlal.u8 q11, d7, d8 @ Acc set up
++ vsubl.u8 q15, d9, d3 @ Add set up
++ vadd.i16 q8, q12
++ vadd.i16 q9, q13
++ vadd.i16 q10, q14
++ vadd.i16 q11, q15
++ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0}
++
++@ u8 31..0 [2] q4,q5
++@ u8 left[y] [2] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
++
++ vld1.8 {d12[]}, [r2]!
++ vadd.i16 q0, q8, q12
++ b 2f @ enter the loop part-way through - there is no previous odd row to narrow and store yet
++1:
++ vld1.8 {d12[]}, [r2]! @ left[y] for the even row
++ vrshrn.u16 d3, q1, #6 @ narrow the odd row accumulated in q0-q3 on the previous pass
++ vrshrn.u16 d2, q0, #6
++ vadd.i16 q0, q8, q12
++ vrshrn.u16 d4, q2, #6
++ vrshrn.u16 d5, q3, #6
++ vst1.8 {q1-q2}, [r0 :128], r3
++2: vadd.i16 q1, q9, q13
++ subs r1, #2 @ two rows per iteration
++ vadd.i16 q2, q10, q14
++ vadd.i16 q3, q11, q15
++ vmlal.u8 q8, d8, d12 @ even row += (31-x) * left[y]
++ vmlal.u8 q9, d9, d12
++ vmlal.u8 q10, d10, d12
++ vmlal.u8 q11, d11, d12
++ vld1.8 {d12[]}, [r2]! @ left[y+1] for the odd row
++ vrshrn.u16 d19, q9, #6 @ rounding shift right by 6 and narrow to bytes
++ vrshrn.u16 d18, q8, #6
++ vadd.i16 q8, q0, q12
++ vrshrn.u16 d20, q10, #6
++ vrshrn.u16 d21, q11, #6
++ vst1.8 {q9-q10}, [r0 :128], r3
++ vadd.i16 q9, q1, q13
++ vadd.i16 q10, q2, q14
++ vadd.i16 q11, q3, q15
++ vmlal.u8 q0, d8, d12 @ odd row += (31-x) * left[y+1]
++ vmlal.u8 q1, d9, d12
++ vmlal.u8 q2, d10, d12
++ vmlal.u8 q3, d11, d12
++
++ bne 1b
++
++ vpop {d8-d12}
++
++ vrshrn.u16 d3, q1, #6 @ narrow and store the final odd row
++ vrshrn.u16 d2, q0, #6
++ vrshrn.u16 d4, q2, #6
++ vrshrn.u16 d5, q3, #6
++ vst1.8 {q1-q2}, [r0 :128]
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_32_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nb_31_0_1_32 + 32
++ vpush {q4-q7} @ q4-q7 are used below and restored by the vpop before returning
++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
++ add r2, #64 @ r2 -> left[32]
++ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32}
++T lsl r3, #1
++ vld1.16 {d8[],d9[]}, [r1] @ Top (right)
++ sub ip, #32 @ ip -> {31,30,29...0}
++ vmovl.u8 q12, d28 @ widen {1..32} to u16 in q12-q15
++ mov r1, #32 @ row counter
++ vmovl.u8 q13, d29
++ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0}
++ vmovl.u8 q14, d30
++ vmovl.u8 q15, d31
++ vld1.16 {d10[],d11[]}, [r2] @ Left (lower)
++ sub r2, #64 @ r2 back to left[0]
++ vshl.i16 q8, q0, #5 @ 32 * top[x]
++ vshl.i16 q9, q1, #5
++ vshl.i16 q10, q2, #5
++ vshl.i16 q11, q3, #5
++ vmla.i16 q8, q12, q4
++ vsub.i16 q0, q5, q0
++ vmla.i16 q9, q13, q4
++ vsub.i16 q1, q5, q1
++ vmla.i16 q10, q14, q4
++ vmov.u16 ip, d0[0] @ keep lane 0 of the add vector - d0[0] doubles as scratch for left[y] in the loop
++ vsub.i16 q2, q5, q2
++ vmla.i16 q11, q15, q4 @ Acc set up
++ vsub.i16 q3, q5, q3 @ Add set up
++ vadd.i16 q8, q0
++ vadd.i16 q9, q1
++ vadd.i16 q10, q2
++ vadd.i16 q11, q3
++ vmovl.u8 q4, d12 @ widen {31..0} to u16 in q4-q7
++ vmovl.u8 q5, d13
++ vmovl.u8 q6, d14
++ vmovl.u8 q7, d15
++
++@ u16 31..0 [4] q4-q7
++@ u16 left[y] [4] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q12, q8, q0 @ acc for row 1
++A sub r0, r0, r3, lsl #1
++T sub r0, r3
++1:
++ vld1.16 {d0[0]}, [r2]! @ left[y] into the scratch lane
++A add r0, r0, r3, lsl #1
++T add r0, r3
++ vadd.i16 q13, q9, q1
++ subs r1, #2 @ two rows per iteration
++ vadd.i16 q14, q10, q2
++ vadd.i16 q15, q11, q3
++ vmla.i16 q8, q4, d0[0] @ even row += (31-x) * left[y]
++ vmla.i16 q9, q5, d0[0]
++ vmla.i16 q10, q6, d0[0]
++ vmla.i16 q11, q7, d0[0]
++ vmov.16 d0[0], ip @ restore lane 0 of the add vector
++ vrshr.u16 q8, #6 @ rounding shift right by 6
++ vrshr.u16 q9, #6
++ vrshr.u16 q10, #6
++ vrshr.u16 q11, #6
++ vstm r0, {q8-q11} @ one 64-byte row - r0 is advanced by the explicit adds, not by this store
++ vadd.i16 q8, q12, q0 @ acc for the next even row
++A add r0, r0, r3, lsl #1
++T add r0, r3
++ vld1.16 {d0[0]}, [r2]! @ left[y+1] into the scratch lane
++ vadd.i16 q9, q13, q1
++ vadd.i16 q10, q14, q2
++ vadd.i16 q11, q15, q3
++ vmla.i16 q12, q4, d0[0] @ odd row += (31-x) * left[y+1]
++ vmla.i16 q13, q5, d0[0]
++ vmla.i16 q14, q6, d0[0]
++ vmla.i16 q15, q7, d0[0]
++ vmov.16 d0[0], ip @ restore lane 0 of the add vector
++ vrshr.u16 q12, #6
++ vrshr.u16 q13, #6
++ vrshr.u16 q14, #6
++ vrshr.u16 q15, #6
++ vstm r0, {q12-q15}
++ vadd.i16 q12, q8, q0 @ acc for the next odd row
++ bne 1b
++
++ vpop {q4-q7}
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_8 - planar prediction: 4 rows of 4 byte pairs (presumably interleaved U,V - confirm)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
++
++ vld1.8 {q0}, [r1] @ Top
++ adr ip, nbx2_3_0_1_4
++ vldr d2, [r2, #8] @ Left (lower)
++ mov r1, #4 @ Row counter
++ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4}
++ lsl r3, #1 @ r3 *= 2 (presumably stride arrives in 2-byte pair units - confirm)
++ vshll.u8 q2, d0, #2 @ 4 * top[x]
++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4}
++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4}
++ vldr d6, [r2] @ Left (upper)
++ vmlal.u8 q2, d3, d1
++ vsubl.u8 q0, d2, d0
++ vldr d7, [ip] @ {3,3,2,2,1,1,0,0}
++
++@ u8 3..0 [1] d7
++@ u8 left[y] [1] d6
++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1]
++
++ vdup.16 d2, d6[0] @ left[0] pair replicated
++ vadd.i16 q2, q0
++ vdup.16 d3, d6[1] @ left[1] pair replicated
++ vadd.i16 q8, q2, q0
++1:
++ vmlal.u8 q2, d7, d2
++ subs r1, #2 @ Two rows per pass
++ vadd.i16 q9, q8, q0
++ vmlal.u8 q8, d7, d3
++ vdup.16 d2, d6[2] @ left[2], left[3] for the second pass
++ vdup.16 d3, d6[3]
++ vrshrn.i16 d20, q2, #3 @ Rounding shift right by 3, narrow to u8
++ vmov q2, q9
++ vst1.8 {d20}, [r0], r3
++ vrshrn.i16 d20, q8, #3
++ vadd.i16 q8, q2, q0
++ vst1.8 {d20}, [r0], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_10 - planar prediction: 4 rows of 4 u16 pairs, fully unrolled (no loop)
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
++
++ adr ip, nbx2_3_0_1_4
++ vld1.16 {q0}, [r1 :128]! @ Top (left)
++ lsl r3, #2 @ r3 *= 4 (presumably stride arrives in 4-byte pair units - confirm)
++ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
++ add ip, r2, #16
++ vld1.32 {d4[],d5[]}, [r1] @ Top (right)
++ vshl.s16 q3, q0, #2 @ 4 * top[x]
++ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4}
++ vld1.32 {d18[],d19[]}, [ip] @ Left (lower)
++ vmla.i16 q3, q8, q2 @ Acc set up
++ vsub.i16 q0, q9, q0 @ Add set up
++ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0}
++ vadd.i16 q2, q3, q0
++
++@ u16 3..0 [1] q1
++@ u32 left[y] [1] [r2]
++@ u16 acc [1] q2 = (x+1)*p[nTbS][-1] + 4*p[x][-1] + add (q3 is reused for left[y] next)
++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1]
++
++ vld1.32 {d6[],d7[]}, [r2]! @ left[0] pair replicated
++ vadd.i16 q8, q2, q0
++ vld1.32 {d18[],d19[]}, [r2]! @ left[1] pair replicated
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++
++ vrshr.u16 q9, q2, #3 @ Rounding shift right by 3
++ vmov q2, q3
++ vrshr.u16 q10, q8, #3
++ vld1.32 {d6[],d7[]}, [r2]! @ left[2]
++ vst1.16 {q9}, [r0 :128], r3 @ Row 0
++ vadd.i16 q8, q2, q0
++ vld1.32 {d18[],d19[]}, [r2]! @ left[3]
++ vmla.i16 q2, q1, q3
++ vadd.i16 q3, q8, q0
++ vmla.i16 q8, q1, q9
++ vst1.16 {q10}, [r0 :128], r3 @ Row 1
++
++ vrshr.u16 q9, q2, #3
++ add r3, r0 @ r3 -> row 3 (r0 now points at row 2)
++ vrshr.u16 q10, q8, #3
++ vst1.16 {q9}, [r0 :128] @ Row 2
++ vst1.16 {q10}, [r3 :128] @ Row 3
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_8 - planar prediction: 8 rows of 8 byte pairs, two rows per loop pass
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
++
++ adr ip, nbx2_7_0_1_8 + 16
++ vld1.8 {q0}, [r1 :128]! @ Top (left)
++ add r2, #16
++ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8}
++ lsl r3, #1 @ r3 *= 2 (presumably stride arrives in 2-byte pair units - confirm)
++ vld1.16 {d4[]}, [r1] @ Top (right)
++ sub ip, #16
++ vshll.u8 q3, d0, #3 @ 8 * top[x]
++ mov r1, #8 @ Row counter
++ vshll.u8 q8, d1, #3
++ vld1.16 {d5[]}, [r2] @ Left (lower)
++ sub r2, #16
++ vmlal.u8 q3, d2, d4
++ vmlal.u8 q8, d3, d4 @ Acc set up
++ vsubl.u8 q1, d5, d0
++ vsubl.u8 q0, d5, d1 @ Add set up
++ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0}
++
++@ u8 7..0 [1] q2
++@ u8 left[y] [1] [r2]
++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q3, q1
++ vadd.i16 q8, q0
++1:
++ vadd.i16 q10, q3, q1 @ Acc for the odd row of this pass
++ subs r1, #2
++ vld1.16 {d18[]}, [r2]! @ left[y] pair replicated
++ vadd.i16 q11, q8, q0
++ vld1.16 {d19[]}, [r2]! @ left[y+1] pair replicated
++ vmlal.u8 q3, d4, d18
++ vmlal.u8 q8, d5, d18
++ vadd.i16 q12, q10, q1 @ Acc for the next pass, taken before left[] is folded in
++ vmlal.u8 q10, d4, d19
++ vadd.i16 q13, q11, q0
++ vmlal.u8 q11, d5, d19
++ vrshrn.u16 d18, q3, #4 @ Rounding shift right by 4, narrow to u8
++ vrshrn.u16 d19, q8, #4
++ vmov q3, q12
++ vst1.8 {q9}, [r0 :128], r3
++ vrshrn.u16 d18, q10, #4
++ vrshrn.u16 d19, q11, #4
++ vmov q8, q13
++ vst1.8 {q9}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++ .balign 64
++
++nbx2_15_0_1_16: @ 64 bytes, every index doubled: 15..0 then 1..16 (the 1..16 half starts at +32)
++ .byte 15, 15, 14, 14, 13, 13, 12, 12
++ .byte 11, 11, 10, 10, 9, 9, 8, 8
++nbx2_7_0_1_8: @ Label inside the table above (+16): 7..0 then 1..8 (the 1..8 half starts at +16)
++ .byte 7, 7, 6, 6, 5, 5, 4, 4
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++ .byte 5, 5, 6, 6, 7, 7, 8, 8
++ .byte 9, 9, 10, 10, 11, 11, 12, 12
++ .byte 13, 13, 14, 14, 15, 15, 16, 16
++
++ @ should be back on a 64-byte boundary here
++
++nbx2_3_0_1_4: @ 16 bytes: 3..0 then 1..4, every index doubled (the 1..4 half starts at +8)
++ .byte 3, 3, 2, 2, 1, 1, 0, 0
++ .byte 1, 1, 2, 2, 3, 3, 4, 4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_10 - planar prediction: 8 rows of 8 u16 pairs, two rows per loop pass
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ adr ip, nbx2_7_0_1_8 + 16
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ add r2, #32
++ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8}
++ lsl r3, #2 @ r3 *= 4 (presumably stride arrives in 4-byte pair units - confirm)
++ vld1.32 {d6[],d7[]}, [r1] @ Top (right)
++ sub ip, #16
++ vmovl.u8 q8, d4
++ mov r1, #8 @ Row counter
++ vshl.i16 q9, q0, #3 @ 8 * top[x]
++ vmovl.u8 q2, d5
++ vshl.i16 q10, q1, #3
++ vld1.32 {d22[],d23[]}, [r2] @ Left (lower)
++ sub r2, #32
++ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0}
++ vmla.i16 q9, q8, q3
++ vmla.i16 q10, q2, q3 @ Acc set up
++ vsub.i16 q0, q11, q0
++ vsub.i16 q1, q11, q1 @ Add set up
++ vadd.i16 q2, q9, q0
++ vadd.i16 q3, q10, q1
++ vmovl.u8 q8, d24
++ vmovl.u8 q9, d25
++
++@ u16 7..0 [2] q8,q9
++@ u32 left[y] [2] [r2]
++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++ vadd.i16 q10, q2, q0 @ Acc for the odd row of this pass
++ subs r1, #2
++ vld1.32 {d24[],d25[]}, [r2]! @ left[y] pair replicated
++ vadd.i16 q11, q3, q1
++ vld1.32 {d28[],d29[]}, [r2]! @ left[y+1] pair replicated
++ vmla.i16 q2, q8, q12
++ vmla.i16 q3, q9, q12
++ vadd.i16 q12, q10, q0 @ Acc for the next pass, taken before left[] is folded in
++ vmla.i16 q10, q8, q14
++ vadd.i16 q13, q11, q1
++ vmla.i16 q11, q9, q14
++ vrshr.u16 q14, q2, #4 @ Rounding shift right by 4
++ vrshr.u16 q15, q3, #4
++ vmov q2, q12
++ vst1.16 {q14-q15}, [r0 :128], r3
++ vrshr.u16 q14, q10, #4
++ vrshr.u16 q15, q11, #4
++ vmov q3, q13
++ vst1.16 {q14-q15}, [r0 :128], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_8 - planar prediction: 16 rows of 16 byte pairs, software-pipelined loop
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
++
++ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nbx2_15_0_1_16 + 32
++ vpush {d8-d12} @ Restored by the vpop after the loop
++ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16}
++ add r2, #32
++ vld1.16 {d8[]}, [r1] @ Top (right)
++ sub ip, #32
++ vshll.u8 q8, d0, #4 @ 16 * top[x]
++ mov r1, #16 @ Row counter
++ vld1.16 {d9[]}, [r2] @ Left (lower)
++ sub r2, #32
++ vshll.u8 q9, d1, #4
++ lsl r3, #1 @ r3 *= 2 (presumably stride arrives in 2-byte pair units - confirm)
++ vshll.u8 q10, d2, #4
++ vshll.u8 q11, d3, #4
++ vmlal.u8 q8, d4, d8
++ vsubl.u8 q12, d9, d0
++ vmlal.u8 q9, d5, d8
++ vsubl.u8 q13, d9, d1
++ vmlal.u8 q10, d6, d8
++ vsubl.u8 q14, d9, d2
++ vmlal.u8 q11, d7, d8 @ Acc set up
++ vsubl.u8 q15, d9, d3 @ Add set up
++ vadd.i16 q8, q12
++ vadd.i16 q9, q13
++ vadd.i16 q10, q14
++ vadd.i16 q11, q15
++ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0}
++
++@ u8 15..0 [2] q4,q5
++@ u8 left[y] [2] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
++
++ vld1.16 {d12[]}, [r2]! @ left[0] pair replicated
++ vadd.i16 q0, q8, q12
++ b 2f @ Enter mid-loop: there is no earlier row to round and store yet
++1:
++ vld1.16 {d12[]}, [r2]! @ left[y] for the even row handled after 2:
++ vrshrn.u16 d3, q1, #5 @ Round/narrow/store the odd row left in q0-q3 by the previous pass
++ vrshrn.u16 d2, q0, #5
++ vadd.i16 q0, q8, q12
++ vrshrn.u16 d4, q2, #5
++ vrshrn.u16 d5, q3, #5
++ vst1.8 {q1-q2}, [r0 :128], r3
++2: vadd.i16 q1, q9, q13
++ subs r1, #2
++ vadd.i16 q2, q10, q14
++ vadd.i16 q3, q11, q15
++ vmlal.u8 q8, d8, d12
++ vmlal.u8 q9, d9, d12
++ vmlal.u8 q10, d10, d12
++ vmlal.u8 q11, d11, d12
++ vld1.16 {d12[]}, [r2]! @ left[y+1] for the odd row
++ vrshrn.u16 d19, q9, #5 @ Rounding shift right by 5, narrow to u8
++ vrshrn.u16 d18, q8, #5
++ vadd.i16 q8, q0, q12
++ vrshrn.u16 d20, q10, #5
++ vrshrn.u16 d21, q11, #5
++ vst1.8 {q9-q10}, [r0 :128], r3 @ Even row
++ vadd.i16 q9, q1, q13
++ vadd.i16 q10, q2, q14
++ vadd.i16 q11, q3, q15
++ vmlal.u8 q0, d8, d12
++ vmlal.u8 q1, d9, d12
++ vmlal.u8 q2, d10, d12
++ vmlal.u8 q3, d11, d12
++
++ bne 1b
++
++ vpop {d8-d12}
++
++ vrshrn.u16 d3, q1, #5 @ Pipeline drain: round/narrow/store the final odd row
++ vrshrn.u16 d2, q0, #5
++ vrshrn.u16 d4, q2, #5
++ vrshrn.u16 d5, q3, #5
++ vst1.8 {q1-q2}, [r0 :128]
++
++ bx lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_10 - planar prediction: 16 rows of 16 u16 pairs, two rows per loop pass
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
++
++ @ Load from bytes & expand later - at the very least this uses less
++ @ memory than having a short table
++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
++ adr ip, nbx2_15_0_1_16 + 32
++ vpush {q4-q7}
++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
++ add r2, #64
++ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
++T lsl r3, #2 @ T/A lines: presumably Thumb-only / ARM-only variants (macros defined outside this chunk - confirm)
++ vld1.32 {d8[],d9[]}, [r1] @ Top (right)
++ sub ip, #32
++ vmovl.u8 q12, d28
++ mov r1, #16 @ Row counter
++ vmovl.u8 q13, d29
++ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0}
++ vmovl.u8 q14, d30
++ vmovl.u8 q15, d31
++ vld1.32 {d10[],d11[]}, [r2] @ Left (lower)
++ sub r2, #64
++ vshl.i16 q8, q0, #4
++ vshl.i16 q9, q1, #4
++ vshl.i16 q10, q2, #4
++ vshl.i16 q11, q3, #4
++ vmla.i16 q8, q12, q4
++ vsub.i16 q0, q5, q0
++ vmla.i16 q9, q13, q4
++ vpush {q0} @ Spill add[0]: q0 is reused for left[y] in the loop and reloaded from [sp]
++ vsub.i16 q1, q5, q1
++ vmla.i16 q10, q14, q4
++ vsub.i16 q2, q5, q2
++ vmla.i16 q11, q15, q4 @ Acc set up
++ vsub.i16 q3, q5, q3 @ Add set up
++ vadd.i16 q8, q0
++ vadd.i16 q9, q1
++ vadd.i16 q10, q2
++ vadd.i16 q11, q3
++ vmovl.u8 q4, d12
++ vmovl.u8 q5, d13
++ vmovl.u8 q6, d14
++ vmovl.u8 q7, d15
++
++@ u16 15..0 [4] q4-q7
++@ u32 left[y] [4] [r2]
++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
++
++ vadd.i16 q12, q8, q0
++A sub r0, r0, r3, lsl #2 @ Pre-bias r0 by one row: the loop advances it before each store
++T sub r0, r3
++1:
++ vld1.32 {d0[],d1[]}, [r2]! @ left[y] pair replicated (overwrites add[0])
++A add r0, r0, r3, lsl #2
++T add r0, r3
++ vadd.i16 q13, q9, q1
++ subs r1, #2
++ vadd.i16 q14, q10, q2
++ vadd.i16 q15, q11, q3
++ vmla.i16 q8, q4, q0
++ vmla.i16 q9, q5, q0
++ vmla.i16 q10, q6, q0
++ vmla.i16 q11, q7, q0
++ vld1.16 {q0}, [sp] @ Reload add[0] from the spill slot
++ vrshr.u16 q8, #5 @ Rounding shift right by 5
++ vrshr.u16 q9, #5
++ vrshr.u16 q10, #5
++ vrshr.u16 q11, #5
++ vstm r0, {q8-q11} @ One row: 4 x q = 16 x u16 pairs
++ vadd.i16 q8, q12, q0
++A add r0, r0, r3, lsl #2
++T add r0, r3
++ vld1.32 {d0[],d1[]}, [r2]! @ left[y+1] pair replicated
++ vadd.i16 q9, q13, q1
++ vadd.i16 q10, q14, q2
++ vadd.i16 q11, q15, q3
++ vmla.i16 q12, q4, q0
++ vmla.i16 q13, q5, q0
++ vmla.i16 q14, q6, q0
++ vmla.i16 q15, q7, q0
++ vld1.16 {q0}, [sp] @ Reload add[0] again
++ vrshr.u16 q12, #5
++ vrshr.u16 q13, #5
++ vrshr.u16 q14, #5
++ vrshr.u16 q15, #5
++ vstm r0, {q12-q15} @ Second row of this pass
++ vadd.i16 q12, q8, q0
++ bne 1b
++
++ vpop {q3-q7} @ One pop undoes both pushes: q3 receives the spilled q0 slot (discarded), q4-q7 are restored
++ bx lr
++
++endfunc
+--- a/libavcodec/arm/vc1dsp_init_neon.c
++++ b/libavcodec/arm/vc1dsp_init_neon.c
+@@ -19,6 +19,7 @@
+ #include <stdint.h>
+
+ #include "libavutil/attributes.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ #include "vc1dsp.h"
+
+@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_
+ void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
++
+ void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int rnd);
+
+@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++ /* Dealing with starting and stopping, and removing escape bytes, are
++ * comparatively less time-sensitive, so are more clearly expressed using
++ * a C wrapper around the assembly inner loop. Note that we assume a
++ * little-endian machine that supports unaligned loads. */
++ int dsize = 0; /* bytes written to dst so far; this is the return value */
++ while (size >= 4) /* the escape test below reads 4 bytes at src */
++ {
++ int found = 0;
++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-copy until dst is 8-byte aligned or an escape is seen */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by a byte <= 3 */
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ if (!found)
++ {
++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper's return value is used as the count of input bytes it left unprocessed */
++ dst += skip;
++ src += skip;
++ size -= skip;
++ dsize += skip;
++ while (!found && size >= 4) /* byte-copy from where the helper stopped, up to the next escape */
++ {
++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same escape test as above */
++ if (!found)
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ }
++ }
++ if (found) /* src points at 00 00 03: keep the two zero bytes, drop the 03 */
++ {
++ *dst++ = *src++;
++ *dst++ = *src++;
++ ++src;
++ size -= 3;
++ dsize += 2;
++ }
++ }
++ while (size > 0) /* fewer than 4 bytes left: too short for the escape test, copy verbatim */
++ {
++ *dst++ = *src++;
++ --size;
++ ++dsize;
++ }
++ return dsize;
++}
++
+ #define FN_ASSIGN(X, Y) \
+ dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+ dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon;
++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+ dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
+ FN_ASSIGN(1, 0);
+ FN_ASSIGN(2, 0);
+@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPC
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+ dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+ dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+--- a/libavcodec/arm/vc1dsp_neon.S
++++ b/libavcodec/arm/vc1dsp_neon.S
+@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, e
+ vst1.32 {d1[1]}, [r0,:32]
+ bx lr
+ endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++ sub r3, r0, r1, lsl #2 @ r3 -> P1 row, four rows above r0
++ vldr d0, .Lcoeffs @ d0[0] = 2, d0[1] = 5
++ vld1.32 {d1[0]}, [r0], r1 @ P5
++ vld1.32 {d2[0]}, [r3], r1 @ P1
++ vld1.32 {d3[0]}, [r3], r1 @ P2
++ vld1.32 {d4[0]}, [r0], r1 @ P6
++ vld1.32 {d5[0]}, [r3], r1 @ P3
++ vld1.32 {d6[0]}, [r0], r1 @ P7
++ vld1.32 {d7[0]}, [r3] @ P4
++ vld1.32 {d16[0]}, [r0] @ P8
++ vshll.u8 q9, d1, #1 @ 2*P5
++ vdup.16 d17, r2 @ pq
++ vshll.u8 q10, d2, #1 @ 2*P1
++ vmovl.u8 q11, d3 @ P2
++ vmovl.u8 q1, d4 @ P6
++ vmovl.u8 q12, d5 @ P3
++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q11, d6 @ P7
++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6
++ vshll.u8 q2, d5, #1 @ 2*P3
++ vmovl.u8 q3, d7 @ P4
++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q11, d16 @ P8
++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3
++ vmovl.u8 q12, d1 @ P5
++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4
++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vsub.i16 d1, d6, d24 @ P4-P5
++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5
++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vabs.s16 d2, d1 @ |P4-P5|
++ vrshr.s16 d3, d18, #3
++ vrshr.s16 d5, d20, #3
++ vshr.s16 d2, d2, #1 @ clip
++ vrshr.s16 d4, d4, #3
++ vabs.s16 d3, d3 @ a2
++ vshr.s16 d1, d1, #8 @ clip_sign
++ vabs.s16 d5, d5 @ a1
++ vceq.i16 d7, d2, #0 @ test clip == 0
++ vabs.s16 d16, d4 @ a0
++ vshr.s16 d4, d4, #8 @ a0_sign
++ vcge.s16 d18, d5, d3 @ test a1 >= a2
++ vcge.s16 d17, d16, d17 @ test a0 >= pq
++ vbsl d18, d3, d5 @ a3
++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign
++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq
++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 d5, d18, d16 @ test a3 >= a0
++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r0, d4[1] @ move to gp reg
++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vcge.s16 d4, d0, d2 @ test d >= clip
++ tst r0, #1 @ bit 0 of 16-bit lane 2 = "do not filter" mask of the third pixel pair
++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
++ vbsl d4, d2, d0 @ FFMIN(d, clip)
++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vqmovun.s16 d0, q3 @ saturate P4 back to u8
++ vqmovun.s16 d1, q12 @ saturate P5 back to u8
++ vst1.32 {d0[0]}, [r3], r1 @ r3 was left pointing at the P4 row
++ vst1.32 {d1[0]}, [r3] @ P5 row
++1: bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ d0[0] = 2, d0[1] = 5
++ vld1.32 {d2}, [r3], r1 @ 8 bytes per row: P1..P8 of that row
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d4}, [r3], r1
++ vld1.32 {d3}, [r3], r1
++ vld1.32 {d5}, [r3]
++ vdup.16 d1, r2 @ pq
++ vtrn.8 q1, q2 @ transpose so each P column ends up in its own half-register
++ vtrn.16 d2, d3 @ P1, P5, P3, P7
++ vtrn.16 d4, d5 @ P2, P6, P4, P8
++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5
++ vmovl.u8 q8, d4 @ P2, P6
++ vmovl.u8 q9, d3 @ P3, P7
++ vmovl.u8 q2, d5 @ P4, P8
++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6
++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7
++ vmovl.u8 q1, d2 @ P1, P5
++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later
++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4
++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5
++ vsub.i16 d3, d4, d2 @ P4-P5
++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vrshr.s16 q3, q3, #3
++ vabs.s16 d5, d3 @ |P4-P5|
++ vshr.s16 d3, d3, #8 @ clip_sign
++ vrshr.s16 d16, d20, #3
++ vabs.s16 q3, q3 @ a1, a2
++ vshr.s16 d5, d5, #1 @ clip
++ vabs.s16 d17, d16 @ a0
++ vceq.i16 d18, d5, #0 @ test clip == 0
++ vshr.s16 d16, d16, #8 @ a0_sign
++ vcge.s16 d19, d6, d7 @ test a1 >= a2
++ vcge.s16 d1, d17, d1 @ test a0 >= pq
++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign
++ vbsl d19, d7, d6 @ a3
++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq
++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 d6, d19, d17 @ test a3 >= a0
++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r2, d3[1] @ move to gp reg
++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vcge.s16 d3, d0, d5 @ test d >= clip
++ tst r2, #1 @ bit 0 of 16-bit lane 2 = "do not filter" mask of the third pixel pair
++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered
++ vbsl d3, d5, d0 @ FFMIN(d, clip)
++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vqmovun.s16 d1, q1 @ saturate P5 back to u8
++ vqmovun.s16 d0, q2 @ saturate P4 back to u8
++ vst2.8 {d0[0], d1[0]}, [r0], r1 @ write the P4,P5 byte pair of each row
++ vst2.8 {d0[1], d1[1]}, [r0], r1
++ vst2.8 {d0[2], d1[2]}, [r0], r1
++ vst2.8 {d0[3], d1[3]}, [r0]
++1: bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++ sub r3, r0, r1, lsl #2 @ r3 -> P1 row, four rows above r0
++ vldr d0, .Lcoeffs @ d0[0] = 2, d0[1] = 5
++ vld1.32 {d1}, [r0 :64], r1 @ P5
++ vld1.32 {d2}, [r3 :64], r1 @ P1
++ vld1.32 {d3}, [r3 :64], r1 @ P2
++ vld1.32 {d4}, [r0 :64], r1 @ P6
++ vld1.32 {d5}, [r3 :64], r1 @ P3
++ vld1.32 {d6}, [r0 :64], r1 @ P7
++ vshll.u8 q8, d1, #1 @ 2*P5
++ vshll.u8 q9, d2, #1 @ 2*P1
++ vld1.32 {d7}, [r3 :64] @ P4
++ vmovl.u8 q1, d3 @ P2
++ vld1.32 {d20}, [r0 :64] @ P8
++ vmovl.u8 q11, d4 @ P6
++ vdup.16 q12, r2 @ pq
++ vmovl.u8 q13, d5 @ P3
++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q1, d6 @ P7
++ vshll.u8 q2, d5, #1 @ 2*P3
++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6
++ vmovl.u8 q3, d7 @ P4
++ vmovl.u8 q10, d20 @ P8
++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q1, d1 @ P5
++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3
++ vsub.i16 q13, q3, q1 @ P4-P5
++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4
++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vabs.s16 q10, q13 @ |P4-P5|
++ vshr.s16 q13, q13, #8 @ clip_sign
++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vshr.s16 q10, q10, #1 @ clip
++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5
++ vrshr.s16 q8, q8, #3
++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vceq.i16 q11, q10, #0 @ test clip == 0
++ vrshr.s16 q9, q9, #3
++ vabs.s16 q8, q8 @ a2
++ vabs.s16 q9, q9 @ a1
++ vrshr.s16 q2, q2, #3
++ vcge.s16 q14, q9, q8 @ test a1 >= a2
++ vabs.s16 q15, q2 @ a0
++ vshr.s16 q2, q2, #8 @ a0_sign
++ vbsl q14, q8, q9 @ a3
++ vcge.s16 q8, q15, q12 @ test a0 >= pq
++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign
++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q12, q14, q15 @ test a3 >= a0
++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq
++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vshl.i64 q11, q9, #16 @ per 64 bits: move lane 2 (third pair of each group of 4) up into lane 3
++ vmov.32 r0, d18[1] @ move to gp reg
++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vmov.32 r2, d19[1] @ same for the second group of 4
++ vshr.s64 q9, q11, #48 @ ...then sign-fill: all 4 lanes of each group now hold that group's third-pair mask
++ vcge.s16 q11, q0, q10 @ test d >= clip
++ vorr q8, q8, q9 @ a group of 4 is also left alone when its third pair is not filtered
++ and r0, r0, r2 @ combine the third-pair masks of both groups
++ vbsl q11, q10, q0 @ FFMIN(d, clip)
++ tst r0, #1 @ set only when both groups' third pairs are not filtered
++ bne 1f @ none of the 8 pixel pairs should be updated in this case
++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered
++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vqmovun.s16 d0, q3 @ saturate P4 back to u8
++ vqmovun.s16 d1, q1 @ saturate P5 back to u8
++ vst1.32 {d0}, [r3 :64], r1 @ r3 was left pointing at the P4 row
++ vst1.32 {d1}, [r3 :64] @ P5 row
++1: bx lr
++endfunc
++
++.align 5 @ on ARM gas the argument is a power of two, i.e. 32-byte alignment
++.Lcoeffs: @ filter tap weights, loaded into d0 by every loop-filter routine via vldr
++.quad 0x00050002 @ as 16-bit lanes: [0] = 2, [1] = 5, [2] = [3] = 0
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++ push {lr} @ lr (r14) is used as a scratch register below
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ d0[0] = 2, d0[1] = 5
++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d4}, [r3], r1
++ add r12, r0, r1, lsl #2 @ where to start writing rows 4..7
++ vld1.32 {d3}, [r3], r1
++ vld1.32 {d5}, [r3], r1
++ vld1.32 {d6}, [r3], r1
++ vld1.32 {d16}, [r3], r1
++ vld1.32 {d7}, [r3], r1
++ vld1.32 {d17}, [r3]
++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
++ vdup.16 q9, r2 @ pq
++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++ vtrn.32 d2, d6 @ P1, P5
++ vtrn.32 d4, d16 @ P2, P6
++ vtrn.32 d3, d7 @ P3, P7
++ vtrn.32 d5, d17 @ P4, P8
++ vshll.u8 q10, d2, #1 @ 2*P1
++ vshll.u8 q11, d6, #1 @ 2*P5
++ vmovl.u8 q12, d4 @ P2
++ vmovl.u8 q13, d16 @ P6
++ vmovl.u8 q14, d3 @ P3
++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2
++ vmovl.u8 q12, d7 @ P7
++ vshll.u8 q1, d3, #1 @ 2*P3
++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6
++ vmovl.u8 q2, d5 @ P4
++ vmovl.u8 q8, d17 @ P8
++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7
++ vmovl.u8 q3, d6 @ P5
++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3
++ vsub.i16 q12, q2, q3 @ P4-P5
++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4
++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8
++ vabs.s16 q8, q12 @ |P4-P5|
++ vshr.s16 q12, q12, #8 @ clip_sign
++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4
++ vshr.s16 q8, q8, #1 @ clip
++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5
++ vrshr.s16 q11, q11, #3
++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6
++ vceq.i16 q13, q8, #0 @ test clip == 0
++ vrshr.s16 q10, q10, #3
++ vabs.s16 q11, q11 @ a2
++ vabs.s16 q10, q10 @ a1
++ vrshr.s16 q1, q1, #3
++ vcge.s16 q14, q10, q11 @ test a1 >= a2
++ vabs.s16 q15, q1 @ a0
++ vshr.s16 q1, q1, #8 @ a0_sign
++ vbsl q14, q11, q10 @ a3
++ vcge.s16 q9, q15, q9 @ test a0 >= pq
++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign
++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q11, q14, q15 @ test a3 >= a0
++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq
++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0
++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0
++ vmov.32 r2, d20[1] @ move to gp reg
++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++ vmov.32 r3, d21[1] @ same for the second group of 4 rows
++ vcge.s16 q10, q0, q8 @ test d >= clip
++ and r14, r2, r3 @ combine the third-pair masks of both groups
++ vbsl q10, q8, q0 @ FFMIN(d, clip)
++ tst r14, #1 @ set only when both groups' third pairs are not filtered
++ bne 2f @ none of the 8 pixel pairs should be updated in this case
++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++ vqmovun.s16 d1, q3 @ saturate P5 back to u8
++ vqmovun.s16 d0, q2 @ saturate P4 back to u8
++ tst r2, #1 @ third-pair mask of rows 0..3
++ bne 1f @ none of the first 4 pixel pairs should be updated if so
++ vst2.8 {d0[0], d1[0]}, [r0], r1 @ write the P4,P5 byte pair of each row
++ vst2.8 {d0[1], d1[1]}, [r0], r1
++ vst2.8 {d0[2], d1[2]}, [r0], r1
++ vst2.8 {d0[3], d1[3]}, [r0]
++1: tst r3, #1
++ bne 2f @ none of the second 4 pixel pairs should be updated if so
++ vst2.8 {d0[4], d1[4]}, [r12], r1
++ vst2.8 {d0[5], d1[5]}, [r12], r1
++ vst2.8 {d0[6], d1[6]}, [r12], r1
++ vst2.8 {d0[7], d1[7]}, [r12]
++2: pop {pc}
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of lower block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++ vpush {d8-d15}
++ sub r3, r0, r1, lsl #2
++ vldr d0, .Lcoeffs
++ vld1.64 {q1}, [r0 :128], r1 @ P5
++ vld1.64 {q2}, [r3 :128], r1 @ P1
++ vld1.64 {q3}, [r3 :128], r1 @ P2
++ vld1.64 {q4}, [r0 :128], r1 @ P6
++ vld1.64 {q5}, [r3 :128], r1 @ P3
++ vld1.64 {q6}, [r0 :128], r1 @ P7
++ vshll.u8 q7, d2, #1 @ 2*P5[0..7]
++ vshll.u8 q8, d4, #1 @ 2*P1[0..7]
++ vld1.64 {q9}, [r3 :128] @ P4
++ vmovl.u8 q10, d6 @ P2[0..7]
++ vld1.64 {q11}, [r0 :128] @ P8
++ vmovl.u8 q12, d8 @ P6[0..7]
++ vdup.16 q13, r2 @ pq
++ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7]
++ vshll.u8 q10, d3, #1 @ 2*P5[8..15]
++ vmovl.u8 q3, d7 @ P2[8..15]
++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
++ vmovl.u8 q4, d9 @ P6[8..15]
++ vmovl.u8 q14, d10 @ P3[0..7]
++ vmovl.u8 q15, d12 @ P7[0..7]
++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15]
++ vshll.u8 q3, d10, #1 @ 2*P3[0..7]
++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]
++ vmovl.u8 q6, d13 @ P7[8..15]
++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ vmovl.u8 q14, d18 @ P4[0..7]
++ vmovl.u8 q9, d19 @ P4[8..15]
++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ vmovl.u8 q15, d11 @ P3[8..15]
++ vshll.u8 q5, d11, #1 @ 2*P3[8..15]
++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7]
++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ vmovl.u8 q15, d22 @ P8[0..7]
++ vmovl.u8 q11, d23 @ P8[8..15]
++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ vmovl.u8 q6, d2 @ P5[0..7]
++ vmovl.u8 q1, d3 @ P5[8..15]
++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15]
++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7]
++ vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ vrshr.s16 q8, q8, #3
++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ vrshr.s16 q7, q7, #3
++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ vabs.s16 q11, q15
++ vabs.s16 q8, q8 @ a1[0..7]
++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ vshr.s16 q15, q15, #8 @ clip_sign[0..7]
++ vrshr.s16 q2, q2, #3
++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ vabs.s16 q7, q7 @ a2[0..7]
++ vrshr.s16 q10, q10, #3
++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15]
++ vshr.s16 q11, q11, #1 @ clip[0..7]
++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7]
++ vabs.s16 q2, q2 @ a1[8..15]
++ vrshr.s16 q3, q3, #3
++ vabs.s16 q10, q10 @ a2[8..15]
++ vbsl q4, q7, q8 @ a3[0..7]
++ vabs.s16 q7, q12
++ vshr.s16 q8, q12, #8 @ clip_sign[8..15]
++ vrshr.s16 q5, q5, #3
++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15]
++ vshr.s16 q7, q7, #1 @ clip[8..15]
++ vbsl q12, q10, q2 @ a3[8..15]
++ vabs.s16 q2, q3 @ a0[0..7]
++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0
++ vshr.s16 q3, q3, #8 @ a0_sign[0..7]
++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7]
++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq
++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq
++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7]
++ vabs.s16 q4, q5 @ a0[8..15]
++ vshr.s16 q5, q5, #8 @ a0_sign[8..15]
++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq
++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15]
++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0
++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ vmov.32 r0, d4[1] @ move to gp reg
++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq
++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vmov.32 r2, d5[1]
++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15]
++ vshl.i64 q2, q2, #16
++ vcge.s16 q12, q15, q11
++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ vshr.s64 q2, q2, #48
++ and r0, r0, r2
++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7])
++ vshl.i64 q11, q4, #16
++ vmov.32 r2, d8[1]
++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ vorr q2, q10, q2
++ vmov.32 r12, d9[1]
++ vshr.s64 q4, q11, #48
++ vcge.s16 q10, q0, q7
++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vorr q4, q8, q4
++ and r2, r2, r12
++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15])
++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++ and r0, r0, r2
++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ tst r0, #1
++ bne 1f @ none of the 16 pixel pairs should be updated in this case
++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++ vqmovun.s16 d4, q14
++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++ vqmovun.s16 d0, q6
++ vqmovun.s16 d5, q9
++ vqmovun.s16 d1, q1
++ vst1.64 {q2}, [r3 :128], r1
++ vst1.64 {q0}, [r3 :128]
++1: vpop {d8-d15}
++ bx lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@ r0 -> top-left pel of right block
++@ r1 = row stride, bytes
++@ r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++ push {r4-r6,lr} @ r4-r6 and lr (r14) are used as scratch below
++ vpush {d8-d15} @ q4-q7 are used as scratch below
++ sub r3, r0, #4 @ where to start reading
++ vldr d0, .Lcoeffs @ .Lcoeffs is defined outside this excerpt; d0[0] and d0[1] serve as the 2x and 5x multipliers below
++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]...
++ sub r0, r0, #1 @ where to start writing
++ vld1.32 {d3}, [r3], r1
++ add r4, r0, r1, lsl #2 @ write pointer for rows 4..7
++ vld1.32 {d10}, [r3], r1
++ vld1.32 {d11}, [r3], r1
++ vld1.32 {d16}, [r3], r1
++ vld1.32 {d4}, [r3], r1
++ vld1.32 {d8}, [r3], r1
++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
++ vld1.32 {d14}, [r3], r1
++ vld1.32 {d5}, [r3], r1
++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
++ vld1.32 {d6}, [r3], r1
++ vld1.32 {d12}, [r3], r1
++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
++ vld1.32 {d13}, [r3], r1
++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++ vld1.32 {d1}, [r3], r1
++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
++ vld1.32 {d7}, [r3], r1
++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++ vld1.32 {d9}, [r3], r1
++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
++ vld1.32 {d15}, [r3]
++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
++ vdup.16 q9, r2 @ pq
++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7]
++ vtrn.16 d5, d12 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
++ vtrn.16 d6, d13 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7]
++ vshll.u8 q10, d2, #1 @ 2*P1[0..7]
++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7]
++ vshll.u8 q11, d16, #1 @ 2*P5[0..7]
++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7]
++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
++ vmovl.u8 q1, d3 @ P2[0..7]
++ vmovl.u8 q12, d4 @ P6[0..7]
++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15]
++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15]
++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15]
++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15]
++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]
++ vmovl.u8 q1, d10 @ P3[0..7]
++ vshll.u8 q2, d5, #1 @ 2*P1[8..15]
++ vshll.u8 q13, d1, #1 @ 2*P5[8..15]
++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7]
++ vmovl.u8 q14, d6 @ P2[8..15]
++ vmovl.u8 q3, d7 @ P6[8..15]
++ vmovl.u8 q15, d8 @ P7[0..7]
++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++ vmovl.u8 q1, d12 @ P3[8..15]
++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15]
++ vmovl.u8 q4, d9 @ P7[8..15]
++ vshll.u8 q14, d10, #1 @ 2*P3[0..7]
++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15]
++ vmovl.u8 q5, d11 @ P4[0..7]
++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++ vshll.u8 q15, d12, #1 @ 2*P3[8..15]
++ vmovl.u8 q6, d13 @ P4[8..15]
++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++ vmovl.u8 q1, d14 @ P8[0..7]
++ vmovl.u8 q7, d15 @ P8[8..15]
++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++ vmovl.u8 q4, d16 @ P5[0..7]
++ vmovl.u8 q8, d1 @ P5[8..15]
++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7]
++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15]
++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7]
++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++ vrshr.s16 q10, q10, #3
++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15]
++ vrshr.s16 q11, q11, #3
++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++ vrshr.s16 q2, q2, #3
++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++ vabs.s16 q10, q10 @ a1[0..7]
++ vrshr.s16 q13, q13, #3
++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++ vabs.s16 q3, q11 @ a2[0..7]
++ vabs.s16 q2, q2 @ a1[8..15]
++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++ vabs.s16 q11, q1 @ abs(P4[0..7]-P5[0..7])
++ vabs.s16 q12, q13 @ a2[8..15]
++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7]
++ vshr.s16 q1, q1, #8 @ clip_sign[0..7]
++ vrshr.s16 q15, q15, #3
++ vshr.s16 q11, q11, #1 @ clip[0..7]
++ vrshr.s16 q14, q14, #3
++ vbsl q13, q3, q10 @ a3[0..7]
++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8..15]
++ vabs.s16 q10, q15 @ a0[8..15]
++ vshr.s16 q15, q15, #8 @ a0_sign[8..15]
++ vbsl q3, q12, q2 @ a3[8..15]
++ vabs.s16 q2, q14 @ a0[0..7]
++ vabs.s16 q12, q7 @ abs(P4[8..15]-P5[8..15])
++ vshr.s16 q7, q7, #8 @ clip_sign[8..15]
++ vshr.s16 q14, q14, #8 @ a0_sign[0..7]
++ vshr.s16 q12, q12, #1 @ clip[8..15]
++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15]
++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15]
++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq
++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq
++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7]
++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7]
++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0
++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq
++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0
++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq
++ vcge.s16 q14, q13, q12 @ test d[8..15] >= clip[8..15]
++ vmov.32 r2, d4[1] @ move to gp reg
++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++ vmov.32 r3, d5[1] @ bit 0 = "do not filter" test result for pixel pair 6 (r2 bit 0: pair 2)
++ vcge.s16 q2, q0, q11 @ test d[0..7] >= clip[0..7]
++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15])
++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7])
++ vmov.32 r5, d6[1] @ bit 0 = "do not filter" test result for pixel pair 10
++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmov.32 r6, d7[1] @ bit 0 = "do not filter" test result for pixel pair 14
++ and r12, r2, r3
++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++ and r14, r5, r6
++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++ and r12, r12, r14 @ bit 0 set only if pairs 2, 6, 10 and 14 all tested "do not filter"
++ vqmovun.s16 d4, q6 @ saturate updated P4[8..15] to u8
++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++ tst r12, #1
++ bne 4f @ none of the 16 pixel pairs should be updated in this case
++ vqmovun.s16 d2, q5 @ saturate updated P4[0..7] to u8
++ vqmovun.s16 d3, q4 @ saturate updated P5[0..7] to u8
++ vqmovun.s16 d5, q8 @ saturate updated P5[8..15] to u8
++ tst r2, #1 @ rows 0..3 are written only if pair 2 passed its tests
++ bne 1f
++ vst2.8 {d2[0], d3[0]}, [r0], r1
++ vst2.8 {d2[1], d3[1]}, [r0], r1
++ vst2.8 {d2[2], d3[2]}, [r0], r1
++ vst2.8 {d2[3], d3[3]}, [r0]
++1: add r0, r4, r1, lsl #2 @ r0 -> row 8 (r4 still points at row 4)
++ tst r3, #1 @ rows 4..7 are written only if pair 6 passed its tests
++ bne 2f
++ vst2.8 {d2[4], d3[4]}, [r4], r1
++ vst2.8 {d2[5], d3[5]}, [r4], r1
++ vst2.8 {d2[6], d3[6]}, [r4], r1
++ vst2.8 {d2[7], d3[7]}, [r4]
++2: add r4, r0, r1, lsl #2 @ r4 -> row 12 (r0 still points at row 8)
++ tst r5, #1 @ rows 8..11 are written only if pair 10 passed its tests
++ bne 3f
++ vst2.8 {d4[0], d5[0]}, [r0], r1
++ vst2.8 {d4[1], d5[1]}, [r0], r1
++ vst2.8 {d4[2], d5[2]}, [r0], r1
++ vst2.8 {d4[3], d5[3]}, [r0]
++3: tst r6, #1 @ rows 12..15 are written only if pair 14 passed its tests
++ bne 4f
++ vst2.8 {d4[4], d5[4]}, [r4], r1
++ vst2.8 {d4[5], d5[5]}, [r4], r1
++ vst2.8 {d4[6], d5[6]}, [r4], r1
++ vst2.8 {d4[7], d5[7]}, [r4]
++4: vpop {d8-d15}
++ pop {r4-r6,pc} @ restore saved registers and return
++endfunc
++
++@ Copy at most the specified number of bytes from source to destination buffer,
++@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
++@ On entry:
++@ r0 -> source buffer
++@ r1 = max number of bytes to copy
++@ r2 -> destination buffer, optimally 8-byte aligned
++@ On exit:
++@ r0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++ @ Offset by 48 to screen out cases that are too short for us to handle,
++ @ and also make it easy to test for loop termination, or to determine
++ @ whether we need an odd number of half-iterations of the loop.
++ subs r1, r1, #48
++ bmi 90f @ too short: return with nothing copied
++
++ @ Set up useful constants
++ vmov.i32 q0, #0x3000000 @ per-word mask: low 2 bits of byte lane 3
++ vmov.i32 q1, #0x30000 @ per-word pattern: byte lanes 0,1 = 0x00, lane 2 = 0x03, lane 3 = 0x00
++
++ tst r1, #16
++ bne 1f @ bit 4 set: start with the first half-iteration (label 2)
++
++ vld1.8 {q8, q9}, [r0]! @ first block to test plus 16 bytes of look-ahead
++ vbic q12, q8, q0 @ words starting at byte offsets 0,4,8,12
++ vext.8 q13, q8, q9, #1 @ words starting at byte offsets 1,5,9,13
++ vext.8 q14, q8, q9, #2 @ words starting at byte offsets 2,6,10,14
++ vext.8 q15, q8, q9, #3 @ words starting at byte offsets 3,7,11,15
++ veor q12, q12, q1 @ zero where the word is 00 00 03 0x with x in 0..3
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0 @ all-ones in each matching word
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ add r1, r1, #16 @ only one block is copied before the first 32-byte decrement, so bias the count
++ b 3f
++
++1: vld1.8 {q10, q11}, [r0]! @ first block to test plus 16 bytes of look-ahead
++ vbic q12, q10, q0
++ vext.8 q13, q10, q11, #1
++ vext.8 q14, q10, q11, #2
++ vext.8 q15, q10, q11, #3
++ veor q12, q12, q1
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ @ Drop through...
++2: vmov q8, q11 @ previous look-ahead becomes the next block to test
++ vld1.8 {q9}, [r0]! @ fetch new look-ahead
++ vorr q13, q12, q13
++ vorr q15, q14, q15
++ vbic q12, q8, q0 @ start testing the block in q8 while q10's result is reduced
++ vorr q3, q13, q15 @ combined match mask for the block in q10
++ vext.8 q13, q8, q9, #1
++ vext.8 q14, q8, q9, #2
++ vext.8 q15, q8, q9, #3
++ veor q12, q12, q1
++ vorr d6, d6, d7 @ fold the mask to 64 bits
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ vmov r3, r12, d6 @ move to gp regs
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ orrs r3, r3, r12 @ non-zero if any byte of q10 starts an escape sequence
++ bne 90f @ stop without copying q10
++ vst1.64 {q10}, [r2]! @ block is clean: copy it
++3: vmov q10, q9 @ previous look-ahead becomes the next block to test
++ vld1.8 {q11}, [r0]! @ fetch new look-ahead
++ vorr q13, q12, q13
++ vorr q15, q14, q15
++ vbic q12, q10, q0 @ start testing the block in q10 while q8's result is reduced
++ vorr q3, q13, q15 @ combined match mask for the block in q8
++ vext.8 q13, q10, q11, #1
++ vext.8 q14, q10, q11, #2
++ vext.8 q15, q10, q11, #3
++ veor q12, q12, q1
++ vorr d6, d6, d7 @ fold the mask to 64 bits
++ vbic q13, q13, q0
++ vbic q14, q14, q0
++ vbic q15, q15, q0
++ vceq.i32 q12, q12, #0
++ vmov r3, r12, d6 @ move to gp regs
++ veor q13, q13, q1
++ veor q14, q14, q1
++ veor q15, q15, q1
++ vceq.i32 q13, q13, #0
++ vceq.i32 q14, q14, #0
++ vceq.i32 q15, q15, #0
++ orrs r3, r3, r12 @ non-zero if any byte of q8 starts an escape sequence
++ bne 91f @ stop without copying q8
++ vst1.64 {q8}, [r2]! @ block is clean: copy it
++ subs r1, r1, #32 @ account for the two 16-byte blocks of a full iteration
++ bpl 2b
++
++90: add r0, r1, #48 @ undo the initial offset to give the bytes not copied
++ bx lr
++
++91: sub r1, r1, #16 @ r1 is 16 too high when leaving from the second half-iteration
++ b 90b
++endfunc
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -2567,6 +2567,17 @@ typedef struct AVHWAccel {
+ * that avctx->hwaccel_priv_data is invalid.
+ */
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++ /**
++ * Called if parsing fails
++ *
++ * An error has occurred, end_frame will not be called
++ * start_frame & decode_slice may or may not have been called
++ * Optional
++ *
++ * @param avctx the codec context
++ */
++ void (*abort_frame)(AVCodecContext *avctx);
+ } AVHWAccel;
+
+ /**
+--- a/libavcodec/cabac.h
++++ b/libavcodec/cabac.h
+@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_table
+ typedef struct CABACContext{
+ int low;
+ int range;
+- int outstanding_count;
++ union
++ {
++ int outstanding_count;
++ struct {
++ uint16_t bits;
++ uint16_t range;
++ } by22;
++ };
+ const uint8_t *bytestream_start;
+ const uint8_t *bytestream;
+ const uint8_t *bytestream_end;
+--- a/libavcodec/codec.h
++++ b/libavcodec/codec.h
+@@ -350,6 +350,17 @@ const AVCodec *av_codec_iterate(void **o
+ AVCodec *avcodec_find_decoder(enum AVCodecID id);
+
+ /**
++ * Find a registered decoder with a matching codec ID and pix_fmt.
++ * A decoder with pix_fmts set to NULL will match any fmt.
++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmts NULL.
++ *
++ * @param id AVCodecID of the requested decoder
++ * @param fmt AVPixelFormat that must be supported by decoder
++ * @return A decoder if one was found, NULL otherwise.
++ */
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
++
++/**
+ * Find a registered decoder with the specified name.
+ *
+ * @param name name of the requested decoder
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v1.h
+@@ -0,0 +1,229 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags; /* 24 bytes of fields precede this, so it sits at offset 24 with no padding; presumably a mask of V4L2_HEVC_SPS_FLAG_* bits - confirm */
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4]; /* 52 bytes of fields precede this; the padding places flags at offset 56 */
++ __u64 flags; /* presumably a mask of V4L2_HEVC_PPS_FLAG_* bits - confirm */
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps; /* presumably one of V4L2_HEVC_DPB_ENTRY_RPS_* - confirm */
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2]; /* rounds the entry up to 16 bytes (8 + 1 + 1 + 4 + 2) */
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6]; /* 192 bytes of tables precede this; with the two fields below the struct totals 200 bytes */
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 num_active_dpb_entries;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 num_rps_poc_st_curr_before;
++ __u8 num_rps_poc_st_curr_after;
++ __u8 num_rps_poc_lt_curr;
++
++ __u8 padding; /* 71 bytes precede this; the pad byte puts the __u32 array below at offset 72 */
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags; /* presumably a mask of V4L2_HEVC_SLICE_PARAMS_FLAG_* bits - confirm */
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16]; /* 6 lists of 16 entries */
++ __u8 scaling_list_8x8[6][64]; /* 6 lists of 64 entries */
++ __u8 scaling_list_16x16[6][64]; /* 64 entries per list, not 256 - presumably the 8x8-coded form, with the DC term held in scaling_list_dc_coef_16x16; confirm */
++ __u8 scaling_list_32x32[2][64]; /* 64 entries per list, not 1024 - presumably the 8x8-coded form, with the DC term held in scaling_list_dc_coef_32x32; confirm */
++ __u8 scaling_list_dc_coef_16x16[6]; /* one value per 16x16 list */
++ __u8 scaling_list_dc_coef_32x32[2]; /* one value per 32x32 list */
++};
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v2.h
+@@ -0,0 +1,257 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags; /* 24 bytes of fields precede this, so it sits at offset 24 with no padding; presumably a mask of V4L2_HEVC_SPS_FLAG_* bits - confirm */
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4]; /* NOTE(review): 54 bytes of fields precede this, so the padding ends at offset 58, not on an 8-byte boundary; the compiler may add implicit padding before flags - confirm the layout matches the kernel header this mirrors */
++ __u64 flags; /* presumably a mask of V4L2_HEVC_PPS_FLAG_* bits - confirm */
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps; /* presumably one of V4L2_HEVC_DPB_ENTRY_RPS_* - confirm */
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2]; /* rounds the entry up to 16 bytes (8 + 1 + 1 + 4 + 2) */
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6]; /* 192 bytes of tables precede this; with the two fields below the struct totals 200 bytes */
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params { /* per-slice HEVC parameters control payload */
++ __u32 bit_size; /* size of the slice data, in bits */
++ __u32 data_bit_offset; /* offset expressed in bits in this revision of the header */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets; /* presumably the count of valid entry_point_offset_minus1[] entries - confirm */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) slots per list */
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5]; /* explicit padding bytes */
++
++ __u32 entry_point_offset_minus1[256]; /* fixed room for 256 entry point offsets */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags; /* bitmask of V4L2_HEVC_SLICE_PARAMS_FLAG_* */
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params { /* per-picture decode parameters: POC, DPB contents and POC lists */
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries; /* presumably the number of valid entries in dpb[] - confirm */
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) slots */
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags; /* bitmask of V4L2_HEVC_DECODE_PARAM_FLAG_* */
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++struct v4l2_ctrl_hevc_scaling_matrix { /* HEVC scaling lists, grouped by transform block size */
++ __u8 scaling_list_4x4[6][16]; /* 6 lists of 16 coefficients */
++ __u8 scaling_list_8x8[6][64]; /* 6 lists of 64 coefficients */
++ __u8 scaling_list_16x16[6][64]; /* 6 lists; only 64 coefficients are stored per list */
++ __u8 scaling_list_32x32[2][64]; /* 2 lists; only 64 coefficients are stored per list */
++ __u8 scaling_list_dc_coef_16x16[6]; /* one DC coefficient per 16x16 list */
++ __u8 scaling_list_dc_coef_32x32[2]; /* one DC coefficient per 32x32 list */
++};
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v3.h
+@@ -0,0 +1,255 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode { /* presumably the values of V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE - confirm */
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, /* implicit value 0 */
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, /* implicit value 1 */
++};
++
++enum v4l2_mpeg_video_hevc_start_code { /* presumably the values of V4L2_CID_MPEG_VIDEO_HEVC_START_CODE - confirm */
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, /* implicit value 0 */
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, /* implicit value 1 */
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps { /* HEVC sequence parameter set (SPS) control payload */
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags; /* bitmask of V4L2_HEVC_SPS_FLAG_*; no explicit padding field precedes it in this revision */
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps { /* HEVC picture parameter set (PPS) control payload */
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20]; /* fixed room for 20 tile-column entries */
++ __u8 row_height_minus1[22]; /* fixed room for 22 tile-row entries */
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4]; /* explicit padding bytes placed ahead of the 64-bit flags word */
++ __u64 flags; /* bitmask of V4L2_HEVC_PPS_FLAG_* */
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry { /* one decoded picture buffer (DPB) entry */
++ __u64 timestamp;
++ __u8 flags; /* V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE (0x01) is the only flag defined in this header */
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2]; /* explicit padding bytes */
++};
++
++struct v4l2_hevc_pred_weight_table { /* weighted prediction parameters for reference lists 0 and 1 */
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* list 0 tables, V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) entries each */
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; /* NOTE(review): second dimension presumably Cb/Cr - confirm */
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* list 1 tables, same shapes as list 0 */
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6]; /* explicit padding bytes; note they sit before the two denominators */
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params { /* per-slice HEVC parameters control payload */
++ __u32 bit_size; /* size of the slice data, in bits */
++ __u32 data_bit_offset; /* offset expressed in bits in this revision of the header */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets; /* presumably the count of valid entry_point_offset_minus1[] entries - confirm */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type; /* V4L2_HEVC_SLICE_TYPE_B / _P / _I */
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) slots per list */
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5]; /* explicit padding bytes */
++
++ __u32 entry_point_offset_minus1[256]; /* fixed room for 256 entry point offsets */
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags; /* bitmask of V4L2_HEVC_SLICE_PARAMS_FLAG_* */
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params { /* per-picture decode parameters: POC, DPB contents and POC lists */
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries; /* presumably the number of valid entries in dpb[] - confirm */
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) slots */
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags; /* bitmask of V4L2_HEVC_DECODE_PARAM_FLAG_* */
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix { /* HEVC scaling lists, grouped by transform block size */
++ __u8 scaling_list_4x4[6][16]; /* 6 lists of 16 coefficients */
++ __u8 scaling_list_8x8[6][64]; /* 6 lists of 64 coefficients */
++ __u8 scaling_list_16x16[6][64]; /* 6 lists; only 64 coefficients are stored per list */
++ __u8 scaling_list_32x32[2][64]; /* 2 lists; only 64 coefficients are stored per list */
++ __u8 scaling_list_dc_coef_16x16[6]; /* one DC coefficient per 16x16 list */
++ __u8 scaling_list_dc_coef_32x32[2]; /* one DC coefficient per 32x32 list */
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++#endif
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v4.h
+@@ -0,0 +1,515 @@
++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
++/*
++ * Video for Linux Two controls header file
++ *
++ * Copyright (C) 1999-2012 the contributors
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * Alternatively you can redistribute this file under the terms of the
++ * BSD license as stated below:
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in
++ * the documentation and/or other materials provided with the
++ * distribution.
++ * 3. The names of its contributors may not be used to endorse or promote
++ * products derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * The contents of this header was split off from videodev2.h. All control
++ * definitions should be added to this header, which is included by
++ * videodev2.h.
++ */
++
++#ifndef AVCODEC_HEVC_CTRLS_V4_H
++#define AVCODEC_HEVC_CTRLS_V4_H
++
++#include <linux/const.h>
++#include <linux/types.h>
++
++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403)
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404)
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405)
++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406)
++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
++
++enum v4l2_stateless_hevc_decode_mode { /* presumably the values of V4L2_CID_STATELESS_HEVC_DECODE_MODE - confirm */
++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED, /* implicit value 0 */
++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED, /* implicit value 1 */
++};
++
++enum v4l2_stateless_hevc_start_code { /* presumably the values of V4L2_CID_STATELESS_HEVC_START_CODE - confirm */
++ V4L2_STATELESS_HEVC_START_CODE_NONE, /* implicit value 0 */
++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B, /* implicit value 1 */
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/**
++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
++ *
++ * @video_parameter_set_id: specifies the value of the
++ * vps_video_parameter_set_id of the active VPS
++ * @seq_parameter_set_id: provides an identifier for the SPS for
++ * reference by other syntax elements
++ * @pic_width_in_luma_samples: specifies the width of each decoded picture
++ * in units of luma samples
++ * @pic_height_in_luma_samples: specifies the height of each decoded picture
++ * in units of luma samples
++ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the luma array
++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the chroma arrays
++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
++ * the variable MaxPicOrderCntLsb
++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
++ * required size of the decoded picture
++ * buffer for the codec video sequence
++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
++ * value of SpsMaxLatencyPictures array
++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
++ * luma coding block size
++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * coding block size
++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
++ * transform block size
++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * transform block size
++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in inter
++ * prediction mode
++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in intra
++ * prediction mode
++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
++ * bits used to represent each of PCM sample
++ * values of the luma component
++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
++ * of bits used to represent each of PCM
++ * sample values of the chroma components
++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
++ * minimum size of coding blocks
++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum size of
++ * coding blocks
++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
++ * syntax structures included in the SPS
++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
++ * reference pictures that are specified in the SPS
++ * @chroma_format_idc: specifies the chroma sampling
++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
++ * of temporal sub-layers
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_sps { /* HEVC sequence parameter set (SPS) control payload */
++ __u8 video_parameter_set_id; /* VPS/SPS identifiers lead the struct in this revision */
++ __u8 seq_parameter_set_id;
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u8 reserved[6]; /* padding; applications should zero it */
++ __u64 flags; /* bitmask of V4L2_HEVC_SPS_FLAG_* */
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++/**
++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
++ *
++ * @pic_parameter_set_id: identifies the PPS for reference by other
++ * syntax elements
++ * @num_extra_slice_header_bits: specifies the number of extra slice header
++ * bits that are present in the slice header RBSP
++ * for coded pictures referring to the PPS.
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l0_active_minus1
++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l1_active_minus1
++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
++ * each slice referring to the PPS
++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
++ * tree block size and the minimum luma coding block
++ * size of coding units that convey cu_qp_delta_abs
++ * and cu_qp_delta_sign_flag
++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
++ * partitioning the picture
++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
++ * the picture
++ * @column_width_minus1: this value plus 1 specifies the width of each tile column in
++ * units of coding tree blocks
++ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
++ * units of coding tree blocks
++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
++ * beta divided by 2
++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
++ * divided by 2
++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
++ * the variable Log2ParMrgLevel
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_PPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_pps { /* HEVC picture parameter set (PPS) control payload */
++ __u8 pic_parameter_set_id; /* PPS identifier leads the struct in this revision */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20]; /* fixed room for 20 tile-column entries */
++ __u8 row_height_minus1[22]; /* fixed room for 22 tile-row entries */
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++ __u8 reserved; /* padding; applications should zero it */
++ __u64 flags; /* bitmask of V4L2_HEVC_PPS_FLAG_* */
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++/**
++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
++ *
++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
++ * @flags: long term flag for the reference frame
++ * @field_pic: whether the reference is a field picture or a frame.
++ * @reserved: padding field. Should be zeroed by applications.
++ * @pic_order_cnt_val: the picture order count of the picture held in this DPB entry.
++ */
++struct v4l2_hevc_dpb_entry { /* one decoded picture buffer (DPB) entry */
++ __u64 timestamp; /* timestamp of the V4L2 capture buffer used as reference */
++ __u8 flags; /* V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE (0x01) marks a long-term reference */
++ __u8 field_pic; /* whether the reference is a field picture or a frame */
++ __u16 reserved; /* padding; applications should zero it */
++ __s32 pic_order_cnt_val; /* a single signed 32-bit picture order count */
++};
++
++/**
++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
++ *
++ * @delta_luma_weight_l0: the difference of the weighting factor applied
++ * to the luma prediction value for list 0
++ * @luma_offset_l0: the additive offset applied to the luma prediction value
++ * for list 0
++ * @delta_chroma_weight_l0: the difference of the weighting factor applied
++ * to the chroma prediction values for list 0
++ * @chroma_offset_l0: the difference of the additive offset applied to
++ * the chroma prediction values for list 0
++ * @delta_luma_weight_l1: the difference of the weighting factor applied
++ * to the luma prediction value for list 1
++ * @luma_offset_l1: the additive offset applied to the luma prediction value
++ * for list 1
++ * @delta_chroma_weight_l1: the difference of the weighting factor applied
++ * to the chroma prediction values for list 1
++ * @chroma_offset_l1: the difference of the additive offset applied to
++ * the chroma prediction values for list 1
++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
++ * all luma weighting factors
++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
++ * of the denominator for all chroma
++ * weighting factors
++ */
++struct v4l2_hevc_pred_weight_table { /* weighted prediction parameters for reference lists 0 and 1 */
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* list 0 tables, V4L2_HEVC_DPB_ENTRIES_NUM_MAX (16) entries each */
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; /* NOTE(review): second dimension presumably Cb/Cr - confirm */
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; /* list 1 tables, same shapes as list 0 */
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 luma_log2_weight_denom; /* no explicit padding field in this revision of the struct */
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++/**
++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
++ *
++ * This control is a dynamically sized 1-dimensional array,
++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
++ *
++ * @bit_size: size (in bits) of the current slice data
++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
++ * @num_entry_point_offsets: specifies the number of entry point offset syntax
++ * elements in the slice header.
++ * @nal_unit_type: specifies the NAL unit type from the NAL unit header (the B, P or I coding type is @slice_type)
++ * @nuh_temporal_id_plus1: this value minus 1 specifies a temporal identifier for the NAL unit
++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
++ * @colour_plane_id: specifies the colour plane associated with the current slice
++ * @slice_pic_order_cnt: specifies the picture order count
++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 0
++ * that may be used to decode the slice
++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 1
++ * that may be used to decode the slice
++ * @collocated_ref_idx: specifies the reference index of the collocated picture used
++ * for temporal motion vector prediction
++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
++ * motion vector prediction candidates supported in
++ * the slice subtracted from 5
++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
++ * blocks in the slice
++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
++ * @slice_act_y_qp_offset: screen content extension parameters
++ * @slice_act_cb_qp_offset: screen content extension parameters
++ * @slice_act_cr_qp_offset: screen content extension parameters
++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
++ * more fields
++ * @reserved0: padding field. Should be zeroed by applications.
++ * @slice_segment_addr: specifies the address of the first coding tree block in
++ * the slice segment
++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set included in the SPS
++ * @pred_weight_table: the prediction weight coefficients for inter-picture
++ * prediction
++ * @reserved1: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_byte_offset;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __s32 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ __u8 reserved0[3];
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u8 reserved1[2];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++/**
++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
++ *
++ * @pic_order_cnt_val: picture order count
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS of the first slice
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set included in the SPS of the first slice
++ * @num_active_dpb_entries: the number of entries in dpb
++ * @num_poc_st_curr_before: the number of reference pictures in the short-term
++ * set that come before the current frame
++ * @num_poc_st_curr_after: the number of reference pictures in the short-term
++ * set that come after the current frame
++ * @num_poc_lt_curr: the number of reference pictures in the long-term set
++ * @poc_st_curr_before: provides the index of the short term before references
++ * in DPB array
++ * @poc_st_curr_after: provides the index of the short term after references
++ * in DPB array
++ * @poc_lt_curr: provides the index of the long term references in DPB array
++ * @reserved: padding field. Should be zeroed by applications.
++ * @dpb: the decoded picture buffer, for meta-data about reference frames
++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++ __u8 num_active_dpb_entries;
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 reserved[4];
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++/**
++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
++ *
++ * @scaling_list_4x4: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_8x8: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_16x16: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_32x32: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ */
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+--- a/libavcodec/hevc_parser.c
++++ b/libavcodec/hevc_parser.c
+@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCod
+ avctx->profile = ps->sps->ptl.general_ptl.profile_idc;
+ avctx->level = ps->sps->ptl.general_ptl.level_idc;
+
++ if (ps->sps->chroma_format_idc == 1) { /* export chroma siting from the SPS */
++ avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
++ ps->sps->vui.chroma_sample_loc_type_top_field + 1 : /* VUI value + 1 is stored */
++ AVCHROMA_LOC_LEFT; /* used when the VUI carries no chroma location info */
++ }
++ else if (ps->sps->chroma_format_idc == 2 ||
++ ps->sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (ps->vps->vps_timing_info_present_flag) {
+ num = ps->vps->vps_num_units_in_tick;
+ den = ps->vps->vps_time_scale;
+--- a/libavcodec/hevc_refs.c
++++ b/libavcodec/hevc_refs.c
+@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContex
+ if (!frame->rpl_buf)
+ goto fail;
+
+- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+- if (!frame->tab_mvf_buf)
+- goto fail;
+- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ if (s->tab_mvf_pool) {
++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
++ if (!frame->tab_mvf_buf)
++ goto fail;
++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ }
+
+- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+- if (!frame->rpl_tab_buf)
+- goto fail;
+- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
+- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+- for (j = 0; j < frame->ctb_count; j++)
+- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ if (s->rpl_tab_pool) {
++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
++ if (!frame->rpl_tab_buf)
++ goto fail;
++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
++ for (j = 0; j < frame->ctb_count; j++)
++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ }
+
+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s
+ int ctb_count = frame->ctb_count;
+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ int i;
++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+
+ if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
+ return AVERROR_INVALIDDATA;
+
+- for (i = ctb_addr_ts; i < ctb_count; i++)
+- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
++ if (frame->rpl_tab) {
++ for (i = ctb_addr_ts; i < ctb_count; i++)
++ frame->rpl_tab[i] = tab;
++ }
+
+- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
++ frame->refPicList = tab->refPicList;
+
+ return 0;
+ }
+--- a/libavcodec/hevcdec.c
++++ b/libavcodec/hevcdec.c
+@@ -332,6 +332,19 @@ static void export_stream_params(HEVCCon
+
+ ff_set_sar(avctx, sps->vui.sar);
+
++ if (sps->chroma_format_idc == 1) { /* export chroma siting from the SPS */
++ avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
++ sps->vui.chroma_sample_loc_type_top_field + 1 : /* VUI value + 1 is stored */
++ AVCHROMA_LOC_LEFT; /* used when the VUI carries no chroma location info */
++ }
++ else if (sps->chroma_format_idc == 2 ||
++ sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (sps->vui.video_signal_type_present_flag)
+ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
+ : AVCOL_RANGE_MPEG;
+@@ -372,14 +385,20 @@ static enum AVPixelFormat get_format(HEV
+ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
+ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
+ CONFIG_HEVC_NVDEC_HWACCEL + \
++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
+ CONFIG_HEVC_VAAPI_HWACCEL + \
+ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
++ CONFIG_HEVC_RPI4_8_HWACCEL + \
++ CONFIG_HEVC_RPI4_10_HWACCEL + \
+ CONFIG_HEVC_VDPAU_HWACCEL)
+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
+
+ switch (sps->pix_fmt) {
+ case AV_PIX_FMT_YUV420P:
+ case AV_PIX_FMT_YUVJ420P:
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++ *fmt++ = AV_PIX_FMT_RPI4_8;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -399,8 +418,14 @@ static enum AVPixelFormat get_format(HEV
+ #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+ *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
+ #endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
++#endif
+ break;
+ case AV_PIX_FMT_YUV420P10:
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++ *fmt++ = AV_PIX_FMT_RPI4_10;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -417,6 +442,9 @@ static enum AVPixelFormat get_format(HEV
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ *fmt++ = AV_PIX_FMT_CUDA;
+ #endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
++#endif
+ break;
+ case AV_PIX_FMT_YUV444P:
+ #if CONFIG_HEVC_VDPAU_HWACCEL
+@@ -459,6 +487,16 @@ static int set_sps(HEVCContext *s, const
+ if (!sps)
+ return 0;
+
++ // If hwaccel then we don't need all the s/w decode helper arrays
++ if (s->avctx->hwaccel) {
++ export_stream_params(s, sps);
++
++ s->avctx->pix_fmt = pix_fmt;
++ s->ps.sps = sps;
++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++ return 0;
++ }
++
+ ret = pic_arrays_init(s, sps);
+ if (ret < 0)
+ goto fail;
+@@ -2809,11 +2847,13 @@ static int hevc_frame_start(HEVCContext
+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+ int ret;
+
+- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+- memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
+- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ if (s->horizontal_bs) {
++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ }
+
+ s->is_decoded = 0;
+ s->first_nal_type = s->nal_unit_type;
+@@ -3230,7 +3270,14 @@ static int hevc_decode_frame(AVCodecCont
+ s->ref = NULL;
+ ret = decode_nal_units(s, avpkt->data, avpkt->size);
+ if (ret < 0)
++ {
++ // Ensure that hwaccel knows this frame is over
++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
++ s->avctx->hwaccel->abort_frame(s->avctx);
++ }
++
+ return ret;
++ }
+
+ if (avctx->hwaccel) {
+ if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
+@@ -3273,15 +3320,19 @@ static int hevc_ref_frame(HEVCContext *s
+ if (ret < 0)
+ return ret;
+
+- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+- if (!dst->tab_mvf_buf)
+- goto fail;
+- dst->tab_mvf = src->tab_mvf;
++ if (src->tab_mvf_buf) {
++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++ if (!dst->tab_mvf_buf)
++ goto fail;
++ dst->tab_mvf = src->tab_mvf;
++ }
+
+- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+- if (!dst->rpl_tab_buf)
+- goto fail;
+- dst->rpl_tab = src->rpl_tab;
++ if (src->rpl_tab_buf) {
++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++ if (!dst->rpl_tab_buf)
++ goto fail;
++ dst->rpl_tab = src->rpl_tab;
++ }
+
+ dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+ if (!dst->rpl_buf)
+@@ -3585,6 +3636,15 @@ AVCodec ff_hevc_decoder = {
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ HWACCEL_NVDEC(hevc),
+ #endif
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++ HWACCEL_RPI4_8(hevc),
++#endif
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++ HWACCEL_RPI4_10(hevc),
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ HWACCEL_V4L2REQUEST(hevc),
++#endif
+ #if CONFIG_HEVC_VAAPI_HWACCEL
+ HWACCEL_VAAPI(hevc),
+ #endif
+--- a/libavcodec/hwaccels.h
++++ b/libavcodec/hwaccels.h
+@@ -34,6 +34,9 @@ extern const AVHWAccel ff_hevc_d3d11va_h
+ extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+ extern const AVHWAccel ff_hevc_nvdec_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
++extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
+ extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+ extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+ extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+--- a/libavcodec/hwconfig.h
++++ b/libavcodec/hwconfig.h
+@@ -24,6 +24,7 @@
+
+
+ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
++#define HWACCEL_CAP_MT_SAFE (1 << 1)
+
+
+ typedef struct AVCodecHWConfigInternal {
+@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
+ HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+ #define HWACCEL_NVDEC(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
++#define HWACCEL_RPI4_8(codec) \
++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel)
++#define HWACCEL_RPI4_10(codec) \
++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel)
++#define HWACCEL_V4L2REQUEST(codec) \
++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
+ #define HWACCEL_VAAPI(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+ #define HWACCEL_VDPAU(codec) \
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+ * MMAL Video Decoder
+ */
+
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ #include <stdatomic.h>
+
+ #include "avcodec.h"
+--- a/libavcodec/pthread_frame.c
++++ b/libavcodec/pthread_frame.c
+@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_w
+
+ /* if the previous thread uses hwaccel then we take the lock to ensure
+ * the threads don't run concurrently */
+- if (avctx->hwaccel) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecConte
+
+ if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
+
+- if (avctx->hwaccel && !p->hwaccel_serializing) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
++ !p->hwaccel_serializing) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+--- a/libavcodec/raw.c
++++ b/libavcodec/raw.c
+@@ -293,6 +293,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags
+ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
+ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+
++ /* RPI (Might as well define for everything) */
++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') },
++
+ { AV_PIX_FMT_NONE, 0 },
+ };
+
+--- a/libavcodec/rawenc.c
++++ b/libavcodec/rawenc.c
+@@ -24,6 +24,7 @@
+ * Raw Video Encoder
+ */
+
++#include "config.h"
+ #include "avcodec.h"
+ #include "raw.h"
+ #include "internal.h"
+@@ -31,6 +32,10 @@
+ #include "libavutil/intreadwrite.h"
+ #include "libavutil/imgutils.h"
+ #include "libavutil/internal.h"
++#include "libavutil/avassert.h"
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ static av_cold int raw_encode_init(AVCodecContext *avctx)
+ {
+@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ return 0;
+ }
+
++#if CONFIG_SAND
++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3 / 2;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height;
++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
++ return 0;
++}
++
++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
++ dst += width * height * 2;
++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
++ return 0;
++}
++
++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height * 2;
++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
++ return 0;
++}
++#endif
++
++
+ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+- const AVFrame *frame, int *got_packet)
++ const AVFrame *src_frame, int *got_packet)
+ {
+- int ret = av_image_get_buffer_size(frame->format,
+- frame->width, frame->height, 1);
++ int ret;
++ AVFrame * frame = NULL;
+
+- if (ret < 0)
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(src_frame)) {
++ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
++ *got_packet = (ret == 0);
+ return ret;
++ }
++#endif
++
++ if ((frame = av_frame_clone(src_frame)) == NULL) {
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++
++ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
++ goto fail;
++
++ ret = av_image_get_buffer_size(frame->format,
++ frame->width, frame->height, 1);
++ if (ret < 0)
++ goto fail;
+
+ if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
+- return ret;
++ goto fail;
+ if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+ (const uint8_t **)frame->data, frame->linesize,
+ frame->format,
+ frame->width, frame->height, 1)) < 0)
+- return ret;
++ goto fail;
+
+ if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
+ frame->format == AV_PIX_FMT_YUYV422) {
+@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *av
+ }
+ }
+ pkt->flags |= AV_PKT_FLAG_KEY;
++ av_frame_free(&frame);
+ *got_packet = 1;
+ return 0;
++
++fail:
++ av_frame_free(&frame);
++ *got_packet = 0;
++ return ret;
+ }
+
+ AVCodec ff_rawvideo_encoder = {
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac.c
+@@ -0,0 +1,2257 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define UNCHECKED_BITSTREAM_READER 1
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++
++#include "cabac_functions.h"
++#include "rpi_hevc_data.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++
++#include "libavutil/rpi_sand_fns.h"
++
++// BY22 is probably faster than simple bypass if the processor has
++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
++// x86 has fast int divide
++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
++// Use native divide if we have a fast one - otherwise use mpy 1/x
++// x86 has a fast integer divide - arm doesn't - unsure about other
++// architectures
++#define USE_BY22_DIV ARCH_X86
++
++// Special case blocks with a single significant coeff
++// Decreases the complexity of the code for a common case but increases the
++// code size.
++#define USE_N_END_1 1
++
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS 22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55) but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS 23
++#endif
++
++#define CABAC_MAX_BIN 31
++
++
++#if USE_BY22 && !USE_BY22_DIV
++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
++
++static const uint32_t cabac_by22_inv_range[256] = {
++ 0, I(257), I(258), I(259),
++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
++ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
++ I(510), I(511)
++};
++#undef I
++#endif // USE_BY22
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_cabac.h"
++#endif
++
++/**
++ * Number of bins for each SyntaxElement.
++ */
++static const int8_t num_bins_in_se[] = {
++ 1, // sao_merge_flag
++ 1, // sao_type_idx
++ 0, // sao_eo_class
++ 0, // sao_band_position
++ 0, // sao_offset_abs
++ 0, // sao_offset_sign
++ 0, // end_of_slice_flag
++ 3, // split_coding_unit_flag
++ 1, // cu_transquant_bypass_flag
++ 3, // skip_flag
++ 3, // cu_qp_delta
++ 1, // pred_mode
++ 4, // part_mode
++ 0, // pcm_flag
++ 1, // prev_intra_luma_pred_mode
++ 0, // mpm_idx
++ 0, // rem_intra_luma_pred_mode
++ 2, // intra_chroma_pred_mode
++ 1, // merge_flag
++ 1, // merge_idx
++ 5, // inter_pred_idc
++ 2, // ref_idx_l0
++ 2, // ref_idx_l1
++ 2, // abs_mvd_greater0_flag
++ 2, // abs_mvd_greater1_flag
++ 0, // abs_mvd_minus2
++ 0, // mvd_sign_flag
++ 1, // mvp_lx_flag
++ 1, // no_residual_data_flag
++ 3, // split_transform_flag
++ 2, // cbf_luma
++ 4, // cbf_cb, cbf_cr
++ 2, // transform_skip_flag[][]
++ 2, // explicit_rdpcm_flag[][]
++ 2, // explicit_rdpcm_dir_flag[][]
++ 18, // last_significant_coeff_x_prefix
++ 18, // last_significant_coeff_y_prefix
++ 0, // last_significant_coeff_x_suffix
++ 0, // last_significant_coeff_y_suffix
++ 4, // significant_coeff_group_flag
++ 44, // significant_coeff_flag
++ 24, // coeff_abs_level_greater1_flag
++ 6, // coeff_abs_level_greater2_flag
++ 0, // coeff_abs_level_remaining
++ 0, // coeff_sign_flag
++ 8, // log2_res_scale_abs
++ 2, // res_scale_sign_flag
++ 1, // cu_chroma_qp_offset_flag
++ 1, // cu_chroma_qp_offset_idx
++};
++
++/**
++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
++ */
++static const int elem_offset[sizeof(num_bins_in_se)] = {
++ 0, // sao_merge_flag
++ 1, // sao_type_idx
++ 2, // sao_eo_class
++ 2, // sao_band_position
++ 2, // sao_offset_abs
++ 2, // sao_offset_sign
++ 2, // end_of_slice_flag
++ 2, // split_coding_unit_flag
++ 5, // cu_transquant_bypass_flag
++ 6, // skip_flag
++ 9, // cu_qp_delta
++ 12, // pred_mode
++ 13, // part_mode
++ 17, // pcm_flag
++ 17, // prev_intra_luma_pred_mode
++ 18, // mpm_idx
++ 18, // rem_intra_luma_pred_mode
++ 18, // intra_chroma_pred_mode
++ 20, // merge_flag
++ 21, // merge_idx
++ 22, // inter_pred_idc
++ 27, // ref_idx_l0
++ 29, // ref_idx_l1
++ 31, // abs_mvd_greater0_flag
++ 33, // abs_mvd_greater1_flag
++ 35, // abs_mvd_minus2
++ 35, // mvd_sign_flag
++ 35, // mvp_lx_flag
++ 36, // no_residual_data_flag
++ 37, // split_transform_flag
++ 40, // cbf_luma
++ 42, // cbf_cb, cbf_cr
++ 46, // transform_skip_flag[][]
++ 48, // explicit_rdpcm_flag[][]
++ 50, // explicit_rdpcm_dir_flag[][]
++ 52, // last_significant_coeff_x_prefix
++ 70, // last_significant_coeff_y_prefix
++ 88, // last_significant_coeff_x_suffix
++ 88, // last_significant_coeff_y_suffix
++ 88, // significant_coeff_group_flag
++ 92, // significant_coeff_flag
++ 136, // coeff_abs_level_greater1_flag
++ 160, // coeff_abs_level_greater2_flag
++ 166, // coeff_abs_level_remaining
++ 166, // coeff_sign_flag
++ 166, // log2_res_scale_abs
++ 174, // res_scale_sign_flag
++ 176, // cu_chroma_qp_offset_flag
++ 177, // cu_chroma_qp_offset_idx
++};
++
++#define CNU 154
++/**
++ * Indexed by init_type
++ */
++static const uint8_t init_values[3][HEVC_CONTEXTS] = {
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 200,
++ // split_coding_unit_flag
++ 139, 141, 157,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ CNU, CNU, CNU,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ CNU,
++ // part_mode
++ 184, CNU, CNU, CNU,
++ // prev_intra_luma_pred_mode
++ 184,
++ // intra_chroma_pred_mode
++ 63, 139,
++ // merge_flag
++ CNU,
++ // merge_idx
++ CNU,
++ // inter_pred_idc
++ CNU, CNU, CNU, CNU, CNU,
++ // ref_idx_l0
++ CNU, CNU,
++ // ref_idx_l1
++ CNU, CNU,
++ // abs_mvd_greater0_flag
++ CNU, CNU,
++ // abs_mvd_greater1_flag
++ CNU, CNU,
++ // mvp_lx_flag
++ CNU,
++ // no_residual_data_flag
++ CNU,
++ // split_transform_flag
++ 153, 138, 138,
++ // cbf_luma
++ 111, 141,
++ // cbf_cb, cbf_cr
++ 94, 138, 182, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63,
++ // last_significant_coeff_y_prefix
++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63,
++ // significant_coeff_group_flag
++ 91, 171, 134, 141,
++ // significant_coeff_flag
++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153,
++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
++ 141, 111,
++ // coeff_abs_level_greater1_flag
++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107,
++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
++ // coeff_abs_level_greater2_flag
++ 138, 153, 136, 167, 152, 152,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 185,
++ // split_coding_unit_flag
++ 107, 139, 126,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ 197, 185, 201,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ 149,
++ // part_mode
++ 154, 139, 154, 154,
++ // prev_intra_luma_pred_mode
++ 154,
++ // intra_chroma_pred_mode
++ 152, 139,
++ // merge_flag
++ 110,
++ // merge_idx
++ 122,
++ // inter_pred_idc
++ 95, 79, 63, 31, 31,
++ // ref_idx_l0
++ 153, 153,
++ // ref_idx_l1
++ 153, 153,
++ // abs_mvd_greater0_flag
++ 140, 198,
++ // abs_mvd_greater1_flag
++ 140, 198,
++ // mvp_lx_flag
++ 168,
++ // no_residual_data_flag
++ 79,
++ // split_transform_flag
++ 124, 138, 94,
++ // cbf_luma
++ 153, 111,
++ // cbf_cb, cbf_cr
++ 149, 107, 167, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108,
++ // last_significant_coeff_y_prefix
++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108,
++ // significant_coeff_group_flag
++ 121, 140, 61, 154,
++ // significant_coeff_flag
++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153,
++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
++ 140, 140,
++ // coeff_abs_level_greater1_flag
++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
++ // coeff_abs_level_greater2_flag
++ 107, 167, 91, 122, 107, 167,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++ { // sao_merge_flag
++ 153,
++ // sao_type_idx
++ 160,
++ // split_coding_unit_flag
++ 107, 139, 126,
++ // cu_transquant_bypass_flag
++ 154,
++ // skip_flag
++ 197, 185, 201,
++ // cu_qp_delta
++ 154, 154, 154,
++ // pred_mode
++ 134,
++ // part_mode
++ 154, 139, 154, 154,
++ // prev_intra_luma_pred_mode
++ 183,
++ // intra_chroma_pred_mode
++ 152, 139,
++ // merge_flag
++ 154,
++ // merge_idx
++ 137,
++ // inter_pred_idc
++ 95, 79, 63, 31, 31,
++ // ref_idx_l0
++ 153, 153,
++ // ref_idx_l1
++ 153, 153,
++ // abs_mvd_greater0_flag
++ 169, 198,
++ // abs_mvd_greater1_flag
++ 169, 198,
++ // mvp_lx_flag
++ 168,
++ // no_residual_data_flag
++ 79,
++ // split_transform_flag
++ 224, 167, 122,
++ // cbf_luma
++ 153, 111,
++ // cbf_cb, cbf_cr
++ 149, 92, 167, 154,
++ // transform_skip_flag
++ 139, 139,
++ // explicit_rdpcm_flag
++ 139, 139,
++ // explicit_rdpcm_dir_flag
++ 139, 139,
++ // last_significant_coeff_x_prefix
++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93,
++ // last_significant_coeff_y_prefix
++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93,
++ // significant_coeff_group_flag
++ 121, 140, 61, 154,
++ // significant_coeff_flag
++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153,
++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
++ 140, 140,
++ // coeff_abs_level_greater1_flag
++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
++ // coeff_abs_level_greater2_flag
++ 107, 167, 91, 107, 107, 167,
++ // log2_res_scale_abs
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ // res_scale_sign_flag
++ 154, 154,
++ // cu_chroma_qp_offset_flag
++ 154,
++ // cu_chroma_qp_offset_idx
++ 154,
++ },
++};
++
++// Scan table for a single-position block: the only entry is 0
++static const uint8_t scan_1x1[1] = {
++ 0,
++};
++
++// Row-by-row scan of a 2x2 block: x coordinate of each scan position
++static const uint8_t horiz_scan2x2_x[4] = {
++ 0, 1, 0, 1,
++};
++
++// Row-by-row scan of a 2x2 block: y coordinate of each scan position
++static const uint8_t horiz_scan2x2_y[4] = {
++ 0, 0, 1, 1
++};
++
++// Row-by-row scan of a 4x4 block: x coordinate of each scan position
++static const uint8_t horiz_scan4x4_x[16] = {
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++ 0, 1, 2, 3,
++};
++
++// Row-by-row scan of a 4x4 block: y coordinate of each scan position
++static const uint8_t horiz_scan4x4_y[16] = {
++ 0, 0, 0, 0,
++ 1, 1, 1, 1,
++ 2, 2, 2, 2,
++ 3, 3, 3, 3,
++};
++
++// Position -> scan index for an 8x8 block. Each 4x4 quadrant holds 16
++// consecutive indices laid out row by row; the quadrants are numbered
++// 0-15, 16-31, 32-47, 48-63 going across then down.
++// NOTE(review): presumably indexed [y][x] - confirm against the users
++static const uint8_t horiz_scan8x8_inv[8][8] = {
++ { 0, 1, 2, 3, 16, 17, 18, 19, },
++ { 4, 5, 6, 7, 20, 21, 22, 23, },
++ { 8, 9, 10, 11, 24, 25, 26, 27, },
++ { 12, 13, 14, 15, 28, 29, 30, 31, },
++ { 32, 33, 34, 35, 48, 49, 50, 51, },
++ { 36, 37, 38, 39, 52, 53, 54, 55, },
++ { 40, 41, 42, 43, 56, 57, 58, 59, },
++ { 44, 45, 46, 47, 60, 61, 62, 63, },
++};
++
++// Diagonal scan of a 2x2 block: x coordinate of each scan position
++static const uint8_t diag_scan2x2_x[4] = {
++ 0, 0, 1, 1,
++};
++
++// Diagonal scan of a 2x2 block: y coordinate of each scan position
++static const uint8_t diag_scan2x2_y[4] = {
++ 0, 1, 0, 1,
++};
++
++// Position -> diagonal scan index for a 2x2 block; consistent with
++// diag_scan2x2_x/_y above when indexed [y][x]
++static const uint8_t diag_scan2x2_inv[2][2] = {
++ { 0, 2, },
++ { 1, 3, },
++};
++
++// Position -> diagonal scan index for a 4x4 block
++// NOTE(review): presumably indexed [y][x] like the 2x2 table - confirm
++static const uint8_t diag_scan4x4_inv[4][4] = {
++ { 0, 2, 5, 9, },
++ { 1, 4, 8, 12, },
++ { 3, 7, 11, 14, },
++ { 6, 10, 13, 15, },
++};
++
++// Position -> diagonal scan index for an 8x8 block treated as a single
++// 64-entry diagonal scan (not split into 4x4 quadrants)
++// NOTE(review): presumably indexed [y][x] like the 2x2 table - confirm
++static const uint8_t diag_scan8x8_inv[8][8] = {
++ { 0, 2, 5, 9, 14, 20, 27, 35, },
++ { 1, 4, 8, 13, 19, 26, 34, 42, },
++ { 3, 7, 12, 18, 25, 33, 41, 48, },
++ { 6, 11, 17, 24, 32, 40, 47, 53, },
++ { 10, 16, 23, 31, 39, 46, 52, 57, },
++ { 15, 22, 30, 38, 45, 51, 56, 60, },
++ { 21, 29, 37, 44, 50, 55, 59, 62, },
++ { 28, 36, 43, 49, 54, 58, 61, 63, },
++};
++
++
++// Pair of precomputed array offsets for one (x, y) position inside a 4x4
++// group: one into the coefficient array and one into the scale array
++typedef struct
++{
++ uint16_t coeff;
++ uint16_t scale;
++} xy_off_t;
++
++// Offset of (x,y) in a row-major array whose rows are (1 << t) entries long
++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
++// Log2 row length used for the scale array: t limited to a maximum of 3
++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
++// Right shift applied to x and y to bring them into the limited scale array
++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
++// Offset of (x,y) in the scale array after applying SCALE_SHR to both coords
++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
++
++// xy_off_t initialiser for position (x,y) with log2 row length t
++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
++
++// 16 xy_off_t entries for a 4x4 group visited along diagonals,
++// starting (0,0), (0,1), (1,0), ...
++#define OFF_DIAG(t) {\
++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++// 16 xy_off_t entries for a 4x4 group visited row by row (x fastest)
++#define OFF_HORIZ(t) {\
++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
++}
++
++// 16 xy_off_t entries for a 4x4 group visited column by column (y fastest)
++#define OFF_VERT(t) {\
++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++// Offset tables: first index selects the scan (0 = diagonal, 1 = horizontal,
++// 2 = vertical), second index is t - 2 for t = 2..5, third index is the
++// position (0..15) within the 4x4 group
++static const xy_off_t off_xys[3][4][16] =
++{
++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
++};
++
++
++// Helper fns
++#ifndef hevc_mem_bits32
++// Read a big-endian 32-bit word starting at the byte that contains bit
++// 'offset' of buf, then shift left so that bit 'offset' becomes the MSB.
++// The low (offset & 7) bits of the result are zero.
++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
++{
++    const uint8_t * const byte_ptr = (const uint8_t *)buf + (offset >> 3);
++    const unsigned int bit_in_byte = offset & 7;
++
++    return AV_RB32(byte_ptr) << bit_in_byte;
++}
++#endif
++
++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
++#define hevc_clz32 hevc_clz32_builtin
++// Count leading zeros of a 32-bit value using the compiler builtin
++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
++{
++    // __builtin_clz operates on int, so discount the extra leading zeros
++    // it would report if int is wider than 32 bits
++    const unsigned int excess_bits = sizeof(int) * 8 - 32;
++
++    return __builtin_clz(x) - excess_bits;
++}
++#endif
++
++// It is unlikely that we will ever need this but include for completeness
++#ifndef hevc_clz32
++// Portable count-leading-zeros over the low 32 bits of x, done as a binary
++// search: at each step, if the top half of the remaining window is empty,
++// add its width to the count and shift the value up.
++// Returns 0 when bit 31 is set and 31 when x == 0.
++static inline unsigned int hevc_clz32(unsigned int x)
++{
++ // Start at 1; the final line subtracts the top bit to correct for this
++ unsigned int n = 1;
++ if ((x & 0xffff0000) == 0) {
++ n += 16;
++ x <<= 16;
++ }
++ if ((x & 0xff000000) == 0) {
++ n += 8;
++ x <<= 8;
++ }
++ if ((x & 0xf0000000) == 0) {
++ n += 4;
++ x <<= 4;
++ }
++ if ((x & 0xc0000000) == 0) {
++ n += 2;
++ x <<= 2;
++ }
++ // Last step: only bit 31 vs bit 30 left to distinguish
++ return n - ((x >> 31) & 1);
++}
++#endif
++
++// Returns non-zero if the CABAC read pointer is 4 or more bytes beyond
++// bytestream_end. Asserts (av_assert0) that the read pointer has not
++// moved before bytestream_start.
++static inline int cabac_overflow(const CABACContext * const cc)
++{
++ av_assert0(cc->bytestream >= cc->bytestream_start);
++ return cc->bytestream >= cc->bytestream_end + 4;
++}
++
++// Exported wrapper: applies cabac_overflow() to the local context's CABAC state
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
++{
++ return cabac_overflow(&lc->cc);
++}
++
++#if !USE_BY22
++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
++// will no longer be called but the setup calls will still exist and we want
++// to null them out
++#define bypass_start(s)
++#define bypass_finish(s)
++#else
++// Use BY22 for residual bypass block
++
++#define bypass_start(cc) get_cabac_by22_start(cc)
++#define bypass_finish(cc) get_cabac_by22_finish(cc)
++
++// BY22 notes that bypass is simply a divide into the bitstream and so we
++// can peek out large quantities of bits at once and treat the result as if
++// it was VLC. In many cases this will lead to O(1) processing rather than
++// O(n) though the setup and teardown is sufficiently expensive that it is
++// only worth using if we expect to be dealing with more than a few bits
++// The definition of "a few bits" will vary from platform to platform but
++// tests on ARM show that it probably isn't worth it for a single coded
++// residual, but is for >1 - it also seems likely that if there are
++// more residuals then they are likely to be bigger and this will make the
++// O(1) nature of the code more worthwhile.
++
++
++// Bypass block start
++// Must be called before _by22_peek is used as it sets the CABAC environment
++// into the correct state. _by22_finish must be called to return to 'normal'
++// (i.e. non-bypass) cabac decoding
++#ifndef get_cabac_by22_start
++// Enter bypass-block mode.
++// Records the number of trailing zero bits of c->low in c->by22.bits,
++// rebuilds c->low from the old low and the next 32 bits of the bytestream,
++// and steps the bytestream pointer back by CABAC_BITS / 8 bytes.
++// Unless USE_BY22_DIV is set, the current range is saved in c->by22.range
++// and c->range is replaced by the cabac_by22_inv_range[] entry selected by
++// the low 8 bits of the range.
++static inline void get_cabac_by22_start(CABACContext * const c)
++{
++ // NOTE(review): __builtin_ctz is undefined for 0 - presumably c->low always has a bit set here; confirm
++ const unsigned int bits = __builtin_ctz(c->low);
++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
++#if !USE_BY22_DIV
++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
++#endif
++
++ c->bytestream -= (CABAC_BITS / 8);
++ c->by22.bits = bits;
++#if !USE_BY22_DIV
++ c->by22.range = c->range;
++ c->range = inv;
++#endif
++ c->low = x;
++}
++#endif
++
++// Bypass block finish
++// Must be called at the end of the bypass block to return to normal operation
++// Leave bypass-block mode.
++// Advances the bytestream pointer by the whole CABAC_BITS-sized units
++// counted in c->by22.bits plus CABAC_BITS / 8 bytes, rebuilds c->low with
++// a single marker bit below the remaining data, and (unless USE_BY22_DIV)
++// restores c->range from c->by22.range.
++static inline void get_cabac_by22_finish(CABACContext * const c)
++{
++ unsigned int used = c->by22.bits;
++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
++ // Bits consumed within the current CABAC_BITS-sized unit
++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
++
++ c->bytestream += bytes_used + (CABAC_BITS / 8);
++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
++#if !USE_BY22_DIV
++ c->range = c->by22.range;
++#endif
++}
++
++// Peek bypass bits
++// _by22_start must be called before _by22_peek is called and _by22_flush
++// must be called afterwards to flush any used bits
++// The actual number of valid bits returned is
++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
++// will be at least 22 which should be long enough for any prefix or suffix
++// though probably not long enough for the worst case combination
++#ifndef get_cabac_by22_peek
++// Return the upcoming bypass bits, MSB first, without consuming them.
++// With USE_BY22_DIV the bits are low / range; otherwise c->range holds
++// the value loaded by _by22_start and the division is replaced by a
++// 32x32->64 multiply keeping the high word.
++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
++{
++#if USE_BY22_DIV
++ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
++#else
++ uint32_t x = c->low & ~1U;
++ const uint32_t inv = c->range;
++
++ // inv == 0 means "use x unscaled"
++ if (inv != 0)
++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
++
++ return x << 1;
++#endif
++}
++#endif
++
++// Flush bypass bits peeked by _by22_peek
++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
++// val is an unmodified copy of whatever _by22_peek returned
++#ifndef get_cabac_by22_flush
++// Consume n bypass bits.
++// c   - CABAC context in bypass-block mode
++// n   - number of bits to consume (see note above: must be >= 1)
++// val - the word returned by the preceding _by22_peek
++// The top n bits of val, multiplied by the real range, are subtracted from
++// the shifted low; fresh bitstream bits are then ORed into the bottom.
++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
++{
++ // Subtract the bits used & reshift up to the top of the word
++#if USE_BY22_DIV
++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
++#else
++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
++#endif
++
++ // and refill lower bits
++ // We will probably OR over some existing bits but that doesn't matter
++ c->by22.bits += n;
++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
++}
++#endif
++
++#endif // USE_BY22
++
++
++// Snapshot the local CABAC state into s->cabac_save so that it can later
++// be restored by load_states()
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
++{
++    // All HEVC_CONTEXTS context models ...
++    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
++    // ... and the 4 bytes of rice-parameter statistics
++    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
++}
++
++// Restore the local CABAC state from the snapshot held in s->cabac_save
++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    // All HEVC_CONTEXTS context models ...
++    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
++    // ... and the 4 bytes of rice-parameter statistics
++    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
++}
++
++// Start the CABAC decoder on the data that follows the current bit
++// position: skips one bit, aligns the bit reader, then hands the remaining
++// bytes to ff_init_cabac_decoder() and returns its result.
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
++{
++    GetBitContext * const gb = &lc->gb;
++    const uint8_t * data;
++    int n_bytes;
++
++    skip_bits(gb, 1);
++    align_get_bits(gb);
++
++    // Both values must be sampled after the skip + align above
++    data = gb->buffer + get_bits_count(gb) / 8;
++    n_bytes = (get_bits_left(gb) + 7) / 8;
++
++    return ff_init_cabac_decoder(&lc->cc, data, n_bytes);
++}
++
++// Initialise every CABAC context model from init_values[] for the current
++// slice type / cabac_init_flag and slice QP, and zero the 4 rice statistics
++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    // The clipped QP does not change per context so compute it once
++    const int qp = av_clip(s->sh.slice_qp, 0, 51);
++    int init_type = 2 - s->sh.slice_type;
++    int ctx;
++
++    // cabac_init_flag swaps init types 1 and 2 for non-I slices
++    if (s->sh.slice_type != HEVC_SLICE_I && s->sh.cabac_init_flag)
++        init_type ^= 3;
++
++    for (ctx = 0; ctx < HEVC_CONTEXTS; ctx++) {
++        const int init_value = init_values[init_type][ctx];
++        // High nibble gives the slope, low nibble the offset
++        const int slope = (init_value >> 4) * 5 - 45;
++        const int offset = ((init_value & 15) << 3) - 16;
++        int pre = 2 * (((slope * qp) >> 4) + offset) - 127;
++
++        // Fold negative values (ones-complement) then cap at 124/125
++        pre ^= pre >> 31;
++        lc->cabac_state[ctx] = pre > 124 ? 124 + (pre & 1) : pre;
++    }
++
++    for (ctx = 0; ctx < 4; ctx++)
++        lc->stat_coeff[ctx] = 0;
++}
++
++// Prepare the CABAC context state at the start of a CTB.
++// A pending init request (lc->cabac_init_req == 1) or CTB_TS_FLAGS_CIREQ
++// gives a fresh initialisation; otherwise CTB_TS_FLAGS_CLOAD reloads the
++// saved state. Either path resets qPy_pred to the slice QP. The pending
++// request is always cleared.
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
++{
++    const int want_init = lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0;
++    // Init takes priority over load when both are flagged
++    const int want_load = !want_init && (ctb_flags & CTB_TS_FLAGS_CLOAD) != 0;
++
++    if (want_init || want_load) {
++        lc->qPy_pred = s->sh.slice_qp;
++        if (want_init)
++            cabac_init_state(s, lc);
++        else
++            load_states(s, lc);
++    }
++
++    lc->cabac_init_req = 0;
++}
++
++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
++
++// Non-inline entry point: decodes one bin with get_cabac_inline() using
++// the context model pointed to by state
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
++{
++ return get_cabac_inline(c, state);
++}
++
++// Non-inline entry point: forwards to get_cabac_terminate() and returns
++// its result unchanged
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
++{
++ return get_cabac_terminate(c);
++}
++
++// Decode sao_type_idx: one context-coded bin (0 => return 0) followed by
++// one bypass bin selecting SAO_BAND (0) or SAO_EDGE (1)
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    if (GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]) == 0)
++        return 0;
++
++    return get_cabac_bypass(&lc->cc) ? SAO_EDGE : SAO_BAND;
++}
++
++// Decode sao_band_position: 5 bypass bins, first bin is the MSB
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
++{
++    int value = 0;
++    int bits_left = 5;
++
++    while (bits_left-- > 0)
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++
++    return value;
++}
++
++// Decode sao_offset_abs: truncated unary in bypass bins.
++// The maximum value is (1 << (min(bit_depth, 10) - 5)) - 1.
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    const int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
++    int i;
++
++    // Count 1 bins until a 0 bin or the maximum is reached
++    for (i = 0; i < length; i++) {
++        if (!get_cabac_bypass(&lc->cc))
++            break;
++    }
++
++    return i;
++}
++
++// Decode sao_offset_sign: a single bypass bin, returned as-is
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
++{
++ return get_cabac_bypass(&lc->cc);
++}
++
++// Decode sao_eo_class: 2 bypass bins, first bin is the MSB
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
++{
++    const int hi = get_cabac_bypass(&lc->cc);
++    const int lo = get_cabac_bypass(&lc->cc);
++
++    return (hi << 1) | lo;
++}
++
++// Decode a signed cu_qp_delta value.
++// Layout as read here: one context-coded bin (0 => delta is 0, no sign is
++// read), then up to 4 more context-coded bins on the next context counting
++// the magnitude up to 5, then - if 5 was reached - a bypass-coded
++// exp-golomb style escape (unary prefix of k ones, k suffix bits), and
++// finally one bypass sign bin (1 => negative).
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
++{
++ int val = 1;
++
++ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)
++ return 0;
++
++ while (val < 5 &&
++ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)
++ val++;
++
++ if (val >= 5) {
++ unsigned int k = 0;
++ // NOTE(review): hitting k == CABAC_MAX_BIN is not treated as an error
++ // here (the check below is commented out), and val is a signed int so
++ // a long prefix could overflow it - confirm this is acceptable
++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
++ val += 1 << k;
++ k++;
++ }
++// if (k == CABAC_MAX_BIN)
++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
++
++ // k suffix bits, MSB first
++ while (k--)
++ val += get_cabac_bypass(&lc->cc) << k;
++ }
++ return get_cabac_bypass(&lc->cc) ? -val : val;
++}
++
++/**
++ * Decode cu_chroma_qp_offset_idx.
++ *
++ * The element is truncated-unary coded, every bin using the single
++ * CU_CHROMA_QP_OFFSET_IDX context, with cMax equal to the PPS value
++ * chroma_qp_offset_list_len_minus1 (H.265 binarization table): a value
++ * equal to cMax is NOT followed by a terminating 0 bin.
++ *
++ * The bound was previously FFMAX(5, chroma_qp_offset_list_len_minus1),
++ * which is never less than 5 and so read extra bins (and could return an
++ * index beyond the signalled list) whenever the list was shorter.
++ *
++ * @param s   decoder context; supplies the active PPS
++ * @param lc  local context holding the CABAC state
++ * @return    decoded index in the range [0, cMax]
++ */
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    // The syntax limits the value to 5; clamp defensively in case the PPS
++    // parser let something larger through
++    const int c_max = FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
++    int i = 0;
++
++    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
++        i++;
++
++    return i;
++}
++
++// Decode part_mode for a coding block of size (1 << log2_cb_size).
++// The trailing comments on each line show the bin string decoded so far.
++// Bins use contexts PART_MODE + 0..3, except the last bin of the AMP
++// modes which is bypass coded. Which bins are present depends on whether
++// the block is at the minimum CB size, on lc->cu.pred_mode and on
++// amp_enabled_flag.
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
++{
++ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
++ return PART_2Nx2N;
++ // Minimum size CB: NxN is possible, AMP is not
++ if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
++ if (lc->cu.pred_mode == MODE_INTRA) // 0
++ return PART_NxN;
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
++ return PART_2NxN;
++ if (log2_cb_size == 3) // 00
++ return PART_Nx2N;
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
++ return PART_Nx2N;
++ return PART_NxN; // 000
++ }
++
++ // Larger CB without AMP: one more bin picks 2NxN or Nx2N
++ if (!s->ps.sps->amp_enabled_flag) {
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
++ return PART_2NxN;
++ return PART_Nx2N;
++ }
++
++ // Larger CB with AMP: horizontal splits
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
++ return PART_2NxN;
++ if (get_cabac_bypass(&lc->cc)) // 0101
++ return PART_2NxnD;
++ return PART_2NxnU; // 0100
++ }
++
++ // Larger CB with AMP: vertical splits
++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
++ return PART_Nx2N;
++ if (get_cabac_bypass(&lc->cc)) // 0001
++ return PART_nRx2N;
++ return PART_nLx2N; // 0000
++}
++
++// Decode mpm_idx: truncated unary (max 2) in bypass bins
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    int i;
++
++    for (i = 0; i < 2; i++) {
++        if (!get_cabac_bypass(&lc->cc))
++            break;
++    }
++
++    return i;
++}
++
++// Decode rem_intra_luma_pred_mode: 5 bypass bins, first bin is the MSB
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    int value = 0;
++    int bits_left = 5;
++
++    while (bits_left-- > 0)
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++
++    return value;
++}
++
++// Decode intra_chroma_pred_mode: one context-coded bin (0 => return 4),
++// otherwise 2 bypass bins giving a value 0..3, first bin is the MSB
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    int hi, lo;
++
++    if (GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]) == 0)
++        return 4;
++
++    hi = get_cabac_bypass(&lc->cc);
++    lo = get_cabac_bypass(&lc->cc);
++    return (hi << 1) | lo;
++}
++
++// Decode merge_idx: truncated unary with maximum max_num_merge_cand - 1.
++// The first bin is context coded, any further bins are bypass coded.
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    int i = 1;
++
++    if (GET_CABAC_LC(elem_offset[MERGE_IDX]) == 0)
++        return 0;
++
++    // Comparison is deliberately left in its original form so the
++    // operand types (and hence signedness of the compare) are unchanged
++    while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))
++        i++;
++
++    return i;
++}
++
++// Decode inter_pred_idc for a prediction block of nPbW x nPbH.
++// When nPbW + nPbH == 12 only the bin on context INTER_PRED_IDC + 4 is
++// read and returned. Otherwise a first bin on context
++// INTER_PRED_IDC + lc->ct_depth selects PRED_BI when set; if it is clear
++// the bin on context INTER_PRED_IDC + 4 is read and returned.
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
++{
++ if (nPbW + nPbH == 12)
++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
++ if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth))
++ return PRED_BI;
++
++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
++}
++
++// Decode a reference index: truncated unary with maximum
++// num_ref_idx_lx - 1. The first (up to) 2 bins are context coded on
++// REF_IDX_L0 + bin index; any remaining bins are bypass coded.
++// The REF_IDX_L0 contexts are used whichever list is being decoded.
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
++{
++ int i = 0;
++ int max = num_ref_idx_lx - 1;
++ int max_ctx = FFMIN(max, 2);
++
++ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i))
++ i++;
++ // Only continue into bypass bins if both context-coded bins were 1
++ if (i == 2) {
++ while (i < max && get_cabac_bypass(&lc->cc))
++ i++;
++ }
++
++ return i;
++}
++
++// Decode abs_mvd_greater0_flag: one bin on the first ABS_MVD_GREATER0_FLAG context
++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
++}
++
++// Decode abs_mvd_greater1_flag: one bin on the second (+ 1)
++// ABS_MVD_GREATER1_FLAG context
++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
++}
++
++#if !USE_BY22
++// Decode the bypass-coded remainder of a motion vector difference whose
++// magnitude is known to be at least 2, together with its sign.
++// A unary prefix of 1 bins starting at k = 1 adds (1 << k) per bin, then
++// k suffix bits are added, and the result is passed negated to
++// get_cabac_bypass_sign() which consumes the sign bin.
++// If the prefix reaches CABAC_MAX_BIN an error is logged and 0 returned.
++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
++{
++ int ret = 2;
++ int k = 1;
++
++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
++ ret += 1U << k;
++ k++;
++ }
++ if (k == CABAC_MAX_BIN) {
++ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
++ return 0;
++ }
++
++ // k suffix bits, MSB first
++ while (k--)
++ ret += get_cabac_bypass(&lc->cc) << k;
++ return get_cabac_bypass_sign(&lc->cc, -ret);
++}
++#endif
++
++// Decode mvd_sign_flag for a magnitude of 1: returns the result of
++// get_cabac_bypass_sign() applied to -1
++// NOTE(review): presumably yields +1 or -1 - confirm against get_cabac_bypass_sign
++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return get_cabac_bypass_sign(&lc->cc, -1);
++}
++
++// Decode transform_skip_flag: one bin, context selected by c_idx_nz
++// (0 or 1 - the init tables hold two contexts for this element)
++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
++}
++
++// Decode explicit_rdpcm_flag: one bin, context selected by c_idx_nz
++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
++}
++
++// Decode explicit_rdpcm_dir_flag: one bin, context selected by c_idx_nz
++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
++{
++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
++}
++
++
++// Decode log2_res_scale_abs: truncated unary (max 4), each bin context
++// coded on LOG2_RES_SCALE_ABS + 4 * idx + bin index
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
++    const int ctx_base = elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx;
++    int i;
++
++    for (i = 0; i < 4; i++) {
++        if (!GET_CABAC_LC(ctx_base + i))
++            break;
++    }
++
++    return i;
++}
++
++// Decode last_sig_coeff_x_prefix and last_sig_coeff_y_prefix.
++// Both are truncated unary with maximum (log2_size << 1) - 1; bin i uses
++// context (i >> ctx_shift) + ctx_offset within the X or Y prefix context
++// set. ctx_offset / ctx_shift depend on log2_size and on whether this is
++// a chroma block (c_idx_nz != 0).
++// Results are written to *last_scx_prefix and *last_scy_prefix.
++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
++ int log2_size, int *last_scx_prefix, int *last_scy_prefix)
++{
++ int i = 0;
++ int max = (log2_size << 1) - 1;
++ int ctx_offset, ctx_shift;
++
++ if (!c_idx_nz) {
++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
++ ctx_shift = (log2_size + 1) >> 2;
++ } else {
++ ctx_offset = 15;
++ ctx_shift = log2_size - 2;
++ }
++ // X prefix
++ while (i < max &&
++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
++ i++;
++ *last_scx_prefix = i;
++
++ // Y prefix - same context selection, different context set
++ i = 0;
++ while (i < max &&
++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
++ i++;
++ *last_scy_prefix = i;
++}
++
++// Decode a last_sig_coeff suffix: a fixed-length value of
++// (prefix >> 1) - 1 bypass bins, MSB first.
++// One bin is always read before the loop, so at least one bin is consumed
++// whatever the computed length is.
++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
++ int last_significant_coeff_prefix)
++{
++ int i;
++ int length = (last_significant_coeff_prefix >> 1) - 1;
++ int value = get_cabac_bypass(&lc->cc);
++
++ for (i = 1; i < length; i++)
++ value = (value << 1) | get_cabac_bypass(&lc->cc);
++ return value;
++}
++
++// Decode coded_sub_block_flag (significant_coeff_group_flag): one bin.
++// Context increment is 0/1 according to whether ctx_cg is non-zero, plus
++// 2 * c_idx_nz.
++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
++{
++    const int ctx_inc = (c_idx_nz << 1) + (ctx_cg != 0);
++
++    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + ctx_inc);
++}
++
++// Decode one sig_coeff_flag bin using context SIGNIFICANT_COEFF_FLAG + offset
++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
++{
++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
++}
++
++#if !USE_BY22
++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
++#endif
++
++
++#ifndef coeff_abs_level_remaining_decode_bypass
++// Decode coeff_abs_level_remaining from a by22 bypass block.
++// Must be called between bypass_start() and bypass_finish().
++// The whole code word is extracted from one peeked word when it fits;
++// otherwise the prefix is flushed and the suffix taken from a second peek.
++// c          - CABAC context in bypass-block mode
++// rice_param - current rice parameter
++// Returns the decoded value.
++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
++{
++ uint32_t y;
++ unsigned int prefix;
++ unsigned int last_coeff_abs_level_remaining;
++ unsigned int n;
++
++ y = get_cabac_by22_peek(c);
++ // Unary prefix length = number of leading 1 bits in the peeked word
++ prefix = hevc_clz32(~y);
++ // y << prefix will always have top bit 0
++
++ if (prefix < 3) {
++ // Short prefix: suffix is rice_param bits following the terminating 0
++ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
++ n = prefix + 1 + rice_param;
++ }
++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
++ {
++ // Long prefix but prefix + suffix still fit in the peeked bits.
++ // The terminating 0 is replaced by 1 to form the implicit top bit
++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
++
++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++ n = prefix * 2 + rice_param - 2;
++ }
++ else {
++ // Too long for one peek: consume the prefix 1s, then peek again so
++ // that the new word starts at the terminating 0
++ unsigned int suffix;
++
++ get_cabac_by22_flush(c, prefix, y);
++ y = get_cabac_by22_peek(c);
++
++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++ n = prefix + rice_param - 2;
++ }
++
++ // Consume whatever remains of the code word
++ get_cabac_by22_flush(c, n, y);
++
++ return last_coeff_abs_level_remaining;
++}
++#endif
++
++// Decode coeff_abs_level_remaining one bypass bin at a time.
++// A unary prefix is read first. For prefix < 3 the suffix is
++// rc_rice_param bits; otherwise it is (prefix - 3 + rc_rice_param) bits
++// and the base value grows exponentially with the prefix.
++// Returns 0 if the prefix reaches CABAC_MAX_BIN.
++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
++{
++ int prefix = 0;
++ int suffix = 0;
++ int last_coeff_abs_level_remaining;
++ int i;
++
++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
++ prefix++;
++ if (prefix == CABAC_MAX_BIN) {
++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
++ return 0;
++ }
++
++ if (prefix < 3) {
++ for (i = 0; i < rc_rice_param; i++)
++ suffix = (suffix << 1) | get_cabac_bypass(c);
++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
++ } else {
++ int prefix_minus3 = prefix - 3;
++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
++ suffix = (suffix << 1) | get_cabac_bypass(c);
++ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
++ << rc_rice_param) + suffix;
++ }
++
++ return last_coeff_abs_level_remaining;
++}
++
++#if !USE_BY22
++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
++// Decode nb sign bins one at a time from bypass.
++// The bins are returned left-justified: the first bin decoded ends up in
++// bit 31 of the result.
++// NOTE(review): nb == 0 would shift by 32 - presumably callers always pass nb >= 1; confirm
++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
++{
++ unsigned int i;
++ uint32_t ret = 0;
++
++ for (i = 0; i < nb; i++)
++ ret = (ret << 1) | get_cabac_bypass(c);
++
++ return ret << (32 - nb);
++}
++#endif
++
++#ifndef coeff_sign_flag_decode_bypass
++// Decode nb sign bins from a by22 bypass block in one peek + flush.
++// Returns the top nb bits of the peeked word with all lower bits cleared,
++// i.e. the first bin is in bit 31.
++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
++{
++ uint32_t y;
++ y = get_cabac_by22_peek(c);
++ get_cabac_by22_flush(c, nb, y);
++ // Keep only the nb bits that were consumed
++ return y & ~(0xffffffffU >> nb);
++}
++#endif
++
++
++#ifndef get_cabac_greater1_bits
++// Decode n coeff_abs_level_greater1 flags.
++// state0 points at the base of a 4-context set. Context selection: 0 once
++// any earlier flag in this call was 1, otherwise i + 1 limited to 3.
++// Returns the flags packed with the first decoded flag in bit (n - 1).
++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
++ uint8_t * const state0)
++{
++ unsigned int i;
++ unsigned int rv = 0;
++ for (i = 0; i != n; ++i) {
++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
++ const unsigned int b = get_cabac(c, state0 + idx);
++ rv = (rv << 1) | b;
++ }
++ return rv;
++}
++#endif
++
++
++// N.B. levels returned are the values assuming coeff_abs_level_remaining
++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
++// this version of events.
++// Decode the greater1 flags (for at most the first 8 coefficients) and the
++// single greater2 flag of a coefficient group.
++// lc                 - local context (CABAC state)
++// n_end              - number of significant coefficients in the group
++// levels             - out: per-coefficient base level (see N.B. above)
++// pprev_subset_coded - out: 1 if any greater1 flag was set, else 0
++// psum               - out: min(n_end, 8), plus 1 if any greater1 flag set
++// idx0_gt1           - index in cabac_state of the greater1 context set
++// idx_gt2            - index in cabac_state of the greater2 context
++// Returns a left-justified bit mask (coefficient 0 in bit 31).
++// NOTE(review): presumably a set bit means coeff_abs_level_remaining is
++// coded for that coefficient - confirm against the caller
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
++ int * const pprev_subset_coded, int * const psum,
++ const unsigned int idx0_gt1, const unsigned int idx_gt2)
++{
++ CABACContext * const c = &lc->cc;
++ uint8_t * const state0 = lc->cabac_state + idx0_gt1;
++ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
++ unsigned int rv;
++ unsigned int i;
++ // Greater1 flags are only coded for the first 8 coefficients
++ const unsigned int n = FFMIN(n_end, 8);
++
++ // Really this is i != n but the simple unconditional loop is cheaper
++ // and faster
++ for (i = 0; i != 8; ++i)
++ levels[i] = 1;
++
++ rv = get_cabac_greater1_bits(c, n, state0);
++
++ *pprev_subset_coded = 0;
++ *psum = n;
++
++ // Left-justify so the first flag is in bit 31
++ // NOTE(review): requires n >= 1 (a shift by 32 is undefined) - confirm callers never pass n_end == 0
++ rv <<= (32 - n);
++ if (rv != 0)
++ {
++ *pprev_subset_coded = 1;
++ *psum = n + 1;
++ // Index of the first coefficient with greater1 set
++ i = hevc_clz32(rv);
++ levels[i] = 2;
++ if (get_cabac(c, state_gt2) == 0)
++ {
++ // Unset first coded bit
++ rv &= ~(0x80000000U >> i);
++ }
++ }
++
++ // Coefficients beyond the first 8 have no greater1 flag: base level 0
++ // and their bits in the mask are always set
++ if (n_end > 8) {
++ const unsigned int g8 = n_end - 8;
++ rv |= ((1 << g8) - 1) << (24 - g8);
++ for (i = 0; i != g8; ++i) {
++ levels[i + 8] = 0;
++ }
++ }
++
++ return rv;
++}
++
++// extended_precision_processing_flag must be false given we are
++// putting the result into a 16-bit array
++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
++// scale_m is uint8_t
++//
++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
++// or it can be 2 (if we have transquant_bypass)
++// shift is set to one less than we really want but would normally be
++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
++// to achieve it
++
++#ifndef trans_scale_sat
++// Scale a coefficient level: multiply by scale * scale_m, shift right by
++// shift, then round-half-up on the final bit (+1, >> 1) and saturate the
++// result to int16 range. 'shift' is therefore one less than the total
++// right shift applied.
++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
++}
++#endif
++
++
++#ifndef update_rice
++// Adapt a persistent rice statistic after decoding a remaining level.
++// With x = (2 * last_coeff_abs_level_remaining) >> c_rice_param:
++// x >= 6 increments *stat_coeff, x == 0 decrements it (not below 0),
++// anything else leaves it unchanged.
++static inline void update_rice(uint8_t * const stat_coeff,
++ const unsigned int last_coeff_abs_level_remaining,
++ const unsigned int c_rice_param)
++{
++ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
++ if (x >= 6)
++ (*stat_coeff)++;
++ else if (x == 0 && *stat_coeff > 0)
++ (*stat_coeff)--;
++}
++#endif
++
++
++// n must be > 0 on entry
++#ifndef get_cabac_sig_coeff_flag_idxs
++// Decode sig_coeff_flags for positions n down to 1 (position 0 is never
++// tested here) and record the positions whose flag is set.
++// c       - CABAC context
++// state0  - base of the context set; ctx_map[i] gives the context for position i
++// n       - highest position to test; must be > 0 on entry
++// ctx_map - position -> context increment table (read only)
++// p       - output cursor; each set position is appended as a byte
++// Returns the output cursor after the last position written.
++//
++// The ctx_map parameter previously carried a duplicated 'const' qualifier
++// ("const uint8_t const *"), which compilers warn about and C++ rejects;
++// the pointed-to type is unchanged.
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,
++    uint8_t * p)
++{
++    do {
++        if (get_cabac(c, state0 + ctx_map[n]))
++            *p++ = n;
++    } while (--n != 0);
++    return p;
++}
++#endif
++
++
++// Decode sig_coeff_flags for positions n..1 via
++// get_cabac_sig_coeff_flag_idxs(), writing the set positions to flag_idx.
++// Returns the number of entries written.
++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map, // const ptr here but not in asm
++    uint8_t * const flag_idx)
++{
++    uint8_t * const end = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx);
++
++    return end - flag_idx;
++}
++
++// Brace-enclosed list of the 16 arguments in the order given
++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x1, x2, x3,\
++ x4, x5, x6, x7,\
++ x8, x9, x10, x11,\
++ x12, x13, x14, x15}
++
++// Brace-enclosed list of the 16 arguments transposed as a 4x4 grid
++// (x0, x4, x8, x12, x1, ...)
++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x4, x8, x12,\
++ x1, x5, x9, x13,\
++ x2, x6, x10, x14,\
++ x3, x7, x11, x15}
++
++// Brace-enclosed list of the 16 arguments, treated as a row-major 4x4
++// grid, reordered along its diagonals (x0, x4, x1, x8, x5, x2, ...)
++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++ x0, x4, x1, x8,\
++ x5, x2, x12, x9,\
++ x6, x3, x13, x10,\
++ x7, x14, x11, x15}
++
++
++// Step backwards from coefficient group i to the next group that is coded.
++// lc                           - local context (CABAC state)
++// i                            - current group index; searching starts at i - 1
++// c_idx_nz                     - non-zero for chroma
++// significant_coeff_group_flag - one bit mask per group row; bit x_cg set
++//                                when that group is coded (updated here)
++// scan_x_cg / scan_y_cg        - group coordinates by scan index
++// pPrev_sig                    - out: neighbour pattern for the group found,
++//                                bit 1 from the gf_y[0] row at x_cg + 1,
++//                                bit 0 from the gf_y[1] row at x_cg
++// Group 0 is always treated as coded without reading a flag.
++// Returns the index of the group found, or -1 if there are none left
++// (in which case *pPrev_sig is not written).
++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
++ uint8_t * const significant_coeff_group_flag,
++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
++ int * const pPrev_sig)
++{
++ while (--i >= 0) {
++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
++ const unsigned int x_cg = scan_x_cg[i];
++
++ // For the flag decode we only care about Z/NZ but
++ // we use the full Right * 2 + Down when calculating
++ // significant coeff flags so we obtain it here.
++ //
++ // The group flag array is one longer than it needs to
++ // be so we don't need to check for y_cg limits
++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
++
++ if (i == 0 ||
++ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))
++ {
++ gf_y[0] |= (1 << x_cg);
++ *pPrev_sig = prev_sig;
++ break;
++ }
++ }
++
++ return i;
++}
++
++// Queue an "add residual" command for a transform block in the job's
++// intra command list (jb->intra.cmds).
++// s               - decoder context (frame + SPS)
++// jb              - job whose command list is appended to / patched
++// log2_trafo_size - log2 of the transform size
++// c_idx           - colour component index (0, 1 or 2)
++// x0, y0          - block position before the per-component shift
++// coeffs          - residual coefficients
++// When c_idx == 2 and the previous command is the c_idx 1 command for the
++// same destination the two are merged into one command rather than
++// appending a new one.
++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const unsigned int log2_trafo_size, const unsigned int c_idx,
++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
++{
++ const AVFrame * const frame = s->frame;
++ const unsigned int stride = frame_stride1(s->frame, c_idx);
++ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
++ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++ // Hard-wired to the sand-frame addressing path below
++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame);
++ uint8_t * const dst = !is_sliced ?
++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(frame, x, y) :
++ av_rpi_sand_frame_pos_c(frame, x, y);
++
++ const unsigned int i = jb->intra.n;
++ // Previous command; only dereferenced when i != 0
++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
++
++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++ pc->ta.dst == dst)
++ {
++ // Previous was residual-U at the same dst: turn it into a combined command
++ // NOTE(review): coeffs is not stored on this path - presumably the
++ // combined command finds these coeffs relative to the first buffer; confirm
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->ta.stride == stride);
++
++ pc->type = RPI_PRED_ADD_RESIDUAL_C;
++ }
++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++ pc->dc.dst == dst)
++ {
++ // Previous was DC-only U at the same dst: keep its DC and attach
++ // this block's residual
++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->dc.stride == stride);
++
++ // Rewrite as add residual - must rewrite all fields as different union member
++ pc->type = RPI_PRED_ADD_RESIDUAL_V;
++ pc->ta.buf = coeffs;
++ pc->ta.dst = dst;
++ pc->ta.stride = stride;
++ pc->ta.dc = dc;
++ }
++ else
++ {
++ // Nothing to merge with: append a new command
++ // NOTE(review): c_idx is not written to the new command here although
++ // the asserts above read pc->c_idx - confirm where it gets set
++ HEVCPredCmd * const cmd = pc + 1;
++ jb->intra.n = i + 1;
++
++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
++ cmd->size = log2_trafo_size;
++ cmd->ta.buf = coeffs;
++ cmd->ta.dst = dst;
++ cmd->ta.stride = stride;
++ cmd->ta.dc = 0;
++ }
++}
++
++
++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, // Record a DC-only residual in jb->intra, merging a V DC into the preceding U cmd when both target the same dst
++ const unsigned int log2_trafo_size, const unsigned int c_idx,
++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
++{
++ const AVFrame * const frame = s->frame;
++ const unsigned int stride = frame_stride1(s->frame, c_idx);
++ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
++ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++ const int is_sliced = 1; // Hard-wired, so the !is_sliced alternative below is never taken
++ uint8_t * const dst = !is_sliced ?
++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(frame, x, y) :
++ av_rpi_sand_frame_pos_c(frame, x, y);
++
++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); // Only coeffs[0] is read: offset added, then shifted down by (shift + 1)
++
++ const unsigned int i = jb->intra.n;
++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; // Last queued cmd; only dereferenced after an i != 0 check
++
++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++ pc->ta.dst == dst)
++ {
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->ta.stride == stride);
++
++ pc->ta.dc = (int16_t)coeff; // Previous cmd is a U residual: attach the V DC to it (type is left unchanged)
++ }
++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++ pc->dc.dst == dst)
++ {
++ av_assert1(pc->size == log2_trafo_size &&
++ pc->c_idx == 1 &&
++ pc->dc.stride == stride &&
++ (pc->dc.dc & ~0xffff) == 0);
++
++ pc->dc.dc |= (coeff << 16); // Previous cmd is a U DC: pack the V DC into the top 16 bits
++ }
++ else
++ {
++ HEVCPredCmd * const cmd = pc + 1; // No merge possible: append a fresh cmd at index i
++ jb->intra.n = i + 1;
++
++ cmd->type = RPI_PRED_ADD_DC + c_idx;
++ cmd->size = log2_trafo_size;
++ cmd->dc.dst = dst;
++ cmd->dc.stride = stride;
++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; // c_idx 0: full value, c_idx 2: top 16 bits, c_idx 1: low 16 bits
++ }
++}
++
++
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, // Decode and scale the coeffs of one transform block, then transform in place and/or queue the add-DC / add-residual cmd
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx)
++{
++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
++
++ int last_significant_coeff_x, last_significant_coeff_y;
++ int num_coeff = 0;
++ int prev_subset_coded = 0;
++
++ int num_last_subset;
++ int x_cg_last_sig, y_cg_last_sig;
++
++ const uint8_t *scan_x_cg, *scan_y_cg;
++ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++
++ int use_vpu;
++#if RPI_COMPRESS_COEFFS
++ int num_nonzero = 0;
++ int use_compress = 0;
++ int *coeffs32;
++#endif
++ int use_dc = 0;
++ int16_t *coeffs;
++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
++ int explicit_rdpcm_flag = 0;
++ int explicit_rdpcm_dir_flag;
++
++ int i;
++ int shift,scale;
++ const uint8_t *scale_matrix = NULL;
++ uint8_t dc_scale;
++ const int c_idx_nz = (c_idx != 0);
++ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++ int prev_sig = 0;
++ int may_hide_sign;
++
++ int16_t dummy_coeffs[16]; // Scratch used in place of a job coeff buffer when use_dc is set
++
++ // Derive QP for dequant
++ if (!lc->cu.cu_transquant_bypass_flag) {
++ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
++
++ if (s->ps.pps->transform_skip_enabled_flag &&
++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
++ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
++ if (transform_skip_flag) {
++ trans_skip_or_bypass = 1;
++ if (lc->cu.pred_mode == MODE_INTRA &&
++ s->ps.sps->implicit_rdpcm_enabled_flag &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
++ may_hide_sign = 0;
++ }
++ }
++ }
++
++ {
++ static const uint8_t level_scale[8] = {
++ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8
++ };
++ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; // Packed: low 3 bits index level_scale, the remaining bits (qp6 >> 3) reduce shift
++
++ // Shift is set to one less than will actually occur as the scale
++ // and saturate step adds 1 and then shifts right again
++ scale = level_scale[qp6 & 7];
++// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
++ shift = log2_trafo_size - (qp6 >> 3);
++
++ if (shift < 0) {
++ scale <<= -shift;
++ shift = 0;
++ }
++ }
++
++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
++ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++ const unsigned int matrix_id =
++ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
++
++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
++ dc_scale = scale_matrix[0];
++ if (log2_trafo_size >= 4)
++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
++ }
++ else
++ {
++ static const uint8_t sixteen_scale[64] = {
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16,
++ 16, 16, 16, 16, 16, 16, 16, 16
++ };
++ scale_matrix = sixteen_scale;
++ dc_scale = 16;
++ }
++ } else {
++ static const uint8_t unit_scale[64] = {
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ 1, 1, 1, 1, 1, 1, 1, 1,
++ };
++ scale_matrix = unit_scale;
++ shift = 0;
++ scale = 2; // We will shift right to kill this
++ dc_scale = 1;
++
++ may_hide_sign = 0;
++ }
++
++
++
++
++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
++ trans_skip_or_bypass) {
++ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
++ if (explicit_rdpcm_flag) {
++ may_hide_sign = 0;
++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
++ }
++ }
++
++ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
++ &last_significant_coeff_x, &last_significant_coeff_y);
++
++ if (last_significant_coeff_x > 3) {
++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
++ (2 + (last_significant_coeff_x & 1)) +
++ suffix;
++ }
++
++ if (last_significant_coeff_y > 3) {
++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
++ (2 + (last_significant_coeff_y & 1)) +
++ suffix;
++ }
++
++ if (scan_idx == SCAN_VERT)
++ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
++
++ x_cg_last_sig = last_significant_coeff_x >> 2;
++ y_cg_last_sig = last_significant_coeff_y >> 2;
++
++ switch (scan_idx) {
++ case SCAN_DIAG: {
++ int last_x_c = last_significant_coeff_x & 3;
++ int last_y_c = last_significant_coeff_y & 3;
++
++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
++
++ switch (log2_trafo_size) {
++ case 2:
++ scan_x_cg = scan_1x1;
++ scan_y_cg = scan_1x1;
++ break;
++ case 3:
++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = diag_scan2x2_x;
++ scan_y_cg = diag_scan2x2_y;
++ break;
++ case 4:
++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
++ break;
++ case 5:
++ default:
++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
++ break;
++ }
++ break;
++ }
++ case SCAN_HORIZ:
++ scan_x_cg = horiz_scan2x2_x;
++ scan_y_cg = horiz_scan2x2_y;
++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
++ break;
++ default: //SCAN_VERT
++ scan_x_cg = horiz_scan2x2_y;
++ scan_y_cg = horiz_scan2x2_x;
++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
++ break;
++ }
++ num_coeff++;
++ num_last_subset = (num_coeff - 1) >> 4;
++
++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++
++ {
++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
++ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing
++ use_vpu = 0;
++ use_dc = (num_coeff == 1) && !special &&
++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
++
++ if (use_dc) {
++ // Just need a little empty space
++ coeffs = dummy_coeffs;
++ // No need to clear
++ }
++ else
++ {
++ use_vpu = !special && log2_trafo_size >= 4;
++#if RPI_COMPRESS_COEFFS
++ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
++#endif
++ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
++#if RPI_COMPRESS_COEFFS
++ coeffs32 = (int*)coeffs;
++ if (!use_compress)
++#endif
++#if HAVE_NEON
++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
++#else
++ memset(coeffs, 0, ccount * sizeof(int16_t));
++#endif
++ }
++ }
++
++ i = num_last_subset;
++ do {
++ int implicit_non_zero_coeff = 0;
++ int n_end;
++
++ uint8_t significant_coeff_flag_idx[16];
++ unsigned int nb_significant_coeff_flag = 0;
++
++ if (i == num_last_subset) {
++ // First time through
++ int last_scan_pos = num_coeff - (i << 4) - 1;
++ n_end = last_scan_pos - 1;
++ significant_coeff_flag_idx[0] = last_scan_pos;
++ nb_significant_coeff_flag = 1;
++ } else {
++ n_end = 15;
++ implicit_non_zero_coeff = (i != 0);
++ }
++
++ if (n_end >= 0) {
++ static const uint8_t ctx_idx_maps_ts2[3][16] = {
++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
++ };
++ // N.B. prev_sig = Right * 2 + Down
++ static const uint8_t ctx_idx_maps[3][4][16] = {
++ {
++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ },
++ {
++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ },
++ {
++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
++ }
++ };
++ const uint8_t *ctx_idx_map_p;
++ int scf_offset = 0;
++
++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++ ctx_idx_map_p = ctx_idx_maps[0][3];
++ scf_offset = 40 + c_idx_nz;
++ } else {
++ if (c_idx_nz != 0)
++ scf_offset = 27;
++
++ if (log2_trafo_size == 2) {
++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
++ } else {
++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
++ if (!c_idx_nz) {
++ if (i != 0)
++ scf_offset += 3;
++
++ if (log2_trafo_size == 3) {
++ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
++ } else {
++ scf_offset += 21;
++ }
++ } else {
++ if (log2_trafo_size == 3)
++ scf_offset += 9;
++ else
++ scf_offset += 12;
++ }
++ }
++ }
++
++ if (n_end > 0) {
++ int cnt = get_sig_coeff_flag_idxs(&lc->cc,
++ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
++ n_end, ctx_idx_map_p,
++ significant_coeff_flag_idx + nb_significant_coeff_flag);
++
++ nb_significant_coeff_flag += cnt;
++ if (cnt != 0) {
++ implicit_non_zero_coeff = 0;
++ }
++ }
++
++ if (implicit_non_zero_coeff == 0) {
++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++ scf_offset = 42 + c_idx_nz;
++ } else {
++ if (i == 0) {
++ scf_offset = c_idx_nz ? 27 : 0;
++ } else {
++ scf_offset = 2 + scf_offset;
++ }
++ }
++ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++ nb_significant_coeff_flag++;
++ }
++ } else {
++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++ nb_significant_coeff_flag++;
++ }
++ }
++#if RPI_COMPRESS_COEFFS
++ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
++ int16_t temp[32*32];
++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
++ memcpy(temp, coeffs, sizeof(int)*num_nonzero);
++ coeffs32 = (int *)temp;
++ memset(coeffs, 0, ccount * sizeof(int16_t));
++ num_nonzero--;
++ while (num_nonzero >= 0) {
++ const unsigned int res = coeffs32[num_nonzero];
++ const unsigned int offset = res & 0xffff;
++ coeffs[ offset ] = res >> 16;
++ num_nonzero--;
++ }
++ use_compress = 0;
++ }
++#endif
++
++ if (nb_significant_coeff_flag != 0) {
++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
++ ((i != 0 && !c_idx_nz) ? 2 : 0) |
++ prev_subset_coded;
++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
++ (gt1_idx_delta << 2);
++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
++ gt1_idx_delta;
++
++ const unsigned int x_cg = scan_x_cg[i];
++ const unsigned int y_cg = scan_y_cg[i];
++ int16_t * const blk_coeffs = coeffs +
++ ((x_cg + (y_cg << log2_trafo_size)) << 2);
++ // This calculation is 'wrong' for log2_trafo_size == 2
++ // but that doesn't matter as in this case x_cg & y_cg
++ // are always 0 so result is correct (0) anyway
++ const uint8_t * const blk_scale = scale_matrix +
++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
++
++ // * The following code block doesn't deal with these flags:
++ // (nor did the one it replaces)
++ //
++ // cabac_bypass_alignment_enabled_flag
++ // This should be easy but I can't find a test case
++ // extended_precision_processing_flag
++ // This can extend the required precision past 16bits
++ // so is probably tricky - also no example found yet
++
++#if USE_N_END_1
++ if (nb_significant_coeff_flag == 1) {
++ // There is a small gain to be had from special casing the single
++ // transform coefficient case. The reduction in complexity
++ // makes up for the code duplication.
++
++ int trans_coeff_level = 1;
++ int coeff_sign_flag;
++ int coded_val = 0;
++
++ // initialize first elem of coeff_abs_level_greater1_flag
++ prev_subset_coded = 0;
++
++ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
++ trans_coeff_level = 2;
++ prev_subset_coded = 1;
++ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
++ }
++
++ // Probably not worth the overhead of starting by22 for just one value
++ coeff_sign_flag = get_cabac_bypass(&lc->cc);
++
++ if (coded_val)
++ {
++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
++ } else {
++ uint8_t * const stat_coeff =
++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++ const unsigned int c_rice_param = *stat_coeff >> 2;
++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
++
++ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++ }
++ }
++
++ {
++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
++ const unsigned int scale_m = blk_scale[xy_off->scale];
++ const int res = trans_scale_sat(
++ (trans_coeff_level ^ k) - k, // Apply sign
++ scale,
++ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
++ shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress)
++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++ else
++#endif
++ blk_coeffs[xy_off->coeff] = res;
++ }
++ }
++ else
++#endif
++ {
++ int sign_hidden = may_hide_sign;
++ int levels[16]; // Should be able to get away with int16_t but that fails some tests
++ uint32_t coeff_sign_flags;
++ uint32_t coded_vals = 0;
++ // Sum(abs(level[]))
++ // In fact we only need the bottom bit and in some future
++ // version that may be all we calculate
++ unsigned int sum_abs;
++
++ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
++
++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
++ sign_hidden = 0;
++
++ // -- Start bypass block
++
++ bypass_start(&lc->cc);
++
++ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
++
++ if (coded_vals != 0)
++ {
++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
++ int * level = levels - 1;
++
++ do {
++ {
++ const unsigned int z = hevc_clz32(coded_vals) + 1;
++ level += z;
++ coded_vals <<= z;
++ }
++
++ {
++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
++
++ sum_abs += last_coeff_abs_level_remaining + 1;
++ *level = trans_coeff_level;
++
++ if (stat_coeff != NULL)
++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++ stat_coeff = NULL;
++
++ if (trans_coeff_level > (3 << c_rice_param) &&
++ (c_rice_param < 4 || rice_adaptation_enabled))
++ ++c_rice_param;
++ }
++ } while (coded_vals != 0);
++ }
++
++ // sign_hidden = 0 or 1 so we can combine the tests
++ if ((sign_hidden & sum_abs) != 0) {
++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
++ }
++
++ bypass_finish(&lc->cc);
++
++ // -- Finish bypass block
++
++ // Scale loop
++ {
++ int m = nb_significant_coeff_flag - 1;
++
++ // Deal with DC component (if any) first
++ if (i == 0 && significant_coeff_flag_idx[m] == 0)
++ {
++ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++ const int res = trans_scale_sat(
++ (levels[m] ^ k) - k, scale, dc_scale, shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress)
++ {
++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
++ }
++ else
++#endif
++ {
++ blk_coeffs[0] = res;
++ }
++ --m;
++ }
++
++#if !USE_N_END_1
++ // If N_END_1 set then m was at least 1 initially
++ if (m >= 0)
++#endif
++ {
++ do {
++ const xy_off_t * const xy_off = scan_xy_off +
++ significant_coeff_flag_idx[m];
++ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++ const int res = trans_scale_sat(
++ (levels[m] ^ k) - k,
++ scale,
++ blk_scale[xy_off->scale],
++ shift);
++#if RPI_COMPRESS_COEFFS
++ if (use_compress) {
++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++ } else
++#endif
++ blk_coeffs[xy_off->coeff] = res;
++ } while (--m >= 0);
++ }
++ }
++
++ }
++ }
++ } while ((i = next_subset(lc, i, c_idx_nz,
++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
++ !cabac_overflow(&lc->cc));
++
++ if (lc->cu.cu_transquant_bypass_flag) {
++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; // NOTE(review): the trans-skip path below tests explicit_rdpcm_flag first; here the implicit flag wins - confirm this asymmetry is intended
++
++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++ }
++ } else {
++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
++ log2_trafo_size == 2 &&
++ lc->cu.pred_mode == MODE_INTRA;
++ if (rot) {
++ for (i = 0; i < 8; i++)
++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
++ }
++
++ s->hevcdsp.dequant(coeffs, log2_trafo_size);
++
++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++ lc->cu.pred_mode == MODE_INTRA &&
++ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
++
++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++ }
++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
++ s->hevcdsp.transform_4x4_luma(coeffs);
++ }
++ else if (!use_vpu)
++ {
++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++ if (max_xy == 0)
++ {
++ if (use_dc)
++ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++ else
++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
++ }
++ else {
++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
++ if (max_xy < 4)
++ col_limit = FFMIN(4, col_limit);
++ else if (max_xy < 8)
++ col_limit = FFMIN(8, col_limit);
++ else if (max_xy < 12)
++ col_limit = FFMIN(24, col_limit);
++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
++ }
++ }
++ }
++
++#if 0
++ // Mildly rotted - we support no mode where cross is valid
++ if (lc->tu.cross_pf) {
++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ const int ccount = 1 << (log2_trafo_size * 2);
++
++ for (i = 0; i < ccount; i++) {
++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ }
++#endif
++
++ if (!use_dc) {
++#if RPI_COMPRESS_COEFFS
++ if (use_compress) {
++ coeffs32[num_nonzero] = 0; // presumably a terminating entry for the packed ((value << 16) + offset) list - confirm against the consumer
++ }
++#endif
++ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++ }
++}
++
++#if !USE_BY22
++// Decodes the x & y mvd components and returns them packed with MV_XY(x, y)
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++{
++ int x = abs_mvd_greater0_flag_decode(lc);
++ int y = abs_mvd_greater0_flag_decode(lc);
++
++ if (x) // After these two steps x / y hold 0, 1 or 2 (greater0 + greater1 flags)
++ x += abs_mvd_greater1_flag_decode(lc);
++ if (y)
++ y += abs_mvd_greater1_flag_decode(lc);
++
++ switch (x) {
++ case 2: x = mvd_decode(lc); break; // Both flags set: full value comes from mvd_decode()
++ case 1: x = mvd_sign_flag_decode(lc); break; // Only greater0 set: result is whatever mvd_sign_flag_decode() returns (presumably +/-1 - confirm)
++ case 0: x = 0; break;
++ }
++
++ switch (y) {
++ case 2: y = mvd_decode(lc); break;
++ case 1: y = mvd_sign_flag_decode(lc); break;
++ case 0: y = 0; break;
++ }
++ return MV_XY(x,y);
++}
++#else
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) // USE_BY22 variant: reads the bypass part of both components from peeked bits; returns MV_XY(x, y)
++{
++ int x = abs_mvd_greater0_flag_decode(lc);
++ int y = abs_mvd_greater0_flag_decode(lc);
++
++ if ((x | y) == 0)
++ return 0; // Neither greater0 flag set: nothing more is read
++
++ if (x != 0)
++ x += abs_mvd_greater1_flag_decode(lc);
++ if (y != 0)
++ y += abs_mvd_greater1_flag_decode(lc);
++
++ if ((x | y) == 1) // True only when neither component reached 2, i.e. just sign bit(s) remain
++ {
++ // Not worth starting BY22
++ if (x != 0)
++ x = mvd_sign_flag_decode(lc);
++ if (y != 0)
++ y = mvd_sign_flag_decode(lc);
++ }
++ else
++ {
++ CABACContext * const cc = &lc->cc;
++ uint32_t val; // Last peeked word, handed back to get_cabac_by22_flush()
++ uint32_t b; // Working copy of val, shifted left as bits are consumed
++ unsigned int n = 0; // Bits used from the current peek and not yet flushed
++
++ bypass_start(cc);
++ b = val = get_cabac_by22_peek(cc);
++
++ if (x == 1) {
++ x = ((int32_t)b >> 31) | 1; // -1 if the top peeked bit is set, else +1
++ n = 1;
++ b <<= 1;
++ }
++ else if (x == 2) {
++ // EG1 so we have (leading one bits + 1) of suffix
++ // This makes prefix & suffix lengths the same
++ const unsigned int k = hevc_clz32(~b) + 1;
++ int s;
++
++ av_assert2(k <= 15);
++
++ b <<= k;
++ n = 2 * k + 1; // Includes suffix & sign
++
++ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
++ // if we are going to do this without a flush
++ if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
++ {
++ // Need too many bits - flush the k prefix bits and re-peek
++ // so only suffix & sign (k + 1 bits) count against the new peek
++ get_cabac_by22_flush(cc, k, val);
++ b = val = get_cabac_by22_peek(cc);
++ n = k + 1;
++ }
++
++ x = (b >> (32 - k)) + (1 << k);
++ b <<= k;
++ s = (int32_t)b >> 31;
++ x = (x ^ s) - s;
++ b <<= 1;
++
++ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
++ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
++ {
++ get_cabac_by22_flush(cc, n, val);
++ b = val = get_cabac_by22_peek(cc);
++ n = 0;
++ }
++ }
++
++ if (y == 1) {
++ y = ((int32_t)b >> 31) | 1; // -1 if the top remaining bit is set, else +1
++ ++n;
++ // don't care about b anymore
++ }
++ else if (y == 2) {
++ const unsigned int k = hevc_clz32(~b) + 1;
++ int s;
++
++ av_assert2(k <= 15);
++
++ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
++ // if we are going to do this without a flush
++ b <<= k;
++ n += 2 * k + 1;
++
++ if (n > CABAC_BY22_PEEK_BITS)
++ {
++ // Need too many bits - flush everything up to and including y's prefix
++ get_cabac_by22_flush(cc, n - (k + 1), val);
++ b = val = get_cabac_by22_peek(cc);
++ n = k + 1;
++ }
++
++ y = (b >> (32 - k)) + (1 << k);
++ s = (int32_t)(b << k) >> 31;
++ y = (y ^ s) - s;
++ // don't care about b anymore
++ }
++
++ get_cabac_by22_flush(cc, n, val); // Consume the n bits still outstanding
++ bypass_finish(cc);
++ }
++
++ return MV_XY(x, y);
++}
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac_fns.h
+@@ -0,0 +1,217 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
++#define AVCODEC_RPI_HEVC_CABAC_FNS_H
++
++#include "config.h"
++#include "rpi_hevcdec.h"
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
++
++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx);
++
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
++
++#define HEVC_BIN_SAO_MERGE_FLAG 0
++#define HEVC_BIN_SAO_TYPE_IDX 1
++#define HEVC_BIN_SAO_EO_CLASS 2
++#define HEVC_BIN_SAO_BAND_POSITION 2
++#define HEVC_BIN_SAO_OFFSET_ABS 2
++#define HEVC_BIN_SAO_OFFSET_SIGN 2
++#define HEVC_BIN_END_OF_SLICE_FLAG 2
++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2
++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5
++#define HEVC_BIN_SKIP_FLAG 6
++#define HEVC_BIN_CU_QP_DELTA 9
++#define HEVC_BIN_PRED_MODE 12
++#define HEVC_BIN_PART_MODE 13
++#define HEVC_BIN_PCM_FLAG 17
++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17
++#define HEVC_BIN_MPM_IDX 18
++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18
++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18
++#define HEVC_BIN_MERGE_FLAG 20
++#define HEVC_BIN_MERGE_IDX 21
++#define HEVC_BIN_INTER_PRED_IDC 22
++#define HEVC_BIN_REF_IDX_L0 27
++#define HEVC_BIN_REF_IDX_L1 29
++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31
++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33
++#define HEVC_BIN_ABS_MVD_MINUS2 35
++#define HEVC_BIN_MVD_SIGN_FLAG 35
++#define HEVC_BIN_MVP_LX_FLAG 35
++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36
++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37
++#define HEVC_BIN_CBF_LUMA 40
++#define HEVC_BIN_CBF_CB_CR 42
++#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46
++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48
++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88
++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88
++#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160
++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166
++#define HEVC_BIN_COEFF_SIGN_FLAG 166
++#define HEVC_BIN_LOG2_RES_SCALE_ABS 166
++#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177
++
++
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
++
++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { // Skip n raw bytes and re-init the decoder after them; returns the start of the skipped bytes or NULL on failure
++ const uint8_t *ptr = c->bytestream;
++
++ if (c->low & 0x1) // Step ptr back depending on low's bottom bits (presumably undoing decoder read-ahead - confirm)
++ ptr--;
++#if CABAC_BITS == 16
++ if (c->low & 0x1FF)
++ ptr--;
++#endif
++ if ((int) (c->bytestream_end - ptr) < n) // Fewer than n bytes left
++ return NULL;
++ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
++ return NULL;
++
++ return ptr; // N.B. the pre-skip position, not ptr + n
++}
++
++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); // Single fixed state slot, no neighbour-dependent offset
++}
++
++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); // Single fixed state slot
++}
++
++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); // Single fixed state slot
++}
++
++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int ct_depth,
++ const unsigned int x0, const unsigned int y0)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + // State offset is base + 0..2:
++ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + // +1 if the left stash entry (bits above bit 0) exceeds ct_depth
++ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); // +1 likewise for the entry above; stashes are indexed in units of 8 pixels
++}
++
++static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int x_cb, const int y_cb) // x_cb & y_cb are not used by this implementation
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + // State offset is base + 0..2:
++ (s->cabac_stash_left[y0 >> 3] & 1) + // +1 if bit 0 of the left stash entry is set
++ (s->cabac_stash_up[x0 >> 3] & 1)); // +1 if bit 0 of the up stash entry is set
++}
++
++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
++}
++
++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
++{ // Unlike its neighbours this flag uses the terminate decoder, so no context model from cabac_state is involved
++ return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++}
++
++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
++}
++
++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
++}
++
++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
++}
++
++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
++{
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
++}
++
++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{ // One context per transform depth: model index is HEVC_BIN_CBF_CB_CR + trafo_depth
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
++}
++
++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{ // Two contexts: offset 1 when trafo_depth == 0, offset 0 for any deeper level
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
++}
++
++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
++{ // Context offset is 5 - log2_trafo_size, so larger transforms select lower-numbered contexts
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
++}
++
++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
++{ // Context is selected directly by idx (presumably the chroma component index - confirm against the caller)
++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
++}
++
++
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.c
+@@ -0,0 +1,75 @@
++/*
++ * HEVC shared tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "rpi_hevc_data.h"
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { // x coordinate of the n-th position of the 4x4 diagonal scan (each anti-diagonal walked with x rising)
++ 0, 0, 1, 0,
++ 1, 2, 0, 1,
++ 2, 3, 1, 2,
++ 3, 2, 3, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { // y coordinate of the n-th position of the 4x4 diagonal scan (each anti-diagonal walked with y falling)
++ 0, 1, 0, 2,
++ 1, 0, 3, 2,
++ 1, 0, 3, 2,
++ 1, 3, 2, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { // x coordinate of the n-th position of the 8x8 diagonal scan (each anti-diagonal walked with x rising)
++ 0, 0, 1, 0,
++ 1, 2, 0, 1,
++ 2, 3, 0, 1,
++ 2, 3, 4, 0,
++ 1, 2, 3, 4,
++ 5, 0, 1, 2,
++ 3, 4, 5, 6,
++ 0, 1, 2, 3,
++ 4, 5, 6, 7,
++ 1, 2, 3, 4,
++ 5, 6, 7, 2,
++ 3, 4, 5, 6,
++ 7, 3, 4, 5,
++ 6, 7, 4, 5,
++ 6, 7, 5, 6,
++ 7, 6, 7, 7,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { // y coordinate of the n-th position of the 8x8 diagonal scan (each anti-diagonal walked with y falling)
++ 0, 1, 0, 2,
++ 1, 0, 3, 2,
++ 1, 0, 4, 3,
++ 2, 1, 0, 5,
++ 4, 3, 2, 1,
++ 0, 6, 5, 4,
++ 3, 2, 1, 0,
++ 7, 6, 5, 4,
++ 3, 2, 1, 0,
++ 7, 6, 5, 4,
++ 3, 2, 1, 7,
++ 6, 5, 4, 3,
++ 2, 7, 6, 5,
++ 4, 3, 7, 6,
++ 5, 4, 7, 6,
++ 5, 7, 6, 7,
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.h
+@@ -0,0 +1,31 @@
++/*
++ * HEVC shared data tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_DATA_H
++#define AVCODEC_RPI_HEVC_DATA_H
++
++#include <stdint.h>
++
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
++
++#endif /* AVCODEC_RPI_HEVC_DATA_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_filter.c
+@@ -0,0 +1,1210 @@
++/*
++ * HEVC video decoder
++ *
++ * Originally by:
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Seppo Tomperi
++ * Copyright (C) 2013 Wassim Hamidouche
++ *
++ * Substantially rewritten:
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++//#define DISABLE_SAO
++//#define DISABLE_DEBLOCK
++//#define DISABLE_STRENGTHS
++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
++//#define DISABLE_DEBLOCK_NONREF
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++
++#include "rpi_qpu.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#define LUMA 0
++#define CB 1
++#define CR 2
++
++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
++// so -12,65 overall
++static const uint8_t tctablex[] = {
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18
++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37
++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53
++ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..65
++};
++#define tctable (tctablex + 12 + 6*8) // skips the 6*8 padding entries and the 12 entries for -12..-1, so tctable[0] is QP 0
++
++static const uint8_t betatablex[] = {
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18
++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..63
++};
++#define betatable (betatablex + 12 + 6*8) // skips the 6*8 padding entries and the 12 entries for -12..-1, so betatable[0] is QP 0
++
++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
++ const int c_idx, const int tc_offset)
++{ // Maps qp_y through the PPS per-component table qp_dblk_x, then looks up tc; the constant +2 presumably is the (bs-1)*2 term for bs == 2 - confirm
++ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2];
++}
++
++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int xBase, const unsigned int yBase)
++{ // Returns the rounded mean of a "left" and an "above" QP for the quantisation group containing (xBase, yBase)
++ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1;
++ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; // clears the low bits, i.e. rounds down to the group origin
++ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask;
++ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask;
++ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
++ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; // group origin in min-CB units, used to index qp_y_tab
++ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size;
++ const int qPy_pred = lc->qPy_pred;
++ // When the group origin sits on the CTB's left (resp. top) boundary lc->qPy_pred is used instead of the stored neighbour QP
++ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
++ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
++ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
++ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
++}
++
++// * Only called from bitstream decode in foreground
++// so should be safe
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
++{ // Sets lc->qp_y from the predicted QP for (xBase, yBase) plus any pending lc->tu.cu_qp_delta
++ const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
++
++ if (lc->tu.cu_qp_delta != 0) {
++ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
++ int off = s->ps.sps->qp_bd_offset;
++ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, // FFUMOD is defined elsewhere; presumably an unsigned modulo, giving a result in [-off, 51] - confirm
++ 52 + off) - off;
++ } else
++ lc->qp_y = qp_y; // no delta: the prediction is used unchanged
++}
++
++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
++{ // log2 bytes per element: chroma (c_idx != 0) gets one more than luma, presumably because U & V are stored interleaved - confirm
++ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
++}
++
++// "DSP" these?
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
++{ // Copies one element of (1 << pixel_shift) bytes from src to dst
++ switch (pixel_shift)
++ {
++ case 2: // 4-byte element
++ *(uint32_t *)dst = *(uint32_t *)src;
++ break;
++ case 1: // 2-byte element
++ *(uint16_t *)dst = *(uint16_t *)src;
++ break;
++ default: // any other value is treated as a single byte
++ *dst = *src;
++ break;
++ }
++}
++
++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
++ ptrdiff_t stride_src, int x, int y, int width, int height,
++ int c_idx, int x_ctb, int y_ctb)
++{ // Saves the four border lines of the width x height block at src into the SAO edge buffers for plane c_idx
++ const unsigned int sh = pixel_shift(s, c_idx);
++ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); // plane width, used as the row pitch of sao_pixel_buffer_h
++ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); // plane height, used as the column pitch of sao_pixel_buffer_v
++
++ /* copy horizontal edges */
++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), // top row goes to buffer row 2 * y_ctb
++ src, width << sh);
++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), // bottom row goes to buffer row 2 * y_ctb + 1
++ src + stride_src * (height - 1), width << sh);
++
++ /* copy vertical edges */
++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
++ // left column above goes to buffer column 2 * x_ctb, right column below to 2 * x_ctb + 1
++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
++}
++
++// N.B. Src & dst are swapped as this is a restore!
++// x0 & y0 are in luma coords
++// Width & height are in Y/C pels as appropriate
++// * Clear scope for optimisation here but not used enough to be worth it
++static void restore_tqb_pixels(const HEVCRpiContext * const s,
++ uint8_t *src1, const uint8_t *dst1,
++ const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int width, const int height,
++ const int c_idx)
++{ // Copies blocks flagged in s->is_pcm from dst1 back over src1; does nothing unless one of the two flags below is set
++ if (s->ps.pps->transquant_bypass_enable_flag ||
++ s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; // bitmap: one bit per 8x8 luma block, so one byte per 64 luma pels
++ int blks_y = height >> (c_idx == 0 ? 3 : 2); // number of block rows to process
++ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand
++ const unsigned int bheight = (c_idx == 0) ? 8 : 4;
++ const unsigned int sh = ((x0 >> 3) & 7); // bit position of the first block within *pcm
++ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1; // one bit per block across the width
++
++ do {
++ unsigned int m = (*pcm >> sh) & mask; // flags for this row of blocks, lowest bit = leftmost block
++ uint8_t * bd = src1;
++ const uint8_t * bs = dst1;
++ while (m != 0) {
++ if ((m & 1) != 0) {
++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); // flagged block: copy bs (dst1 side) over bd (src1 side)
++ }
++ m >>= 1;
++ bs += bwidth;
++ bd += bwidth;
++ }
++ src1 += stride_src * bheight;
++ dst1 += stride_dst * bheight;
++ pcm += s->ps.sps->pcm_width; // next bitmap row
++ } while (--blks_y > 0);
++ }
++}
++
++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
++
++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
++{
++#if SAO_FILTER_N == 5
++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#elif SAO_FILTER_N == 6
++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#else
++#error Confused by size of sao fn array
++#endif
++ int c_idx;
++ int edges[4]; // 0 left 1 top 2 right 3 bottom
++ int x_ctb = x >> s->ps.sps->log2_ctb_size;
++ int y_ctb = y >> s->ps.sps->log2_ctb_size;
++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
++ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb);
++ // flags indicating unfilterable edges
++ uint8_t vert_edge[] = { 0, 0 };
++ uint8_t horiz_edge[] = { 0, 0 };
++ uint8_t diag_edge[] = { 0, 0, 0, 0 };
++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb);
++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
++ !s->ps.pps->loop_filter_across_tiles_enabled_flag;
++ uint8_t restore = no_tile_filter || !lfase;
++ uint8_t left_tile_edge = 0;
++ uint8_t right_tile_edge = 0;
++ uint8_t up_tile_edge = 0;
++ uint8_t bottom_tile_edge = 0;
++ const int sliced = 1;
++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
++
++ edges[0] = x_ctb == 0;
++ edges[1] = y_ctb == 0;
++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
++
++#ifdef DISABLE_SAO
++ return;
++#endif
++
++ if (restore) {
++ if (!edges[0]) {
++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
++ }
++ if (!edges[2]) {
++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
++ }
++ if (!edges[1]) {
++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
++ }
++ if (!edges[3]) {
++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
++ }
++ if (!edges[0] && !edges[1]) {
++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
++ }
++ if (!edges[1] && !edges[2]) {
++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
++ }
++ if (!edges[2] && !edges[3]) {
++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
++ }
++ if (!edges[0] && !edges[3]) {
++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
++ }
++ }
++
++ for (c_idx = 0; c_idx < plane_count; c_idx++) {
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const int x0 = x >> hshift;
++ const int y0 = y >> vshift;
++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
++ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
++ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
++ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0);
++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
++ ptrdiff_t stride_dst;
++ uint8_t *dst;
++
++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
++ uint8_t * const src = !sliced ?
++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
++ !sliced ? src - (1 << sh) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
++ !sliced ? src + (width << sh) :
++ c_idx == 0 ?
++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
++
++ if (sliced && c_idx > 1) {
++ break;
++ }
++
++// if (c_idx == 1)
++// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
++
++ switch (sao->type_idx[c_idx]) {
++ case SAO_BAND:
++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++ x_ctb, y_ctb);
++ if (s->ps.pps->transquant_bypass_enable_flag ||
++ s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ // Can't use the edge buffer here as it may be in use by the foreground
++ DECLARE_ALIGNED(64, uint8_t, dstbuf)
++ [2*MAX_PB_SIZE*MAX_PB_SIZE];
++ dst = dstbuf;
++ stride_dst = 2*MAX_PB_SIZE;
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
++ if (sliced && c_idx != 0)
++ {
++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
++ sao->offset_val[1], sao->band_position[1],
++ sao->offset_val[2], sao->band_position[2],
++ width, height);
++ }
++ else
++ {
++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++ sao->offset_val[c_idx], sao->band_position[c_idx],
++ width, height);
++ }
++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++ x, y, width, height, c_idx);
++ } else {
++ if (sliced && c_idx != 0)
++ {
++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
++ sao->offset_val[1], sao->band_position[1],
++ sao->offset_val[2], sao->band_position[2],
++ width, height);
++ }
++ else
++ {
++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++ sao->offset_val[c_idx], sao->band_position[c_idx],
++ width, height);
++ }
++ }
++ sao->type_idx[c_idx] = SAO_APPLIED;
++ break;
++ case SAO_EDGE:
++ {
++ const int w = s->ps.sps->width >> hshift;
++ const int h = s->ps.sps->height >> vshift;
++ int top_edge = edges[1];
++ int bottom_edge = edges[3];
++ // Can't use the edge buffer here as it may be in use by the foreground
++ DECLARE_ALIGNED(64, uint8_t, dstbuf)
++ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
++
++ stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
++ dst = dstbuf + stride_dst + 32;
++
++ if (!top_edge) {
++ uint8_t *dst1;
++ int src_idx;
++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++
++ dst1 = dst - stride_dst;
++
++ if (src_l != NULL) {
++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
++ }
++
++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
++
++ if (src_r != NULL) {
++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
++ }
++ }
++ if (!bottom_edge) {
++ uint8_t * const dst1 = dst + height * stride_dst;
++ int src_idx;
++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
++ const unsigned int hoff = height * stride_src;
++
++ if (src_l != NULL) {
++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
++ }
++
++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
++
++ if (src_r != NULL) {
++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
++ SAO_APPLIED);
++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
++ }
++ }
++ if (src_l != NULL) {
++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++ ff_hevc_rpi_copy_vert(dst - (1 << sh),
++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
++ sh, height, stride_dst, 1 << sh);
++ } else {
++ ff_hevc_rpi_copy_vert(dst - (1 << sh),
++ src_l,
++ sh, height, stride_dst, stride_src);
++ }
++ }
++ if (src_r != NULL) {
++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++ ff_hevc_rpi_copy_vert(dst + (width << sh),
++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
++ sh, height, stride_dst, 1 << sh);
++ } else {
++ ff_hevc_rpi_copy_vert(dst + (width << sh),
++ src_r,
++ sh, height, stride_dst, stride_src);
++ }
++ }
++
++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
++
++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++ x_ctb, y_ctb);
++ if (sliced && c_idx != 0)
++ {
++ // Class always the same for both U & V (which is just as well :-))
++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
++ width, height);
++ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
++ stride_src, stride_dst,
++ sao,
++ edges, width,
++ height, c_idx,
++ vert_edge,
++ horiz_edge,
++ diag_edge);
++ }
++ else
++ {
++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++ sao->eo_class[c_idx], width, height);
++ s->hevcdsp.sao_edge_restore[restore](src, dst,
++ stride_src, stride_dst,
++ sao,
++ edges, width,
++ height, c_idx,
++ vert_edge,
++ horiz_edge,
++ diag_edge);
++ }
++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++ x, y, width, height, c_idx);
++ sao->type_idx[c_idx] = SAO_APPLIED;
++ break;
++ }
++ }
++ }
++
++#if RPI_ZC_SAND_8_IN_10_BUF
++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
++ {
++ const unsigned int stride1 = frame_stride1(s->frame, 1);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
++ const unsigned int xoff = (x >> 8) * stride2 * stride1;
++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
++
++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
++ }
++#endif
++}
++
++// When bits are delivered to deblock we want them
++//#define TL 1
++//#define TR 2
++//#define BL 4
++//#define BR 8
++
++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
++// so we need to rearrange before passing on
++
++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{ // Reads is_pcm bits for two bitmap rows: row of y in bits 0.., next row in bits 16.., both shifted so bit 0 is the block at x
++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; // one bit per 8x8 block, one byte per 64 pels
++ return (pcm[0] |
++ (pcm[1] << 8) |
++ (pcm[s->ps.sps->pcm_width] << 16) |
++ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
++}
++
++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{ // Single-row form of pcm4: is_pcm bits for the bitmap row of y, shifted so bit 0 is the block at x
++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
++ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
++}
++
++// We cast away const here as we want this to work for both get and set
++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{ // Address of the 32-bit word of the strength array 'bs' that covers (x, y)
++ return (uint32_t *)(bs +
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#warning Unexpected masks
++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + // row offset within a stride1-wide column
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); // offset of the column, stride2 bytes apart
++}
++
++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{ // Byte-granular counterpart of bs_ptr32: address of the byte of 'bs' that covers (x, y); also casts away const
++ return (uint8_t *)(bs +
++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & // byte within the stride1-wide row
++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + // row offset within a stride1-wide column
++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); // offset of the column, stride2 bytes apart
++}
++
++
++// Get block strength
++// Given how we call we will always get within the 32bit boundaries
++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
++ unsigned int xl, unsigned int xr, const unsigned int y)
++{
++ if (xr <= xl) {
++ return 0;
++ }
++ else
++ {
++#if HAVE_ARMV6T2_INLINE
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#error This case not yet handled in bs_get32
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++ uint32_t tmp;
++ __asm__ (
++ "lsr %[tmp], %[xl], %[xl_shift] \n\t"
++ "rsb %[xr], %[xl], %[xr] \n\t"
++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t"
++ "add %[xr], %[xr], #7 \n\t"
++ "lsr %[bs], %[y], %[y_shift1] \n\t"
++ "bic %[xr], %[xr], #7 \n\t"
++ "ubfx %[xl], %[xl], #1, #5 \n\t"
++ "lsr %[xr], %[xr], #1 \n\t"
++ "cmp %[xr], #32 \n\t"
++ "mvn %[tmp], #0 \n\t"
++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
++ "lsl %[tmp], %[tmp], %[xr] \n\t"
++ "lsr %[xl], %[bs], %[xl] \n\t"
++ "it ne \n\t"
++ "bicne %[bs], %[xl], %[tmp] \n\t"
++ : // Outputs
++ [bs]"+r"(bs),
++ [stride2]"+r"(stride2),
++ [xl]"+r"(xl),
++ [xr]"+r"(xr),
++ [tmp]"=&r"(tmp)
++ : // Inputs
++ [y]"r"(y),
++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++ : // Clobbers
++ "cc"
++ );
++ return (uint32_t) bs;
++#else
++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
++
++ return n == 32 ? a :
++ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
++#endif
++ }
++}
++
++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{ // bs_get32 on the horizontal-edge strength array for x in [xl, xr) on row y
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); // xl and xr - 1 must fall in the same CTB column
++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
++}
++
++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{ // bs_get32 on the vertical-edge strength array for x in [xl, xr) on row y
++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); // xl and xr - 1 must fall in the same CTB column
++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
++}
++
++
++static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++{ // Luma deblock of the area 'bounds'; when end_x / end_y is zero a margin on the right / bottom is held back (see cb_r, b_b)
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); // right limit of the CTB loop
++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
++ const DBParams * cb_dbp = s->deblock + ctb_n; // deblock params of the first CTB; advanced once per loop iteration
++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); // bottom limit: last 8 rows are left out unless end_y
++
++ unsigned int cb_x;
++
++ // Do in CTB-shaped blocks
++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
++ {
++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++ const unsigned int bv_l = FFMAX(cb_x, 8); // vertical edges start at x >= 8, so the edge at x == 0 is never filtered
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
++ const unsigned int bh_l = bv_l - 8; // horizontal edges start 8 pels left of the first vertical edge
++ unsigned int y;
++
++ // Main body
++ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) // starts 8 rows above bounds.y except at the top of the picture
++ {
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); // 4 strength bits per 8 pels (consumed below with vbs >>= 4 per x += 8)
++
++ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; // the row above bounds.y uses the params of the CTB above
++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row just above the horizontal edge at y
++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row containing y
++
++ if (vbs != 0)
++ {
++ const uint8_t * const tcv = tctable + dbp->tc_offset;
++ const uint8_t * const betav = betatable + dbp->beta_offset;
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); // bit 0 = block left of the edge, bit 1 = block right of it
++ unsigned int x;
++
++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
++ {
++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) // skip when no strength is set or both sides are flagged in is_pcm
++ {
++ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; // rounded mean of the QPs either side of the edge
++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betav[qp],
++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | // tc from strength bits 0-1 in the low 16 bits
++ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), // tc from strength bits 2-3 in the high 16 bits
++ pcmfa & 3,
++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
++ }
++ }
++ }
++
++ if (y != 0) // no horizontal edge is filtered at the top row of the picture
++ {
++ uint32_t hbs;
++
++ // H left - mostly separated out so we only need a uint32_t hbs
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
++ {
++ const unsigned int x = bh_l;
++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); // bit 0 = block above the edge, bit 16 = block below it
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const DBParams * const dbph = dbp - 1; // this 8-pel strip belongs to the CTB to the left
++ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
++
++ av_assert2(cb_x - bh_l == 8);
++
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbph->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); // repack above/below flags into bits 0 and 1
++ }
++
++ // H
++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
++
++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
++ {
++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) // skip when no strength is set or both above and below are flagged in is_pcm
++ {
++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++ frame_stride1(s->frame, LUMA),
++ betatable[qp + dbp->beta_offset],
++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++ }
++ }
++ }
++ }
++
++ }
++ }
++}
++// Rounded mean, (a + b + 1) >> 1, of the qp_y_tab entries of the min-CBs holding pels x - 1 and x on row y
++static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++ const unsigned int cb_shift = s->ps.sps->log2_min_cb_size; // pel -> min-CB index shift
++ const int8_t * const qp_row = &s->qp_y_tab[(y >> cb_shift) * s->ps.sps->min_cb_width]; // start of the min-CB row containing y
++ return (qp_row[x >> cb_shift] + qp_row[(x - 1) >> cb_shift] + 1) >> 1;
++}
++// Chroma deblock of 'bounds', one CTB-wide column at a time: vertical edges (incl. the 8 rows above bounds) first, then horizontal edges
++static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++{
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); // Right limit: stops 8 pels short unless end_x is set
++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; // Raster CTB index of bounds' top-left (x, y are CTB aligned - asserted below)
++ const DBParams * dbp = s->deblock + ctb_n;
++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); // Bottom limit: stops 8 rows short unless end_y is set
++ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; // Luma QP -> tc table index, U
++ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; // Luma QP -> tc table index, V
++
++ unsigned int cb_x;
++
++ av_assert1((bounds.x & (ctb_size - 1)) == 0);
++ av_assert1((bounds.y & (ctb_size - 1)) == 0);
++ av_assert1(bounds.h <= ctb_size);
++
++ // Do in CTB-shaped blocks
++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++ const unsigned int bv_l = FFMAX(cb_x, 16); // Vertical edges are never taken from left of x = 16
++ unsigned int y;
++
++ // V above
++ if (bounds.y != 0) {
++ // Deblock V up 8
++ // CTB above current
++ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
++ const unsigned int y = bounds.y - 8;
++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; // Keep only bit 1 of each byte
++
++ if (vbs != 0)
++ {
++ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; // tc_offset of the CTB one row up
++ unsigned int x;
++
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) // One vbs byte and two pcmfa bits per 16 luma pels
++ {
++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) // Skip when both low pcmfa bits are set
++ {
++ const int qp0 = q2h(s, x, y);
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ pcmfa & 3);
++ }
++ }
++ }
++ }
++
++ for (y = bounds.y; y < b_b; y += 16)
++ {
++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); // Flag for rows y + 8.. lands in bit 5 of each byte (omitted if past b_b)
++
++ // V
++ if (vbs != 0)
++ {
++ unsigned int x;
++ unsigned int pcmfa =
++ (y + 16 > b_b ?
++ pcm2(s, bv_l - 1, y) | 0xffff0000 :
++ pcm4(s, bv_l - 1, y)); // Only 8 rows left: upper 16 flag bits are forced to 1
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
++ {
++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) // Skip when all four pcmfa bits for this position are set
++ {
++ const int qp0 = q2h(s, x, y);
++ const int qp1 = q2h(s, x, y + 8);
++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), // tc packed as bytes: U0, V0, U1, V1 (0 = no filter)
++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); // Gather bits 0-1 and 16-17 into a 4-bit value
++ }
++ }
++ }
++
++ // H
++ if (y != 0)
++ {
++ uint32_t hbs;
++ const unsigned int bh_l = bv_l - 16;
++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; // Last column of bounds runs to cb_r, others stop 16 short of the next CTB
++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row above the edge
++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row below the edge
++
++ // H left - mostly separated out so we only need a uint32_t hbs
++ // Stub is width 8 to the left of bounds, but width 16 internally
++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
++ {
++ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++
++ // Chop off bits we don't want...
++ if (bh_l < bounds.x) {
++ pcmfa |= 0x10001; // TL|BL pre rearrangement
++ hbs &= ~3; // Make BS 0
++ }
++
++ // Double check we still want this
++ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const unsigned int x = bh_l;
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; // tc_offset of the CTB to the left
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++
++ // H main
++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++ {
++ unsigned int x;
++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
++
++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++ {
++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++ {
++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++ frame_stride1(s->frame, 1),
++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++ }
++ }
++ }
++ }
++ }
++ }
++}
++// Offset of x inside its 2^log2_n aligned block; the result is 0 exactly when x lies on such a boundary
++static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
++{
++ return x & ((1U << log2_n) - 1U); // keep just the low log2_n bits of x
++}
++// ORs the masked boundary-strength flags for the horizontal edge at (x, y) into the bs_horizontal store
++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++ const uint32_t flags = bsf & mask; // discard anything outside the caller's mask
++
++ av_assert2((y & 7) == 0);
++ // This doesn't have the same simultaneous update issues that bsf_stash does
++ // (other threads will have a different y) so a plain read-modify-write is enough
++ if (flags != 0)
++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= flags << ((x >> 1) & 31);
++}
++
++// ORs the masked boundary-strength flags for the vertical edge starting at (x, y) into the bs_vertical store
++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++ // We arrange this in a slightly odd fashion but it lines up with
++ // how we are going to use it in the actual deblock code & it is easier
++ // to do the contortions here than there
++ //
++ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},...
++
++ av_assert2((x & 7) == 0);
++
++ if ((bsf &= mask) != 0) // Nothing to store if no flag survives the mask
++ {
++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
++ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; // Bit position in the byte: 4 if x is an odd multiple of 8, plus 2 if y is an odd multiple of 4
++
++ if (mask <= 0xf) // Everything fits in a single byte
++ {
++ *p |= (bsf << sh);
++ }
++ else
++ {
++ do { // 4 flag bits per byte; the next 4 go HEVC_RPI_BS_STRIDE1_BYTES further on
++ *p |= (bsf & 0xf) << sh;
++ p += HEVC_RPI_BS_STRIDE1_BYTES;
++ } while ((bsf >>= 4) != 0);
++ }
++ }
++}
++// Thin wrapper round the hevcdsp hevc_deblocking_boundary_strengths routine; returns its result unchanged
++static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
++ const unsigned int rep, const unsigned int dup,
++ const unsigned int mvf_stride0, // Stride for the mvf_p side, in HEVCRpiMvField elements
++ const unsigned int mvf_stride1, // Stride for the mvf_q side, in HEVCRpiMvField elements
++ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
++ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
++{
++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
++ mvf_p, mvf_q,
++ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, // L0 and L1 lists of each side
++ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); // Strides converted from elements to bytes
++}
++
++// Works out the bs flags for the top & left edges (and any internal PU split line) of the block at (x0, y0) and stores them with hbs_set / vbs_set
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size,
++ const int is_coded_block)
++{
++ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
++ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
++ const RefPicList * const rpl = s->refPicList;
++ // Rep count for bsf_mv when running with min_pu chunks
++ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
++ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
++ const unsigned int trafo_size = (1U << log2_trafo_size);
++ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; // trafo_size / 2 low bits (2 bits per 4 pels); all 32 for size 64
++ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); // The even (b0) bit of every 2-bit entry
++
++ // Do we cover a pred split line?
++ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
++ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
++
++ uint32_t bsf_h;
++ uint32_t bsf_v;
++
++#ifdef DISABLE_STRENGTHS
++ return;
++#endif
++
++ // We are always on a size boundary
++ av_assert2((x0 & (trafo_size - 1)) == 0);
++ av_assert2((y0 & (trafo_size - 1)) == 0);
++ // log2_trafo_size not really a transform size; we can have to deal
++ // with size 2^6 blocks
++ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
++
++ // Retrieve and update coded (b0), intra (b1) bs flags
++ //
++ // Store on min width (rather than uint32_t) to avoid possible issues
++ // with another thread on another core running wpp using the same
++ // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
++ //
++ // In bsf BS=2 is represented by 3 as it is much easier to test & set
++ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
++ // 3 will work the same
++ {
++ // Given where we are called from is_cbf_luma & is_intra will be constant over the block
++ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0; // Intra: both bits of every entry; coded: b0 only
++ uint8_t *const p = s->bsf_stash_up + (x0 >> 4); // One stash byte per 16 pels
++ uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
++
++ switch (log2_trafo_size)
++ {
++ case 2:
++ case 3:
++ {
++ const unsigned int sh_h = (x0 >> 1) & 7; // Bit offset of this block inside its stash byte
++ const unsigned int sh_v = (y0 >> 1) & 7;
++ bsf_h = *p;
++ bsf_v = *q;
++ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); // Replace only our bits, keep the neighbours'
++ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
++ bsf_h >>= sh_h;
++ bsf_v >>= sh_v;
++ break;
++ }
++ case 4:
++ bsf_h = *p;
++ bsf_v = *q;
++ *p = bsf0;
++ *q = bsf0;
++ break;
++ case 5:
++ bsf_h = *(uint16_t *)p; // NOTE(review): uint8_t stash accessed via a wider pointer - assumes alignment/aliasing is OK here; confirm
++ bsf_v = *(uint16_t *)q;
++ *(uint16_t *)p = bsf0;
++ *(uint16_t *)q = bsf0;
++ break;
++ case 6:
++ default:
++ bsf_h = *(uint32_t *)p; // NOTE(review): same widened access as case 5, 32 bits this time
++ bsf_v = *(uint32_t *)q;
++ *(uint32_t *)p = bsf0;
++ *(uint32_t *)q = bsf0;
++ break;
++ }
++
++ bsf_h |= bsf0; // Edge flags = previously stashed flags OR this block's own
++ bsf_v |= bsf0;
++ }
++
++ // Do Horizontal
++ if ((y0 & 7) == 0)
++ {
++ // Boundary upper
++ if (y0 != 0 &&
++ (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
++ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
++ {
++ // Look at MVs (BS=1) if we don't already have a full set of bs bits
++ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
++ {
++ // If we aren't on the top boundary we must be in the middle
++ // and in that case we know where mvf can change
++ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
++ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
++ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
++ rpl; // On a CTB top edge use the stored rpl_up entry, otherwise our own lists
++
++ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ rpl, rpl_top,
++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
++ }
++
++ // Finally put the results into bs
++ hbs_set(s, x0, y0, bsf_mask, bsf_h);
++ }
++
++ // Max of 1 pu internal split - ignore if not on 8pel boundary
++ if (has_y_split && !off_boundary(lc->cu.y_split, 3))
++ {
++ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
++ // If we have the x split as well then it must be in the middle
++ const unsigned int log2_rep = has_x_split ? 1 : 0;
++
++ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ trafo_size >> (log2_min_pu_size + log2_rep),
++ rpl, rpl,
++ mvf, mvf - MVF_STASH_WIDTH_PU)); // Compared against the stash row directly above
++ }
++ }
++
++ // And again for vertical - same logic as horizontal just in the other direction
++ if ((x0 & 7) == 0)
++ {
++ // Boundary left
++ if (x0 != 0 &&
++ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
++ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
++ {
++ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
++ {
++ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
++ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
++ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
++ rpl; // On a CTB left edge use the stored rpl_left entry, otherwise our own lists
++
++ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ rpl, rpl_left,
++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
++ }
++
++ vbs_set(s, x0, y0, bsf_mask, bsf_v);
++ }
++
++ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
++ {
++ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
++ const unsigned int log2_rep = has_y_split ? 1 : 0;
++
++ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++ rpl, rpl,
++ mvf, mvf - 1)); // Compared against the stash entry directly to the left
++ }
++ }
++}
++
++#undef LUMA
++#undef CB
++#undef CR
++// Saturating unsigned subtract: a - b, except that the result is 0 (no wrap-around) whenever b exceeds a
++static inline unsigned int ussub(const unsigned int a, const unsigned int b)
++{
++ return a >= b ? a - b : 0;
++}
++// True when x, shifted right by the frame's xshl, is below 64. NOTE(review): the name suggests an alignment test ("& 63") - confirm intent
++static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
++{
++ return (x >> av_rpi_sand_frame_xshl(frame)) < 64U; // i.e. no bit at or above bit 6 is set
++}
++// Deblocks 'bounds', runs SAO on the CTBs that are now final and flushes them from the cache. Returns 0, the flushed bottom row, or INT_MAX (see end)
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
++{
++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ int x, y;
++
++ const unsigned int br = bounds.x + bounds.w; // Right edge of bounds (exclusive)
++ const unsigned int bb = bounds.y + bounds.h; // Bottom edge of bounds (exclusive)
++
++ const int x_end = (br >= s->ps.sps->width); // bounds reaches the right picture edge
++ const int y_end = (bb >= s->ps.sps->height); // bounds reaches the bottom picture edge
++
++ // Deblock may not touch the edges of the bound as they are still needed
++ // for Intra pred
++ //
++ // Deblock is disabled with a per-slice flag
++ // Given that bounds may cover multiple slices & we deblock outside bounds
++ // anyway we can't avoid deblock using that flag - about the only thing we
++ // could do is have a "no deblock seen yet" flag but it doesn't really
++ // seem worth the effort
++
++ deblock_y_blk(s, bounds, x_end, y_end);
++ deblock_uv_blk(s, bounds, x_end, y_end);
++
++ // SAO needs
++ // (a) CTB alignment
++ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
++ {
++ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); // Distance back from bounds.x to the CTB start containing bounds.x - 16
++ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
++ const unsigned int yt = ussub(bounds.y, yo); // SAO region = bounds moved up/left by yo/xo, clamped at 0 ...
++ const unsigned int yb = y_end ? bb : ussub(bb, yo); // ... but running to the real edge at the picture bottom
++ const unsigned int xl = ussub(bounds.x, xo);
++ const unsigned int xr = x_end ? br : ussub(br, xo); // ... and at the picture right
++
++ if (s->ps.sps->sao_enabled)
++ {
++ for (y = yt; y < yb; y += ctb_size) {
++ for (x = xl; x < xr; x += ctb_size) {
++ sao_filter_CTB(s, x, y);
++ }
++ }
++ }
++
++ // Cache invalidate
++ y = 0; // From here on y is the return value
++ if (xr != 0 && yb != 0)
++ {
++ const unsigned int llen =
++ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
++ const unsigned int mask = ~(llen - 1); // Rounds down to a multiple of llen (assumes llen is a power of 2 - TODO confirm)
++ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
++ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
++ const unsigned int it = ussub(yt, 1);
++ const unsigned int ib = y_end ? bb : yb - 1;
++
++ if (il < ir) {
++ rpi_cache_buf_t cbuf;
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ il, it, ir - il, ib - it,
++ ctx_vshift(s, 1), 1, 1);
++
++ // If we have to commit the right hand tile boundary due to
++ // cache boundary considerations then at EoTile we must commit
++ // that boundary to bottom of tile (bounds)
++ if (ib != bb && ir == br && eot) {
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ br - 1, ib, 1, bb - ib,
++ ctx_vshift(s, 1), 1, 1);
++ }
++
++ rpi_cache_flush_finish(rfe);
++
++ if (x_end)
++ y = y_end ? INT_MAX : ib; // Only a block at the right picture edge reports a row: ib, or INT_MAX at the picture bottom
++
++// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
++ }
++ }
++ }
++
++ return y;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mv.h
+@@ -0,0 +1,71 @@
++#ifndef AVCODEC_RPI_HEVC_MV_H
++#define AVCODEC_RPI_HEVC_MV_H
++
++#include "config.h"
++// A motion vector packed into 32 bits: x in the low 16 bits, y in the high 16 (see MV_X / MV_Y / MV_XY)
++typedef int32_t MvXY;
++// Motion data of one prediction block
++typedef struct HEVCRpiMvField {
++ MvXY xy[2]; // Packed MV per reference list: [0] = L0, [1] = L1
++ int8_t ref_idx[2]; // Reference index per list, used to index RefPicList.list[]
++ int8_t pred_flag; // PF_INTRA, or a combination of PF_L0 / PF_L1 (PF_BI)
++ int8_t dummy; // To 12 bytes
++} HEVCRpiMvField;
++// MvXY packing helpers: x is kept in the low 16 bits, y in the high 16 bits.
++// Every macro argument is parenthesised so that any expression (a | b, c ? a : b, ...) can be passed safely.
++#define MV_X(xy) (((xy) << 16) >> 16)
++#define MV_Y(xy) ((xy) >> 16)
++#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_mv_arm.h"
++#endif
++// Generic fallback, used only when nothing earlier (e.g. the ARM header included above) has #defined mvxy_add
++#ifndef mvxy_add
++static inline MvXY mvxy_add(const MvXY a, const MvXY b) // Component-wise sum of two packed MVs
++{
++ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); // The x sum is truncated to 16 bits by MV_XY's 0xffff mask
++}
++#endif
++
++// Scales both components of the packed MV 'src' by (approximately) tb / td; generic fallback used when mv_scale_xy is not already #defined
++#ifndef mv_scale_xy
++static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
++{
++ int tx, scale_factor;
++ // td == 0 is replaced by 1 so that the division below can never divide by zero
++ td = td == 0 ? 1 : av_clip_int8(td);
++ tb = av_clip_int8(tb);
++ tx = (0x4000 + (abs(td) >> 1)) / td;
++ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
++ return MV_XY(
++ av_clip_int16((scale_factor * MV_X(src) + 127 +
++ (scale_factor * MV_X(src) < 0)) >> 8), // "+ (product < 0)" adds 1 when the product is negative
++ av_clip_int16((scale_factor * MV_Y(src) + 127 +
++ (scale_factor * MV_Y(src) < 0)) >> 8));
++}
++#endif
++
++// 8.3.1 states that the bitstream may not contain poc diffs that do not
++// fit in 16 bits, so given that we don't care about the high bits we only
++// store the low 16 + LT & Inter flags
++
++#define COL_POC_INTRA 0 // No flag bits set: entry holds no inter MV
++#define COL_POC_INTER (1 << 16) // Flag: entry holds an inter MV
++#define COL_POC_LT (1 << 17) // Flag: the reference is long-term
++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) // POC difference truncated to signed 16 bits
++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
++
++typedef struct ColMv_s {
++ int32_t poc; // COL_POC_INTRA, or a value built by COL_POC_MAKE_INTER
++ int32_t xy; // Packed motion vector (handed to mv_scale_xy / stored as MvXY by the users)
++} ColMv;
++
++typedef struct ColMvField_s {
++ ColMv L[2]; // One entry per reference list: [0] = L0, [1] = L1
++} ColMvField;
++
++
++
++#endif // AVCODEC_RPI_HEVC_MV_H
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mvs.c
+@@ -0,0 +1,487 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Anand Meher Kotra
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++// Non-zero when (xN, yN) and (xP, yP) lie in the same 2^plevel x 2^plevel aligned square
++static av_always_inline int
++is_eq_mer(const unsigned int plevel,
++ const unsigned int xN, const unsigned int yN,
++ const unsigned int xP, const unsigned int yP)
++{
++ return (xN >> plevel) == (xP >> plevel) && (yN >> plevel) == (yP >> plevel);
++}
++
++// Check if the mv's and refidx are the same between A and B: returns 1 when both have the same pred_flag
++// and, for every list that flag selects (PF_L0 / PF_L1), the same ref_idx and packed MV; otherwise 0
++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++{
++ return a->pred_flag == b->pred_flag &&
++ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
++ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
++}
++
++/*
++ * 8.5.3.1.7 temporal luma motion vector prediction - writes the candidate for list X to *mvLXCol (0 if none); returns non-zero if one was found
++ */
++static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int nPbW, const int nPbH, const int refIdxLx,
++ MvXY * const mvLXCol, const int X)
++{
++ int x, y;
++ const ColMv * cmv = NULL; // Chosen collocated MV; NULL while none has been found
++
++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
++ const RefPicList * const refPicList = s->refPicList + X;
++ const int cur_lt = refPicList->isLongTerm[refIdxLx];
++
++ *mvLXCol = 0;
++ // Unlikely but we might have a col_ref IDR frame!
++ if (col_ref->col_mvf == NULL)
++ return 0;
++
++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); // presumably blocks until col_ref's MVs down to this row are ready - confirm
++
++ //bottom right collocated motion vector
++ x = x0 + nPbW;
++ y = y0 + nPbH;
++
++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
++ y < s->ps.sps->height &&
++ x < s->ps.sps->width) // Only used if still in the same CTB row and inside the picture
++ {
++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++ (y >> 4) * s->col_mvf_stride; // col_mvf holds one entry per 16x16 pels
++
++ if (col->L[0].poc != COL_POC_INTRA &&
++ (col->L[1].poc == COL_POC_INTRA ||
++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) // Both lists usable: this condition picks between them
++ {
++ cmv = col->L + 0;
++ }
++ else if (col->L[1].poc != COL_POC_INTRA)
++ {
++ cmv = col->L + 1;
++ }
++ }
++
++ // derive center collocated motion vector
++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) // Nothing usable at bottom right (missing or long-term mismatch)
++ {
++ cmv = NULL;
++ x = x0 + (nPbW >> 1);
++ y = y0 + (nPbH >> 1);
++
++ {
++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++ (y >> 4) * s->col_mvf_stride;
++
++ if (col->L[0].poc != COL_POC_INTRA &&
++ (col->L[1].poc == COL_POC_INTRA ||
++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) // Same list choice as for bottom right
++ {
++ cmv = col->L + 0;
++ }
++ else if (col->L[1].poc != COL_POC_INTRA)
++ {
++ cmv = col->L + 1;
++ }
++ }
++ }
++
++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
++ return 0;
++
++ {
++ const int col_poc = col_ref->poc;
++ const int ref_poc = refPicList->list[refIdxLx];
++
++ *mvLXCol = (cur_lt ||
++ cmv->poc == col_poc ||
++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
++ cmv->xy : // Long-term, or POC distances make scaling unnecessary: use the MV as stored
++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
++ }
++
++ return cmv != NULL; // cmv cannot be NULL here (checked above), so this is always 1
++}
++// NULL-tolerant compare_mv_ref_idx: 1 only when b is present and carries the same motion data as a, else 0
++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++{
++ return (b == NULL) ? 0 : (compare_mv_ref_idx(a, b) != 0);
++}
++
++
++
++/*
++ * 8.5.3.1.2 Derivation process for spatial merging candidates - returns candidate number merge_idx: a neighbour's mvf, or mvf_t once filled in
++ */
++static inline const HEVCRpiMvField *
++derive_spatial_merge_candidates(
++ const HEVCRpiContext * const s,
++ const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ const unsigned int part_idx,
++ const unsigned int merge_idx,
++ HEVCRpiMvField * const mvf_t)
++{
++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); // Part modes for which A1 is excluded when part_idx == 1
++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); // Part modes for which B1 is excluded when part_idx == 1
++
++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); // Left, bottom-most row of the PB
++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); // One left-column entry on from A1 (below left)
++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); // Above, right-most column of the PB
++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; // Next entry on from B1 (above right)
++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
++ const unsigned int part_mode = lc->cu.part_mode;
++
++ const HEVCRpiMvField * perm[4]; // Candidates collected so far; only read by the combined bi-pred stage
++ unsigned int nb_merge_cand = 0;
++
++ // singleMCLFlag => part_idx == 0 so no need to test for it
++ if ((avail & AVAIL_L) == 0 ||
++ (part_idx == 1 &&
++ ((parts_a1 >> part_mode) & 1) != 0 ||
++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || // && binds tighter than ||: the is_eq_mer test applies whatever part_idx is
++ mvf_a1->pred_flag == PF_INTRA)
++ {
++ mvf_a1 = NULL;
++ }
++ else
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_a1;
++ perm[nb_merge_cand++] = mvf_a1;
++ }
++
++ if ((avail & AVAIL_U) == 0 ||
++ (part_idx == 1 &&
++ ((parts_b1 >> part_mode) & 1) != 0 ||
++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || // Same grouping as for A1
++ mvf_b1->pred_flag == PF_INTRA)
++ {
++ mvf_b1 = NULL;
++ }
++ else if (!mvf_eq(mvf_b1, mvf_a1)) // Duplicate of A1 is not added (but mvf_b1 stays set for later compares)
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b1;
++ perm[nb_merge_cand++] = mvf_b1;
++ }
++
++ // above right spatial merge candidate
++ // Never need mvf_b0 again so don't bother zeroing if navail
++ if ((avail & AVAIL_UR) != 0 &&
++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
++ mvf_b0->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_b0, mvf_b1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b0;
++ perm[nb_merge_cand++] = mvf_b0;
++ }
++
++ // left bottom spatial merge candidate
++ // Never need mvf_a0 again so don't bother zeroing if navail
++ if ((avail & AVAIL_DL) != 0 &&
++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
++ mvf_a0->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_a0, mvf_a1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_a0;
++ perm[nb_merge_cand++] = mvf_a0;
++ }
++
++ // above left spatial merge candidate
++ if (nb_merge_cand != 4 &&
++ (avail & AVAIL_UL) != 0 &&
++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) // B2 is only considered if the four above did not all qualify
++ {
++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
++
++ if (mvf_b2->pred_flag != PF_INTRA &&
++ !mvf_eq(mvf_b2, mvf_a1) &&
++ !mvf_eq(mvf_b2, mvf_b1))
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_b2;
++ perm[nb_merge_cand++] = mvf_b2;
++ }
++ }
++
++ // temporal motion vector candidate
++ if (s->sh.slice_temporal_mvp_enabled_flag)
++ {
++ static const HEVCRpiMvField mvf_z = {{0}};
++
++ *mvf_t = mvf_z; // Start from all zero: ref_idx 0 for both lists, pred_flag 0
++
++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++ 0, mvf_t->xy + 0, 0))
++ mvf_t->pred_flag = PF_L0;
++
++ if (s->sh.slice_type == HEVC_SLICE_B &&
++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++ 0, mvf_t->xy + 1, 1))
++ mvf_t->pred_flag |= PF_L1;
++
++ if (mvf_t->pred_flag != 0)
++ {
++ if (merge_idx == nb_merge_cand)
++ return mvf_t;
++ perm[nb_merge_cand++] = mvf_t;
++ }
++ }
++
++ // combined bi-predictive merge candidates (applies for B slices)
++ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
++ {
++ unsigned int comb_idx = 0;
++ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); // Number of ordered pairs of distinct candidates
++ const RefPicList * const refPicList = s->refPicList;
++
++ for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
++ {
++ static const uint8_t l0_l1_cand_idx[12][2] = {
++ { 0, 1, },
++ { 1, 0, },
++ { 0, 2, },
++ { 2, 0, },
++ { 1, 2, },
++ { 2, 1, },
++ { 0, 3, },
++ { 3, 0, },
++ { 1, 3, },
++ { 3, 1, },
++ { 2, 3, },
++ { 3, 2, },
++ };
++
++ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
++ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
++ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
++ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
++
++ if ((mvf_c0->pred_flag & PF_L0) != 0 &&
++ (mvf_c1->pred_flag & PF_L1) != 0 &&
++ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
++ mvf_c0->xy[0] != mvf_c1->xy[1])) // Skip pairs whose L0 and L1 halves have the same list[] value and the same MV
++ {
++ if (merge_idx == nb_merge_cand++)
++ {
++ // Need to be a bit careful as we will construct mvf_t and we
++ // may already be using that as one of our candidates
++ // so build & copy rather than build in place
++ const HEVCRpiMvField mvf_m = {
++ .xy = {
++ mvf_c0->xy[0],
++ mvf_c1->xy[1]},
++ .ref_idx = {
++ mvf_c0->ref_idx[0],
++ mvf_c1->ref_idx[1]},
++ .pred_flag = PF_BI
++ };
++ *mvf_t = mvf_m;
++ return mvf_t;
++ }
++ }
++ }
++ }
++
++ // "append" Zero motion vector candidates
++ {
++ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
++ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
++ const unsigned int zero_idx = merge_idx - nb_merge_cand; // Which zero candidate is wanted (0 = first one after the real candidates)
++
++ const HEVCRpiMvField mvf_m = {
++ .xy = {0, 0},
++ .ref_idx = {
++ zero_idx < nb_refs ? zero_idx : 0,
++ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
++ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
++ };
++
++ *mvf_t = mvf_m;
++ return mvf_t;
++ }
++}
++
++
++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode - the selected merge candidate is written to *mv
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, HEVCRpiMvField * const mv)
++{
++ const HEVCRpiMvField * cand;
++ if (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) // 8x8 CU with merge level > 2: derive once for the whole CU as part 0
++ cand = derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
++ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
++ 0, merge_idx, mv);
++ else
++ cand = derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
++ part_idx, merge_idx, mv);
++ if (cand != mv) // The candidate was a neighbour's field rather than one built in *mv: copy it over
++ *mv = *cand;
++ if ((nPbW + nPbH) == 12 && mv->pred_flag == PF_BI) // Width + height of 12: bi-pred is reduced to L0 only
++ mv->pred_flag = PF_L0;
++}
++
++// Returns the MV of mvf (list pfi0 tried first, then pfi1) whose RefPicList list[] entry equals poc0, or NULL if there is none or mvf is NULL
++static av_always_inline const MvXY *
++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
++{
++ if (mvf == NULL)
++ return NULL;
++ // Bit n of pred_flag says whether list n is in use
++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
++ return &mvf->xy[pfi0];
++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
++ return &mvf->xy[pfi1];
++ return NULL;
++}
++// Returns an MV of mvf (list pfi0 tried first, then pfi1) whose long-term flag equals islt0, scaled into *mv_t if needed; NULL if none
++static av_always_inline const MvXY *
++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
++ const int islt0, const int poc0, const int poc_cur,
++ MvXY * const mv_t, const HEVCRpiMvField * const mvf)
++{
++ if (mvf == NULL)
++ return NULL;
++ // The first list that is in use and has a matching long-term flag decides the result
++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
++ {
++ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
++ if (!islt0 && poc1 != poc0) {
++ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
++ return mv_t;
++ }
++ return mvf->xy + pfi0;
++ }
++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
++ {
++ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
++ if (!islt0 && poc1 != poc0) {
++ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
++ return mv_t;
++ }
++ return mvf->xy + pfi1;
++ }
++ return NULL;
++}
++// Derives the MV predictor for list LX into mv->xy[LX]: candidate A from the left (A0, A1), B from above (B0, B1, B2), then temporal; 0 if none
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ HEVCRpiMvField * const mv,
++ const unsigned int mvp_lx_flag, const unsigned int LX)
++{
++ const unsigned int pfi0 = LX; // List tried first
++ const unsigned int pfi1 = LX == 0 ? 1 : 0; // The other list
++ const RefPicList * const rpl = s->refPicList;
++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; // list[] entry of the reference this PB uses
++ const int poc_cur = s->poc;
++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
++
++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); // Left, bottom-most row of the PB
++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); // One left-column entry on from A1 (below left)
++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); // Above, right-most column of the PB
++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; // Next entry on from B1 (above right)
++ const MvXY * mva = NULL; // Candidate A (becomes the first candidate); NULL = none
++ const MvXY * mvb; // Candidate B (second candidate); NULL = none
++ MvXY * const mv_rv = mv->xy + LX; // Where the result goes
++ MvXY mvt_a, mvt_b; // Storage for scaled versions of A and B
++
++ *mv_rv = 0;
++
++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
++ mvf_a0 = NULL;
++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
++ goto use_mva; // First candidate wanted and already known: nothing else needs evaluating
++
++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
++ mvf_a1 = NULL;
++
++ if (mva == NULL &&
++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); // Order tried: A0 same, A1 same, A0 other, A1 other
++
++ if (mvp_lx_flag == 0 && mva != NULL)
++ goto use_mva;
++
++ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
++ mvf_b0 = NULL;
++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
++ mvf_b1 = NULL;
++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
++ mvf_b2 = NULL;
++
++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
++
++ if (mvf_a0 == NULL && mvf_a1 == NULL) { // Neither left neighbour usable: unscaled B stands in for A and B is re-derived via mvf_other_poc
++ mva = mvb;
++ if (mvp_lx_flag == 0 && mva != NULL)
++ goto use_mva;
++
++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
++ }
++
++ if (mva == NULL) { // No A: B moves up to become the first candidate
++ mva = mvb;
++ mvb = NULL;
++ }
++
++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B
++ mvb = NULL;
++
++ if (mvp_lx_flag == 0 && mva != NULL) {
++ goto use_mva;
++ }
++ else if (mvp_lx_flag != 0 && mvb != NULL) {
++ *mv_rv = *mvb;
++ }
++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { // Temporal is the next candidate only if it would sit at the requested index
++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
++ nPbH, mv->ref_idx[LX],
++ mv_rv, LX);
++ }
++ return; // Any case not handled above leaves the 0 written at the start
++
++use_mva:
++ *mv_rv = *mva;
++ return;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.c
+@@ -0,0 +1,143 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "bytestream.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_parse.h"
++
++/*
++ * Split one extradata buffer into NAL units and hand every VPS, SPS, PPS and
++ * SEI unit to its decoder; any other NAL type is logged and skipped.
++ *
++ * Processing stops at the first failure. The failure code is reported to the
++ * caller only when err_recognition contains AV_EF_EXPLODE; otherwise 0 is
++ * returned.
++ */
++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    H2645Packet pkt = { 0 };
++    int status;
++    int n;
++
++    status = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
++                                   nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
++
++    /* a negative status (from the split or from any decoder) ends the walk */
++    for (n = 0; status >= 0 && n < pkt.nb_nals; n++) {
++        H2645NAL * const unit = &pkt.nals[n];
++
++        /* ignore everything except parameter sets and VCL NALUs */
++        switch (unit->type) {
++        case HEVC_NAL_VPS:
++            status = ff_hevc_rpi_decode_nal_vps(&unit->gb, logctx, ps);
++            break;
++        case HEVC_NAL_SPS:
++            status = ff_hevc_rpi_decode_nal_sps(&unit->gb, logctx, ps, apply_defdispwin);
++            break;
++        case HEVC_NAL_PPS:
++            status = ff_hevc_rpi_decode_nal_pps(&unit->gb, logctx, ps);
++            break;
++        case HEVC_NAL_SEI_PREFIX:
++        case HEVC_NAL_SEI_SUFFIX:
++            status = ff_hevc_rpi_decode_nal_sei(&unit->gb, logctx, sei, ps, unit->type);
++            break;
++        default:
++            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", unit->type);
++            break;
++        }
++    }
++
++    /* the packet is released on every path, success or failure */
++    ff_h2645_packet_uninit(&pkt);
++
++    return (err_recognition & AV_EF_EXPLODE) ? status : 0;
++}
++
++/*
++ * Decode the parameter sets / SEI carried in codec extradata.
++ *
++ * The first three bytes decide the layout: anything that cannot be the start
++ * of an Annex B start code is treated as an hvcC record, everything else is
++ * parsed directly as a NAL byte stream. *is_nalff reports which layout was
++ * found and, for hvcC, *nal_length_size receives the length-field size that
++ * later packets use.
++ *
++ * Returns a negative AVERROR on failure, otherwise the result of the last
++ * hevc_decode_nal_units() call (0 when no unit was decoded).
++ */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    GetByteContext bc;
++    int status = 0;
++    int stored_len_size, array_count, arr;
++
++    bytestream2_init(&bc, data, size);
++
++    if (!(size > 3 && (data[0] || data[1] || data[2] > 1))) {
++        /* Annex B style extradata: parse it as is */
++        *is_nalff = 0;
++        return hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
++                                     err_recognition, apply_defdispwin, logctx);
++    }
++
++    /* It seems the extradata is encoded as hvcC format.
++     * Temporarily, we support configurationVersion==0 until 14496-15 3rd
++     * is finalized. When finalized, configurationVersion will be 1 and we
++     * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
++    *is_nalff = 1;
++
++    bytestream2_skip(&bc, 21);
++    stored_len_size = (bytestream2_get_byte(&bc) & 3) + 1;
++    array_count     = bytestream2_get_byte(&bc);
++
++    /* nal units in the hvcC always have length coded with 2 bytes,
++     * so put a fake nal_length_size = 2 while parsing them */
++    *nal_length_size = 2;
++
++    /* Decode nal units from hvcC. */
++    for (arr = 0; arr < array_count; arr++) {
++        const int unit_type = bytestream2_get_byte(&bc) & 0x3f;
++        int units_left      = bytestream2_get_be16(&bc);
++
++        while (units_left-- > 0) {
++            /* the 2-byte size field is part of what gets handed on */
++            const int chunk = bytestream2_peek_be16(&bc) + 2;
++
++            if (bytestream2_get_bytes_left(&bc) < chunk) {
++                av_log(logctx, AV_LOG_ERROR,
++                       "Invalid NAL unit size in extradata.\n");
++                return AVERROR_INVALIDDATA;
++            }
++
++            status = hevc_decode_nal_units(bc.buffer, chunk, ps, sei, *is_nalff,
++                                           *nal_length_size, err_recognition, apply_defdispwin,
++                                           logctx);
++            if (status < 0) {
++                av_log(logctx, AV_LOG_ERROR,
++                       "Decoding nal unit %d %d from hvcC failed\n",
++                       unit_type, arr);
++                return status;
++            }
++            bytestream2_skip(&bc, chunk);
++        }
++    }
++
++    /* Now store right nal length size, that will be used to parse
++     * all other nals */
++    *nal_length_size = stored_len_size;
++
++    return status;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.h
+@@ -0,0 +1,36 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * H.265 parser code
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PARSE_H
++#define AVCODEC_RPI_HEVC_PARSE_H
++
++#include <stdint.h>
++
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++/**
++ * Decode the parameter sets and SEI messages stored in codec extradata.
++ *
++ * The layout is detected from the first three bytes: data that cannot begin
++ * with an Annex B start code is parsed as an hvcC record, anything else as a
++ * plain NAL byte stream.
++ *
++ * @param data             extradata buffer
++ * @param size             size of data in bytes
++ * @param ps               parameter set storage that receives VPS/SPS/PPS
++ * @param sei              SEI context that receives SEI messages
++ * @param is_nalff         set to 1 for hvcC extradata, 0 otherwise
++ * @param nal_length_size  for hvcC, set to the NAL length field size read
++ *                         from the record; read but not modified otherwise
++ * @param err_recognition  NAL decoding errors are only propagated when this
++ *                         contains AV_EF_EXPLODE
++ * @param apply_defdispwin forwarded to the SPS decoder
++ * @param logctx           context used for av_log()
++ * @return negative AVERROR code on failure, non-negative otherwise
++ */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++ int err_recognition, int apply_defdispwin, void *logctx);
++
++#endif /* AVCODEC_RPI_HEVC_PARSE_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.c
+@@ -0,0 +1,1938 @@
++/*
++ * HEVC Parameter Set decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/imgutils.h"
++#include "golomb.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevcdec.h"
++/* Default scaling list named for intra use: 64 coefficients, written here as
++ * 8 rows of 8.
++ * NOTE(review): presumably the default intra list from the HEVC
++ * specification - confirm against the spec table. */
++static const uint8_t default_scaling_list_intra[] = {
++ 16, 16, 16, 16, 17, 18, 21, 24,
++ 16, 16, 16, 16, 17, 19, 22, 25,
++ 16, 16, 17, 18, 20, 22, 25, 29,
++ 16, 16, 18, 21, 24, 27, 31, 36,
++ 17, 17, 20, 24, 30, 35, 41, 47,
++ 18, 19, 22, 27, 35, 44, 54, 65,
++ 21, 22, 25, 31, 41, 54, 70, 88,
++ 24, 25, 29, 36, 47, 65, 88, 115
++};
++/* Inter counterpart of the table above, same 64-entry layout.
++ * NOTE(review): presumably the default inter list from the HEVC
++ * specification - confirm against the spec table. */
++static const uint8_t default_scaling_list_inter[] = {
++ 16, 16, 16, 16, 17, 18, 20, 24,
++ 16, 16, 16, 17, 18, 20, 24, 25,
++ 16, 16, 17, 18, 20, 24, 25, 28,
++ 16, 17, 18, 20, 24, 25, 28, 33,
++ 17, 18, 20, 24, 25, 28, 33, 41,
++ 18, 20, 24, 25, 28, 33, 41, 54,
++ 20, 24, 25, 28, 33, 41, 54, 71,
++ 24, 25, 28, 33, 41, 54, 71, 91
++};
++/* Sample aspect ratios (num, den), indexed by the 8-bit SAR index that
++ * decode_vui() reads. Indices past the end of this table are handled by
++ * decode_vui() itself: 255 carries an explicit ratio, others are reported
++ * as unknown. */
++static const AVRational vui_sar[] = {
++ { 0, 1 },
++ { 1, 1 },
++ { 12, 11 },
++ { 10, 11 },
++ { 16, 11 },
++ { 40, 33 },
++ { 24, 11 },
++ { 20, 11 },
++ { 32, 11 },
++ { 80, 33 },
++ { 18, 11 },
++ { 15, 11 },
++ { 64, 33 },
++ { 160, 99 },
++ { 4, 3 },
++ { 3, 2 },
++ { 2, 1 },
++};
++
++
++// pps_cb_qp_offset: -12,+12
++// slice_cb_qp_offset: -12,+12 also
++// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
++// cr_qp_offset_list[n]: -12,+12
++// So worst case total offset: -24,+24
++/* Table-entry builders.
++ * T(n)   packs qp n into one value: (n / 6 - 2) shifted left by 3, OR-ed
++ *        with n % 6 in the low three bits. The +48 bias keeps the operands
++ *        of / and % non-negative for n >= -48.
++ * C(B,n) is T() of n clamped to the range [-6 * B, 51].
++ * M(B,n) is C() of the negated value; it builds the negative-qp padding.
++ * NOTE(review): B is the row index of the [8] tables below, presumably
++ * bit depth minus 8 - confirm against the users of the tables. */
++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
++#define M(B,n) C(B,(-n))
++
++// Number of entries emitted by QP_START(): 24 copies of qp -48 followed by qp -48 .. -1
++#define QP_OFFSET_0 (8*6 + 12*2)
++#define QP_START(B) \
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++\
++ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
++ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
++ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
++ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
++ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
++ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
++ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
++ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
++#define QP_END(B) \
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
++/* T1(B): one table row = QP_START padding, 58 mapped entries, QP_END padding.
++ * The mapping is the identity up to 29, flattens between 30 and 43
++ * (29, 30, 31, 32, 33, 33, 34, 34, ...) and is qp - 6 from there on.
++ * NOTE(review): this has the shape of the HEVC 4:2:0 chroma qp mapping -
++ * confirm against the specification.
++ * T0(B) further down is the same row layout with an identity mapping that
++ * saturates at 51. */
++#define T1(B)\
++{\
++ QP_START(B),\
++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
++ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
++ C(B,44), C(B,45),\
++ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
++ QP_END(B)\
++}
++#define T0(B)\
++{\
++ QP_START(B),\
++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
++ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
++ C(B,50), C(B,51),\
++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++ QP_END(B)\
++}
++/* Entries per row: 72 from QP_START, 52 for qp 0..51, and 24 of trailing
++ * padding (6 inside T0/T1 plus 18 from QP_END) */
++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
++
++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
++
++#undef T
++#undef C
++#undef QP_END
++/* From here on C() is a plain clamp to [0, 51] with no T() packing, so the
++ * deblock tables below hold ordinary qp values */
++#define C(B,n) ((n)<0?0:(n)>51?51:(n))
++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
++#define QP_DBLK_OFFSET_0 QP_OFFSET_0
++#define QP_END(B)\
++ 51, 51, 51, 51, 51, 51
++
++// These don't need all the padding we have here (12 top/bottom would be enough)
++static const uint8_t qp_c_dblk_0[] = T0(0);
++static const uint8_t qp_c_dblk_1[] = T1(0);
++
++#undef T // T was already undefined above, so this one has no effect
++#undef M
++#undef C
++#undef QP_END
++#undef QP_START
++
++
++/*
++ * Release the PPS stored under id. If it is the currently active PPS the
++ * active pointer is cleared first so that it never dangles.
++ */
++static void remove_pps(HEVCRpiParamSets * const s, const int id)
++{
++    AVBufferRef ** const slot = &s->pps_list[id];
++
++    if (*slot != NULL && (const HEVCRpiPPS *)(*slot)->data == s->pps)
++        s->pps = NULL;
++
++    av_buffer_unref(slot);
++}
++
++/*
++ * Release the SPS stored under id together with every PPS that refers to it.
++ * The active SPS pointer is cleared when it points at the entry going away.
++ */
++static void remove_sps(HEVCRpiParamSets * const s, const int id)
++{
++    AVBufferRef ** const slot = &s->sps_list[id];
++    int n;
++
++    if (*slot != NULL) {
++        if ((const HEVCRpiSPS *)(*slot)->data == s->sps)
++            s->sps = NULL;
++
++        /* drop all PPS that depend on this SPS */
++        for (n = 0; n < FF_ARRAY_ELEMS(s->pps_list); n++) {
++            AVBufferRef * const pps_buf = s->pps_list[n];
++
++            if (pps_buf == NULL)
++                continue;
++            if (((HEVCRpiPPS *)pps_buf->data)->sps_id == id)
++                remove_pps(s, n);
++        }
++
++        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
++    }
++
++    av_buffer_unref(slot);
++}
++
++/*
++ * Release the VPS stored under id together with every SPS that refers to it
++ * (which in turn releases the PPS depending on those SPS).
++ */
++static void remove_vps(HEVCRpiParamSets * const s, const int id)
++{
++    AVBufferRef ** const slot = &s->vps_list[id];
++    int n;
++
++    if (*slot != NULL) {
++        if ((const HEVCRpiVPS *)(*slot)->data == s->vps)
++            s->vps = NULL;
++
++        for (n = 0; n < FF_ARRAY_ELEMS(s->sps_list); n++) {
++            AVBufferRef * const sps_buf = s->sps_list[n];
++
++            if (sps_buf == NULL)
++                continue;
++            if (((HEVCRpiSPS *)sps_buf->data)->vps_id == id)
++                remove_sps(s, n);
++        }
++    }
++
++    av_buffer_unref(slot);
++}
++
++/**
++ * Parse one short-term reference picture set.
++ *
++ * The set is either coded explicitly (number of negative and positive
++ * pictures followed by their POC deltas) or predicted from another set of
++ * the SPS. In the predicted case the resulting deltas are sorted ascending
++ * and the negative part is then reversed so that it starts with the delta
++ * closest to zero.
++ *
++ * @param gb               bit reader positioned at the start of the set
++ * @param avctx            logging context
++ * @param rps              set to fill in
++ * @param sps              SPS holding the sets that may be used for prediction
++ * @param is_slice_header  non-zero when the set is coded in a slice header;
++ *                         the reference set is then chosen by an explicit
++ *                         delta_idx instead of being the previous set
++ * @return 0 on success, AVERROR_INVALIDDATA when a value is out of range
++ */
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
++                                      ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
++{
++    uint8_t rps_predict = 0;
++    int delta_poc;
++    int k0 = 0;
++    int k1 = 0;
++    int k  = 0;
++    int i;
++
++    /* the first set of an SPS has nothing to be predicted from */
++    if (rps != sps->st_rps && sps->nb_st_rps)
++        rps_predict = get_bits1(gb);
++
++    if (rps_predict) {
++        const ShortTermRPS *rps_ridx;
++        int delta_rps;
++        unsigned abs_delta_rps;
++        uint8_t use_delta_flag = 0;
++        uint8_t delta_rps_sign;
++
++        if (is_slice_header) {
++            unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
++            /* The "+ 1" is done in unsigned arithmetic and can wrap to 0,
++             * which would select st_rps[nb_st_rps], one past the last valid
++             * set - reject it just like a too large index. */
++            if (delta_idx < 1 || delta_idx > sps->nb_st_rps) {
++                av_log(avctx, AV_LOG_ERROR,
++                       "Invalid value of delta_idx in slice header RPS: %u > %d.\n",
++                       delta_idx, sps->nb_st_rps);
++                return AVERROR_INVALIDDATA;
++            }
++            rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
++            rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
++        } else
++            rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
++
++        delta_rps_sign = get_bits1(gb);
++        abs_delta_rps  = get_ue_golomb_long(gb) + 1;
++        if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "Invalid value of abs_delta_rps: %u\n",
++                   abs_delta_rps);
++            return AVERROR_INVALIDDATA;
++        }
++        delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
++        /* one flag pair per delta of the reference set plus one extra
++         * iteration for delta_rps itself */
++        for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
++            int used = rps->used[k] = get_bits1(gb);
++
++            if (!used)
++                use_delta_flag = get_bits1(gb);
++
++            if (used || use_delta_flag) {
++                if (i < rps_ridx->num_delta_pocs)
++                    delta_poc = delta_rps + rps_ridx->delta_poc[i];
++                else
++                    delta_poc = delta_rps;
++                rps->delta_poc[k] = delta_poc;
++                if (delta_poc < 0)
++                    k0++;
++                else
++                    k1++;
++                k++;
++            }
++        }
++
++        if (k >= FF_ARRAY_ELEMS(rps->used)) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "Invalid num_delta_pocs: %d\n", k);
++            return AVERROR_INVALIDDATA;
++        }
++
++        rps->num_delta_pocs    = k;
++        rps->num_negative_pics = k0;
++        // sort in increasing order (smallest first)
++        if (rps->num_delta_pocs != 0) {
++            int used, tmp;
++            for (i = 1; i < rps->num_delta_pocs; i++) {
++                delta_poc = rps->delta_poc[i];
++                used      = rps->used[i];
++                for (k = i - 1; k >= 0; k--) {
++                    tmp = rps->delta_poc[k];
++                    if (delta_poc < tmp) {
++                        rps->delta_poc[k + 1] = tmp;
++                        rps->used[k + 1]      = rps->used[k];
++                        rps->delta_poc[k]     = delta_poc;
++                        rps->used[k]          = used;
++                    }
++                }
++            }
++        }
++        if ((rps->num_negative_pics >> 1) != 0) {
++            int used;
++            k = rps->num_negative_pics - 1;
++            // flip the negative values to largest first
++            for (i = 0; i < rps->num_negative_pics >> 1; i++) {
++                delta_poc         = rps->delta_poc[i];
++                used              = rps->used[i];
++                rps->delta_poc[i] = rps->delta_poc[k];
++                rps->used[i]      = rps->used[k];
++                rps->delta_poc[k] = delta_poc;
++                rps->used[k]      = used;
++                k--;
++            }
++        }
++    } else {
++        unsigned int prev, nb_positive_pics;
++        rps->num_negative_pics = get_ue_golomb_long(gb);
++        nb_positive_pics       = get_ue_golomb_long(gb);
++
++        if (rps->num_negative_pics >= HEVC_MAX_REFS ||
++            nb_positive_pics >= HEVC_MAX_REFS) {
++            av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
++        if (rps->num_delta_pocs) {
++            /* negative deltas are coded as successive distances going back */
++            prev = 0;
++            for (i = 0; i < rps->num_negative_pics; i++) {
++                delta_poc = get_ue_golomb_long(gb) + 1;
++                if (delta_poc < 1 || delta_poc > 32768) {
++                    av_log(avctx, AV_LOG_ERROR,
++                           "Invalid value of delta_poc: %d\n",
++                           delta_poc);
++                    return AVERROR_INVALIDDATA;
++                }
++                prev -= delta_poc;
++                rps->delta_poc[i] = prev;
++                rps->used[i]      = get_bits1(gb);
++            }
++            /* positive deltas likewise, going forward */
++            prev = 0;
++            for (i = 0; i < nb_positive_pics; i++) {
++                delta_poc = get_ue_golomb_long(gb) + 1;
++                if (delta_poc < 1 || delta_poc > 32768) {
++                    av_log(avctx, AV_LOG_ERROR,
++                           "Invalid value of delta_poc: %d\n",
++                           delta_poc);
++                    return AVERROR_INVALIDDATA;
++                }
++                prev += delta_poc;
++                rps->delta_poc[rps->num_negative_pics + i] = prev;
++                rps->used[rps->num_negative_pics + i]      = get_bits1(gb);
++            }
++        }
++    }
++    return 0;
++}
++
++
++/*
++ * Read the fixed-size part of one profile/tier/level entry into ptl.
++ * Returns -1 without consuming anything when fewer than 88 bits are left,
++ * 0 otherwise.
++ */
++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
++                                     PTLCommon * const ptl)
++{
++    /* space(2) + tier(1) + idc(5) + compatibility(32) + flags(4) + reserved(44) */
++    const int bits_needed = 2 + 1 + 5 + 32 + 4 + 44;
++    int flag_no;
++
++    if (get_bits_left(gb) < bits_needed)
++        return -1;
++
++    ptl->profile_space = get_bits(gb, 2);
++    ptl->tier_flag     = get_bits1(gb);
++    ptl->profile_idc   = get_bits(gb, 5);
++
++    switch (ptl->profile_idc) {
++    case FF_PROFILE_HEVC_MAIN:
++        av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_MAIN_10:
++        av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_MAIN_STILL_PICTURE:
++        av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
++        break;
++    case FF_PROFILE_HEVC_REXT:
++        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
++        break;
++    default:
++        av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
++        break;
++    }
++
++    for (flag_no = 0; flag_no < 32; flag_no++) {
++        const int is_set = get_bits1(gb);
++
++        ptl->profile_compatibility_flag[flag_no] = is_set;
++
++        /* a stream without profile_idc takes the first compatible profile */
++        if (is_set && flag_no > 0 && ptl->profile_idc == 0)
++            ptl->profile_idc = flag_no;
++    }
++
++    ptl->progressive_source_flag    = get_bits1(gb);
++    ptl->interlaced_source_flag     = get_bits1(gb);
++    ptl->non_packed_constraint_flag = get_bits1(gb);
++    ptl->frame_only_constraint_flag = get_bits1(gb);
++
++    /* XXX_reserved_zero_44bits, skipped in three steps */
++    skip_bits(gb, 16); // bits [0..15]
++    skip_bits(gb, 16); // bits [16..31]
++    skip_bits(gb, 12); // bits [32..43]
++
++    return 0;
++}
++
++/*
++ * Read a complete profile_tier_level() structure: the general entry, its
++ * level, and the optional per-sub-layer entries for
++ * max_num_sub_layers - 1 sub-layers.
++ * Returns 0 on success, -1 when the data runs out.
++ */
++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
++                     PTL * const ptl, const int max_num_sub_layers)
++{
++    const int nb_sub = max_num_sub_layers - 1;
++    int n;
++
++    /* general level (8 bits) plus, with sub-layers, 8 pairs of 2 bits */
++    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
++        get_bits_left(gb) < 8 + (nb_sub > 0 ? 8 * 2 : 0)) {
++        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
++        return -1;
++    }
++
++    ptl->general_ptl.level_idc = get_bits(gb, 8);
++
++    for (n = 0; n < nb_sub; n++) {
++        ptl->sub_layer_profile_present_flag[n] = get_bits1(gb);
++        ptl->sub_layer_level_present_flag[n]   = get_bits1(gb);
++    }
++
++    if (nb_sub > 0) {
++        for (n = nb_sub; n < 8; n++)
++            skip_bits(gb, 2); // reserved_zero_2bits[n]
++    }
++
++    for (n = 0; n < nb_sub; n++) {
++        if (ptl->sub_layer_profile_present_flag[n]) {
++            if (decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[n]) < 0) {
++                av_log(avctx, AV_LOG_ERROR,
++                       "PTL information for sublayer %i too short\n", n);
++                return -1;
++            }
++        }
++
++        if (!ptl->sub_layer_level_present_flag[n])
++            continue;
++
++        if (get_bits_left(gb) < 8) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "Not enough data for sublayer %i level_idc\n", n);
++            return -1;
++        }
++        ptl->sub_layer_ptl[n].level_idc = get_bits(gb, 8);
++    }
++
++    return 0;
++}
++
++/*
++ * Skip the per-CPB fields of one sub-layer HRD block. Nothing is stored;
++ * the values are read only to advance the bit reader.
++ */
++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
++                                const int subpic_params_present)
++{
++    unsigned int remaining;
++
++    for (remaining = nb_cpb; remaining != 0; remaining--) {
++        get_ue_golomb_long(gb); // bit_rate_value_minus1
++        get_ue_golomb_long(gb); // cpb_size_value_minus1
++
++        if (subpic_params_present) {
++            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
++            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
++        }
++
++        skip_bits1(gb); // cbr_flag
++    }
++}
++
++/**
++ * Skip over one HRD parameter structure.
++ *
++ * No field is stored: everything is read only to keep the bit reader in
++ * step with the stream.
++ *
++ * @param gb                  bit reader
++ * @param common_inf_present  non-zero when the common (non per-sub-layer)
++ *                            part is present in the stream
++ * @param max_sublayers       number of sub-layer entries to skip
++ * @return 0 on success, AVERROR_INVALIDDATA when a CPB count is outside 1..32
++ */
++static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
++                      const int max_sublayers)
++{
++    int nal_params_present = 0, vcl_params_present = 0;
++    int subpic_params_present = 0;
++    int i;
++
++    if (common_inf_present) {
++        nal_params_present = get_bits1(gb);
++        vcl_params_present = get_bits1(gb);
++
++        if (nal_params_present || vcl_params_present) {
++            subpic_params_present = get_bits1(gb);
++
++            if (subpic_params_present) {
++                skip_bits(gb, 8); // tick_divisor_minus2
++                skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
++                skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
++                skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
++            }
++
++            skip_bits(gb, 4); // bit_rate_scale
++            skip_bits(gb, 4); // cpb_size_scale
++
++            if (subpic_params_present)
++                skip_bits(gb, 4); // cpb_size_du_scale
++
++            skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // dpb_output_delay_length_minus1
++        }
++    }
++
++    for (i = 0; i < max_sublayers; i++) {
++        int low_delay = 0;
++        unsigned int nb_cpb = 1;
++        int fixed_rate = get_bits1(gb);
++
++        /* a second fixed-rate flag follows when the first one is clear */
++        if (!fixed_rate)
++            fixed_rate = get_bits1(gb);
++
++        if (fixed_rate)
++            get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1
++        else
++            low_delay = get_bits1(gb);
++
++        if (!low_delay) {
++            nb_cpb = get_ue_golomb_long(gb) + 1;
++            /* "< 1" catches the unsigned wrap-around of the "+ 1" */
++            if (nb_cpb < 1 || nb_cpb > 32) {
++                /* nb_cpb is unsigned: %u keeps large values from being
++                 * reported as negative numbers */
++                av_log(NULL, AV_LOG_ERROR, "nb_cpb %u invalid\n", nb_cpb);
++                return AVERROR_INVALIDDATA;
++            }
++        }
++
++        if (nal_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++        if (vcl_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++    }
++    return 0;
++}
++/**
++ * Parse a video parameter set NAL and store it in ps->vps_list[].
++ *
++ * The VPS is parsed into a freshly allocated buffer. If a VPS with identical
++ * contents is already stored under the same id the new buffer is discarded;
++ * otherwise the previous VPS (and, through remove_vps(), every SPS/PPS that
++ * depends on it) is dropped and replaced.
++ *
++ * @param gb     bit reader positioned at the start of the VPS payload
++ * @param avctx  logging context; err_recognition is consulted as well
++ * @param ps     parameter set storage to update
++ * @return 0 on success, AVERROR(ENOMEM) if the buffer cannot be allocated,
++ *         AVERROR_INVALIDDATA on a malformed VPS
++ */
++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
++ HEVCRpiParamSets * const ps)
++{
++ int i,j;
++ int vps_id = 0;
++ ptrdiff_t nal_size;
++ HEVCRpiVPS *vps;
++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
++
++ if (!vps_buf)
++ return AVERROR(ENOMEM);
++ vps = (HEVCRpiVPS*)vps_buf->data;
++
++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
++ // Keep a raw copy of the payload next to the parsed fields, truncated to sizeof(vps->data)
++ nal_size = gb->buffer_end - gb->buffer;
++ if (nal_size > sizeof(vps->data)) {
++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++ nal_size, sizeof(vps->data));
++ vps->data_size = sizeof(vps->data);
++ } else {
++ vps->data_size = nal_size;
++ }
++ memcpy(vps->data, gb->buffer, vps->data_size);
++
++ vps_id = get_bits(gb, 4);
++ if (vps_id >= HEVC_MAX_VPS_COUNT) {
++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
++ goto err;
++ }
++
++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
++ goto err;
++ }
++
++ vps->vps_max_layers = get_bits(gb, 6) + 1;
++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1;
++ vps->vps_temporal_id_nesting_flag = get_bits1(gb);
++
++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
++ goto err;
++ }
++
++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
++ vps->vps_max_sub_layers);
++ goto err;
++ }
++
++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
++ goto err;
++
++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
++ // Without the flag only the entry of the highest sub-layer is coded
++ i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
++ for (; i < vps->vps_max_sub_layers; i++) {
++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb);
++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1;
++
++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
++ vps->vps_max_dec_pic_buffering[i] - 1);
++ goto err;
++ }
++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { // tolerated unless AV_EF_EXPLODE is set
++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
++ vps->vps_num_reorder_pics[i]);
++ if (avctx->err_recognition & AV_EF_EXPLODE)
++ goto err;
++ }
++ }
++
++ vps->vps_max_layer_id = get_bits(gb, 6);
++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { // the flags skipped below must fit in the remaining bits
++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
++ goto err;
++ }
++
++ for (i = 1; i < vps->vps_num_layer_sets; i++)
++ for (j = 0; j <= vps->vps_max_layer_id; j++)
++ skip_bits(gb, 1); // layer_id_included_flag[i][j]
++
++ vps->vps_timing_info_present_flag = get_bits1(gb);
++ if (vps->vps_timing_info_present_flag) {
++ vps->vps_num_units_in_tick = get_bits_long(gb, 32);
++ vps->vps_time_scale = get_bits_long(gb, 32);
++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
++ if (vps->vps_poc_proportional_to_timing_flag)
++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
++ av_log(avctx, AV_LOG_ERROR,
++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
++ goto err;
++ }
++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
++ int common_inf_present = 1;
++
++ get_ue_golomb_long(gb); // hrd_layer_set_idx
++ if (i)
++ common_inf_present = get_bits1(gb);
++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); // NOTE(review): the return value is not checked here
++ }
++ }
++ get_bits1(gb); /* vps_extension_flag */
++ // An overread is only treated as fatal when a VPS with this id is already stored
++ if (get_bits_left(gb) < 0) {
++ av_log(avctx, AV_LOG_ERROR,
++ "Overread VPS by %d bits\n", -get_bits_left(gb));
++ if (ps->vps_list[vps_id])
++ goto err;
++ }
++ // Keep the stored VPS when the new one is byte-identical, otherwise replace it
++ if (ps->vps_list[vps_id] &&
++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
++ av_buffer_unref(&vps_buf);
++ } else {
++ remove_vps(ps, vps_id);
++ ps->vps_list[vps_id] = vps_buf;
++ }
++
++ return 0;
++
++err:
++ av_buffer_unref(&vps_buf);
++ return AVERROR_INVALIDDATA;
++}
++/**
++ * Parse the VUI of an SPS into sps->vui.
++ *
++ * Besides filling in the VUI fields this may change sps->pix_fmt (full-range
++ * 4:2:0 and RGB-coded 4:4:4 content).
++ *
++ * Some streams carry a non-conforming VUI. The reader state and the VUI are
++ * therefore saved just before the default display window flag; when the
++ * normal parse runs short of data, both are restored once and parsing
++ * resumes at the timing information, i.e. the default display window is
++ * treated as absent.
++ *
++ * @param gb                bit reader positioned at the start of the VUI
++ * @param avctx             logging context; flags2 is consulted as well
++ * @param apply_defdispwin  together with AV_CODEC_FLAG2_IGNORE_CROP makes the
++ *                          parsed default display window be zeroed
++ * @param sps               SPS being decoded
++ */
++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
++ const int apply_defdispwin, HEVCRpiSPS * const sps)
++{
++ VUI backup_vui, * const vui = &sps->vui;
++ GetBitContext backup;
++ int sar_present, alt = 0; // alt: set once the alternate (retry) parse is in use
++
++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
++
++ sar_present = get_bits1(gb);
++ if (sar_present) {
++ uint8_t sar_idx = get_bits(gb, 8);
++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
++ vui->sar = vui_sar[sar_idx];
++ else if (sar_idx == 255) { // explicit ratio follows
++ vui->sar.num = get_bits(gb, 16);
++ vui->sar.den = get_bits(gb, 16);
++ } else
++ av_log(avctx, AV_LOG_WARNING,
++ "Unknown SAR index: %u.\n", sar_idx);
++ }
++
++ vui->overscan_info_present_flag = get_bits1(gb);
++ if (vui->overscan_info_present_flag)
++ vui->overscan_appropriate_flag = get_bits1(gb);
++
++ vui->video_signal_type_present_flag = get_bits1(gb);
++ if (vui->video_signal_type_present_flag) {
++ vui->video_format = get_bits(gb, 3);
++ vui->video_full_range_flag = get_bits1(gb);
++ vui->colour_description_present_flag = get_bits1(gb);
++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
++ if (vui->colour_description_present_flag) {
++ vui->colour_primaries = get_bits(gb, 8);
++ vui->transfer_characteristic = get_bits(gb, 8);
++ vui->matrix_coeffs = get_bits(gb, 8);
++
++ // Set invalid values to "unspecified"
++ if (!av_color_primaries_name(vui->colour_primaries))
++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
++ if (!av_color_transfer_name(vui->transfer_characteristic))
++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
++ if (!av_color_space_name(vui->matrix_coeffs))
++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { // 4:4:4 YUV formats become their planar RGB equivalents
++ switch (sps->pix_fmt) {
++ case AV_PIX_FMT_YUV444P:
++ sps->pix_fmt = AV_PIX_FMT_GBRP;
++ break;
++ case AV_PIX_FMT_YUV444P10:
++ sps->pix_fmt = AV_PIX_FMT_GBRP10;
++ break;
++ case AV_PIX_FMT_YUV444P12:
++ sps->pix_fmt = AV_PIX_FMT_GBRP12;
++ break;
++ }
++ }
++ }
++ }
++
++ vui->chroma_loc_info_present_flag = get_bits1(gb);
++ if (vui->chroma_loc_info_present_flag) {
++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb);
++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
++ }
++
++ vui->neutra_chroma_indication_flag = get_bits1(gb);
++ vui->field_seq_flag = get_bits1(gb);
++ vui->frame_field_info_present_flag = get_bits1(gb);
++
++ // Backup context in case an alternate header is detected
++ memcpy(&backup, gb, sizeof(backup));
++ memcpy(&backup_vui, vui, sizeof(backup_vui));
++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { // bit pattern treated as an invalid window: the flag bit is left unread
++ vui->default_display_window_flag = 0;
++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
++ } else
++ vui->default_display_window_flag = get_bits1(gb);
++
++ if (vui->default_display_window_flag) {
++ int vert_mult = 1 + (sps->chroma_format_idc < 2); // 2 for chroma_format_idc 0 and 1, else 1
++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); // 2 for chroma_format_idc 0..2, else 1
++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
++
++ if (apply_defdispwin &&
++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
++ av_log(avctx, AV_LOG_DEBUG,
++ "discarding vui default display window, "
++ "original values are l:%u r:%u t:%u b:%u\n",
++ vui->def_disp_win.left_offset,
++ vui->def_disp_win.right_offset,
++ vui->def_disp_win.top_offset,
++ vui->def_disp_win.bottom_offset);
++
++ vui->def_disp_win.left_offset =
++ vui->def_disp_win.right_offset =
++ vui->def_disp_win.top_offset =
++ vui->def_disp_win.bottom_offset = 0;
++ }
++ }
++
++timing_info:
++ vui->vui_timing_info_present_flag = get_bits1(gb);
++
++ if (vui->vui_timing_info_present_flag) {
++ if( get_bits_left(gb) < 66 && !alt) {
++ // The alternate syntax seems to have timing info located
++ // where def_disp_win is normally located
++ av_log(avctx, AV_LOG_WARNING,
++ "Strange VUI timing information, retrying...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1;
++ goto timing_info;
++ }
++ vui->vui_num_units_in_tick = get_bits_long(gb, 32);
++ vui->vui_time_scale = get_bits_long(gb, 32);
++ if (alt) {
++ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
++ vui->vui_time_scale, vui->vui_num_units_in_tick);
++ }
++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
++ if (vui->vui_poc_proportional_to_timing_flag)
++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
++ vui->vui_hrd_parameters_present_flag = get_bits1(gb);
++ if (vui->vui_hrd_parameters_present_flag)
++ decode_hrd(gb, 1, sps->max_sub_layers); // NOTE(review): the return value is not checked here
++ }
++
++ vui->bitstream_restriction_flag = get_bits1(gb);
++ if (vui->bitstream_restriction_flag) {
++ if (get_bits_left(gb) < 8 && !alt) {
++ av_log(avctx, AV_LOG_WARNING,
++ "Strange VUI bitstream restriction information, retrying"
++ " from timing information...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1;
++ goto timing_info;
++ }
++ vui->tiles_fixed_structure_flag = get_bits1(gb);
++ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
++ vui->restricted_ref_pic_lists_flag = get_bits1(gb);
++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb);
++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb);
++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb);
++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb);
++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb);
++ }
++
++ if (get_bits_left(gb) < 1 && !alt) {
++ // XXX: Alternate syntax when sps_range_extension_flag != 0?
++ av_log(avctx, AV_LOG_WARNING,
++ "Overread in VUI, retrying from timing information...\n");
++ memcpy(vui, &backup_vui, sizeof(backup_vui));
++ memcpy(gb, &backup, sizeof(backup));
++ alt = 1;
++ goto timing_info;
++ }
++}
++
++/*
++ * Fill every scaling list in sl with its default contents.
++ *
++ * Size 0 (4x4) lists are flat 16, as are both DC terms (16x16 and 32x32).
++ * For sizes 1..3 the first three matrices take the default intra table and
++ * the last three take the default inter table.
++ */
++static void set_default_scaling_list_data(ScalingList * const sl)
++{
++    int size_id, m;
++
++    for (m = 0; m < 6; m++) {
++        // 4x4 default is 16
++        memset(sl->sl[0][m], 16, 16);
++        sl->sl_dc[0][m] = 16; // default for 16x16
++        sl->sl_dc[1][m] = 16; // default for 32x32
++    }
++
++    for (size_id = 1; size_id < 4; size_id++) {
++        for (m = 0; m < 6; m++) {
++            // Matrices 0..2 use the intra defaults, 3..5 the inter defaults
++            memcpy(sl->sl[size_id][m],
++                   m < 3 ? default_scaling_list_intra : default_scaling_list_inter,
++                   64);
++        }
++    }
++}
++
++/*
++ * Parse explicit scaling list data from gb into sl.
++ *
++ * sl is expected to already hold the default lists: a prediction delta of
++ * zero means "use the default" and leaves the corresponding entry untouched.
++ *
++ * For each size_id (0..3) and matrix_id (0..5, stepping by 3 when
++ * size_id == 3) a list is either copied from an earlier matrix of the same
++ * size or decoded coefficient by coefficient in diagonal scan order.
++ * When sps->chroma_format_idc == 3 the size-3 lists and DC terms for
++ * matrices 1, 2, 4 and 5 are then copied from the size-2 ones.
++ *
++ * Returns 0 on success, AVERROR_INVALIDDATA if a copy delta refers to a
++ * matrix before the first one or a DC coefficient is out of range.
++ */
++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
++                             const HEVCRpiSPS * const sps)
++{
++    uint8_t scaling_list_pred_mode_flag;
++    int size_id, matrix_id, pos;
++    int i;
++
++    for (size_id = 0; size_id < 4; size_id++)
++        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
++            scaling_list_pred_mode_flag = get_bits1(gb);
++            if (!scaling_list_pred_mode_flag) {
++                unsigned int delta = get_ue_golomb_long(gb);
++                /* Only need to handle non-zero delta. Zero means default,
++                 * which should already be in the arrays. */
++                if (delta) {
++                    // Copy from previous array.
++                    delta *= (size_id == 3) ? 3 : 1;
++                    if (matrix_id < delta) {
++                        // delta is unsigned, so print it as such
++                        av_log(avctx, AV_LOG_ERROR,
++                               "Invalid delta in scaling list data: %u.\n", delta);
++                        return AVERROR_INVALIDDATA;
++                    }
++
++                    memcpy(sl->sl[size_id][matrix_id],
++                           sl->sl[size_id][matrix_id - delta],
++                           size_id > 0 ? 64 : 16);
++                    if (size_id > 1)
++                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
++                }
++            } else {
++                int next_coef, coef_num;
++                int32_t scaling_list_delta_coef;
++
++                next_coef = 8;
++                coef_num = FFMIN(64, 1 << (4 + (size_id << 1)));
++                if (size_id > 1) {
++                    const int dc_coef_minus8 = get_se_golomb(gb);
++
++                    // H.265 restricts scaling_list_dc_coef_minus8 to -7..247;
++                    // reject anything else rather than storing a truncated value
++                    if (dc_coef_minus8 < -7 || dc_coef_minus8 > 247) {
++                        av_log(avctx, AV_LOG_ERROR,
++                               "scaling_list_dc_coef_minus8 out of range: %d.\n",
++                               dc_coef_minus8);
++                        return AVERROR_INVALIDDATA;
++                    }
++                    next_coef = dc_coef_minus8 + 8;
++                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
++                }
++                for (i = 0; i < coef_num; i++) {
++                    if (size_id == 0)
++                        pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
++                                  ff_hevc_rpi_diag_scan4x4_x[i];
++                    else
++                        pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
++                                  ff_hevc_rpi_diag_scan8x8_x[i];
++
++                    scaling_list_delta_coef = get_se_golomb(gb);
++                    // Coefficients are coded as deltas, wrapping modulo 256
++                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
++                    sl->sl[size_id][matrix_id][pos] = next_coef;
++                }
++            }
++        }
++
++    if (sps->chroma_format_idc == 3) {
++        // Matrices skipped by the step-3 loop above take the size-2 values
++        for (i = 0; i < 64; i++) {
++            sl->sl[3][1][i] = sl->sl[2][1][i];
++            sl->sl[3][2][i] = sl->sl[2][2][i];
++            sl->sl[3][4][i] = sl->sl[2][4][i];
++            sl->sl[3][5][i] = sl->sl[2][5][i];
++        }
++        sl->sl_dc[1][1] = sl->sl_dc[0][1];
++        sl->sl_dc[1][2] = sl->sl_dc[0][2];
++        sl->sl_dc[1][4] = sl->sl_dc[0][4];
++        sl->sl_dc[1][5] = sl->sl_dc[0][5];
++    }
++
++
++    return 0;
++}
++
++/*
++ * Derive the output pixel format and per-plane shift values from the
++ * SPS chroma format and bit depth.
++ *
++ * Only chroma_format_idc == 1 at 8 or 10 bits maps to a real format; every
++ * other combination leaves pix_fmt as AV_PIX_FMT_NONE. Always returns 0.
++ */
++static int map_pixel_format(HEVCRpiSPS * const sps)
++{
++    const int cfmt = sps->chroma_format_idc;
++
++    if (cfmt == 1 && sps->bit_depth == 8)
++        sps->pix_fmt = AV_PIX_FMT_SAND128;
++    else if (cfmt == 1 && sps->bit_depth == 10)
++        sps->pix_fmt = AV_PIX_FMT_SAND64_10;
++    else
++        sps->pix_fmt = AV_PIX_FMT_NONE;
++
++    // Plane 0 is never subsampled
++    sps->vshift[0] = 0;
++    sps->hshift[0] = 0;
++
++    // Horizontal shift: 1 unless 4:4:4
++    sps->hshift[1] = (cfmt <= 2) ? 1 : 0;
++    sps->hshift[2] = sps->hshift[1];
++
++    // Vertical shift: 1 unless 4:4:4 or 4:2:2
++    sps->vshift[1] = (cfmt <= 1) ? 1 : 0;
++    sps->vshift[2] = sps->vshift[1];
++
++    // Samples wider than 8 bits occupy two bytes
++    sps->pixel_shift = (sps->bit_depth > 8) ? 1 : 0;
++
++    return 0;
++}
++
++/*
++ * Parse a sequence parameter set from gb into sps.
++ *
++ * sps               destination; fields not set here keep their incoming value
++ * gb                bit reader positioned at the start of the SPS payload
++ * sps_id            receives the SPS id read from the stream
++ * apply_defdispwin  if non-zero the VUI default display window is added to
++ *                   the output window
++ * vps_list          if non-NULL, the referenced VPS must be present in it
++ * avctx             used for logging, flags2 and err_recognition
++ *
++ * Syntax elements are read strictly in bitstream order, so statements that
++ * touch gb must not be reordered.
++ *
++ * Returns 0 on success or a negative AVERROR code.
++ */
++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
++                                 const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
++{
++    HEVCRpiWindow *ow;
++    int ret = 0;
++    int log2_diff_max_min_transform_block_size;
++    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
++    int i;
++
++    // Coded parameters
++
++    sps->vps_id = get_bits(gb, 4);
++    if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (vps_list && !vps_list[sps->vps_id]) {
++        av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
++               sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->max_sub_layers = get_bits(gb, 3) + 1;
++    if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++        av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
++               sps->max_sub_layers);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->temporal_id_nesting_flag = get_bits(gb, 1);
++
++    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
++        return ret;
++
++    *sps_id = get_ue_golomb_long(gb);
++    if (*sps_id >= HEVC_MAX_SPS_COUNT) {
++        // *sps_id is unsigned
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", *sps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->chroma_format_idc = get_ue_golomb_long(gb);
++    if (sps->chroma_format_idc > 3U) {
++        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->chroma_format_idc == 3)
++        sps->separate_colour_plane_flag = get_bits1(gb);
++
++    // Separate colour planes are handled as chroma_format_idc 0
++    if (sps->separate_colour_plane_flag)
++        sps->chroma_format_idc = 0;
++
++    sps->width  = get_ue_golomb_long(gb);
++    sps->height = get_ue_golomb_long(gb);
++    if ((ret = av_image_check_size(sps->width,
++                                   sps->height, 0, avctx)) < 0)
++        return ret;
++
++    if (get_bits1(gb)) { // pic_conformance_flag
++        // Offsets are scaled by 2 in each direction where chroma_format_idc
++        // is below the threshold
++        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
++        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
++        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
++        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
++
++        if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
++            av_log(avctx, AV_LOG_DEBUG,
++                   "discarding sps conformance window, "
++                   "original values are l:%u r:%u t:%u b:%u\n",
++                   sps->pic_conf_win.left_offset,
++                   sps->pic_conf_win.right_offset,
++                   sps->pic_conf_win.top_offset,
++                   sps->pic_conf_win.bottom_offset);
++
++            sps->pic_conf_win.left_offset   =
++            sps->pic_conf_win.right_offset  =
++            sps->pic_conf_win.top_offset    =
++            sps->pic_conf_win.bottom_offset = 0;
++        }
++        sps->output_window = sps->pic_conf_win;
++    }
++
++    sps->bit_depth   = get_ue_golomb_long(gb) + 8;
++    bit_depth_chroma = get_ue_golomb_long(gb) + 8;
++    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Luma bit depth (%d) is different from chroma bit depth (%d), "
++               "this is unsupported.\n",
++               sps->bit_depth, bit_depth_chroma);
++        return AVERROR_INVALIDDATA;
++    }
++
++    ret = map_pixel_format(sps);
++    if (ret < 0)
++        return ret;
++
++    sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
++    if (sps->log2_max_poc_lsb > 16) {
++        av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
++               sps->log2_max_poc_lsb - 4);
++        return AVERROR_INVALIDDATA;
++    }
++
++    // Without sub-layer ordering info only the highest sub-layer is coded;
++    // the lower ones are filled in from it below
++    sublayer_ordering_info = get_bits1(gb);
++    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
++    for (i = start; i < sps->max_sub_layers; i++) {
++        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
++        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
++        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
++        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
++                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
++            av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
++                   sps->temporal_layer[i].num_reorder_pics);
++            if (avctx->err_recognition & AV_EF_EXPLODE ||
++                sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
++                return AVERROR_INVALIDDATA;
++            }
++            // Tolerated: grow the DPB size to fit the reorder depth
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
++        }
++    }
++
++    if (!sublayer_ordering_info) {
++        for (i = 0; i < start; i++) {
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
++            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
++            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
++        }
++    }
++
++    sps->log2_min_cb_size                    = get_ue_golomb_long(gb) + 3;
++    sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
++    sps->log2_min_tb_size                    = get_ue_golomb_long(gb) + 2;
++    log2_diff_max_min_transform_block_size   = get_ue_golomb_long(gb);
++    sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
++                                               sps->log2_min_tb_size;
++
++    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size\n", sps->log2_min_cb_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_diff_max_min_coding_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size\n", sps->log2_diff_max_min_coding_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size\n", log2_diff_max_min_transform_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    {
++        const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
++        // Not a bitstream limitation, but all profiles
++        if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %u for CtbLog2SizeY\n", CtbLog2SizeY);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY\n", sps->log2_max_trafo_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        // Inferred parameters
++        sps->log2_ctb_size = CtbLog2SizeY;
++//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
++    }
++
++    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
++    sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
++
++    sps->scaling_list_enable_flag = get_bits1(gb);
++    if (sps->scaling_list_enable_flag) {
++        // Defaults first; explicit data (if signalled) overrides them
++        set_default_scaling_list_data(&sps->scaling_list);
++
++        if (get_bits1(gb)) {
++            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    sps->amp_enabled_flag = get_bits1(gb);
++    sps->sao_enabled      = get_bits1(gb);
++
++    // Set pcm defaults (0) so we don't have to test _enabled when we
++    // want to use them
++    memset(&sps->pcm, 0, sizeof(sps->pcm));
++
++    if (get_bits1(gb))  // pcm_enabled_flag
++    {
++        const unsigned int limit_max_pcm = FFMIN(5,
++            sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
++        sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
++        sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
++        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
++        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
++                                        get_ue_golomb_long(gb);
++        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
++                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
++            sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
++            av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)\n",
++                   sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
++    }
++
++    // Could be based on min_pcm_cb_size but much easier logic if we just stick
++    // with 8 (and costs us little)
++    sps->pcm_width = (sps->width + 63) >> 6;  // 8 for min size, 8 bits per byte - round up
++    sps->pcm_height = (sps->height + 7) >> 3;
++
++    sps->nb_st_rps = get_ue_golomb_long(gb);
++    if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
++        av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
++               sps->nb_st_rps);
++        return AVERROR_INVALIDDATA;
++    }
++    for (i = 0; i < sps->nb_st_rps; i++) {
++        if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
++                                                     sps, 0)) < 0)
++            return ret;
++    }
++
++    sps->long_term_ref_pics_present_flag = get_bits1(gb);
++    if (sps->long_term_ref_pics_present_flag) {
++        sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
++        if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
++            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
++                   sps->num_long_term_ref_pics_sps);
++            return AVERROR_INVALIDDATA;
++        }
++        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
++            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
++            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
++        }
++    }
++
++    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
++    sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG;  // sps->sps_strong_intra_smoothing_enable_flag
++    sps->vui.sar = (AVRational){0, 1};
++    vui_present = get_bits1(gb);
++    if (vui_present)
++        decode_vui(gb, avctx, apply_defdispwin, sps);
++
++    if (get_bits1(gb)) { // sps_extension_flag
++        int sps_extension_flag[1];
++        for (i = 0; i < 1; i++)
++            sps_extension_flag[i] = get_bits1(gb);
++        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
++        if (sps_extension_flag[0]) {
++            int extended_precision_processing_flag;
++            int cabac_bypass_alignment_enabled_flag;
++
++            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
++            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
++            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            extended_precision_processing_flag = get_bits1(gb);
++            if (extended_precision_processing_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "extended_precision_processing_flag not yet implemented\n");
++
++            if (get_bits1(gb))  // sps->intra_smoothing_disabled_flag
++                sps->intra_filters_disable |= FILTER_EITHER;
++            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
++            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
++
++            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
++            if (cabac_bypass_alignment_enabled_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
++        }
++    }
++    if (apply_defdispwin) {
++        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
++        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
++        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
++        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
++    }
++
++    // Reject (or, when not exploding on errors, drop) cropping that would
++    // overflow or leave no picture area
++    ow = &sps->output_window;
++    if (ow->left_offset >= INT_MAX - ow->right_offset     ||
++        ow->top_offset  >= INT_MAX - ow->bottom_offset    ||
++        ow->left_offset + ow->right_offset  >= sps->width ||
++        ow->top_offset  + ow->bottom_offset >= sps->height) {
++        av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
++               ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
++        if (avctx->err_recognition & AV_EF_EXPLODE) {
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING,
++               "Displaying the whole video surface.\n");
++        memset(ow, 0, sizeof(*ow));
++        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
++    }
++
++    // Inferred parameters
++
++    // Picture size in CTBs, rounded up
++    sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_size   = sps->ctb_width * sps->ctb_height;
++
++    sps->min_cb_width  = sps->width  >> sps->log2_min_cb_size;
++    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
++    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
++    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
++    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
++    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
++    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
++
++    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
++    sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
++
++    // Coded dimensions must be a whole number of minimum coding blocks
++    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
++        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
++               sps->max_transform_hierarchy_depth_inter);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
++               sps->max_transform_hierarchy_depth_intra);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "max transform block size out of range: %d\n",
++               sps->log2_max_trafo_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread SPS by %d bits\n", -get_bits_left(gb));
++        return AVERROR_INVALIDDATA;
++    }
++
++    return 0;
++}
++
++/*
++ * Decode an SPS NAL unit and store it in ps->sps_list.
++ *
++ * A copy of the raw NAL payload (truncated to sizeof(sps->data) if larger)
++ * is kept in the new SPS before parsing. If the slot for the parsed id
++ * already holds a buffer whose contents compare equal to the new one, the
++ * existing entry is kept and the new buffer dropped; otherwise the old
++ * entry is removed via remove_sps() and replaced.
++ *
++ * Returns 0 on success, AVERROR(ENOMEM) if the buffer cannot be allocated,
++ * or the error returned by ff_hevc_rpi_parse_sps().
++ */
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++                               HEVCRpiParamSets *ps, int apply_defdispwin)
++{
++    AVBufferRef *new_buf = av_buffer_allocz(sizeof(HEVCRpiSPS));
++    HEVCRpiSPS *sps;
++    unsigned int sps_id;
++    ptrdiff_t nal_size;
++    int ret;
++
++    if (new_buf == NULL)
++        return AVERROR(ENOMEM);
++    sps = (HEVCRpiSPS*)new_buf->data;
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
++
++    // Keep the raw payload, clamped to the space available for it
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size <= sizeof(sps->data)) {
++        sps->data_size = nal_size;
++    } else {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(sps->data));
++        sps->data_size = sizeof(sps->data);
++    }
++    memcpy(sps->data, gb->buffer, sps->data_size);
++
++    ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, apply_defdispwin,
++                                ps->vps_list, avctx);
++    if (ret < 0) {
++        av_buffer_unref(&new_buf);
++        return ret;
++    }
++
++    if (avctx->debug & FF_DEBUG_BITSTREAM) {
++        av_log(avctx, AV_LOG_DEBUG,
++               "Parsed SPS: id %d; coded wxh: %dx%d; "
++               "cropped wxh: %dx%d; pix_fmt: %s.\n",
++               sps_id, sps->width, sps->height,
++               sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
++               sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
++               av_get_pix_fmt_name(sps->pix_fmt));
++    }
++
++    /* A new or changed SPS replaces the slot (remove_sps() deals with the
++     * old entry first); an exact repeat is discarded and the original kept. */
++    if (ps->sps_list[sps_id] == NULL ||
++        memcmp(ps->sps_list[sps_id]->data, new_buf->data, new_buf->size) != 0) {
++        remove_sps(ps, sps_id);
++        ps->sps_list[sps_id] = new_buf;
++    } else {
++        av_buffer_unref(&new_buf);
++    }
++
++    return 0;
++}
++
++/*
++ * Buffer free callback for a PPS: releases every array owned by the PPS
++ * and then the PPS structure itself. opaque is unused.
++ */
++static void hevc_pps_free(void *opaque, uint8_t *data)
++{
++    HEVCRpiPPS *p = (HEVCRpiPPS*)data;
++    // Addresses of the owned pointers, in the order they are released
++    void * const owned[] = {
++        &p->column_width,
++        &p->row_height,
++        &p->col_bd,
++        &p->row_bd,
++        &p->col_idxX,
++        &p->ctb_addr_rs_to_ts,
++        &p->ctb_addr_ts_to_rs,
++        &p->tile_pos_ts,
++        &p->tile_size,
++        &p->tile_id,
++        &p->ctb_ts_flags,
++    };
++    unsigned int k;
++
++    for (k = 0; k != sizeof(owned) / sizeof(owned[0]); k++)
++        av_freep(owned[k]);
++
++    av_freep(&p);
++}
++
++/*
++ * Read n_minus_1 + 1 signed Exp-Golomb QP offsets from gb into offsets[].
++ *
++ * Each value must lie in [-12, 12]. Returns 0 on success, or
++ * AVERROR_INVALIDDATA at the first out-of-range value (entries before it
++ * have already been written).
++ */
++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
++{
++    unsigned int remaining = n_minus_1;
++    int8_t *dst = offsets;
++
++    // At least one entry is always read
++    for (;;) {
++        const int qp_offset = get_se_golomb_long(gb);
++
++        if (!(qp_offset >= -12 && qp_offset <= 12)) {
++            av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", qp_offset);
++            return AVERROR_INVALIDDATA;
++        }
++        *dst++ = qp_offset;
++
++        if (remaining == 0)
++            break;
++        remaining--;
++    }
++
++    return 0;
++}
++
++/*
++ * Parse the PPS range extension fields from gb into pps.
++ *
++ * Reads, in bitstream order: the transform-skip block size (only when
++ * pps->transform_skip_enabled_flag is set), the cross-component prediction
++ * flag, the optional chroma QP offset lists, and the luma/chroma SAO offset
++ * scales. sps supplies chroma_format_idc, separate_colour_plane_flag and
++ * bit_depth for validation.
++ *
++ * Returns 0 on success or AVERROR_INVALIDDATA on an invalid value.
++ */
++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
++                                HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{
++    if (pps->transform_skip_enabled_flag) {
++        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
++    }
++    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
++    if (pps->cross_component_prediction_enabled_flag &&
++        (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
++    {
++        av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
++        return AVERROR_INVALIDDATA;
++    }
++    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
++    if (pps->chroma_qp_offset_list_enabled_flag) {
++        int err;
++
++        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
++        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
++        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
++
++        // Cb list first, then Cr list, each of len_minus1 + 1 entries
++        if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
++            (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
++            return err;
++    }
++
++    {
++        // SAO offset scaling is only permitted above 10 bits
++        const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
++
++        pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_luma > max_offset) {
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++        pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_chroma > max_offset) {
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    return(0);
++}
++
++/*
++ * Derive the PPS tables that depend on both the PPS and its SPS.
++ *
++ * Sets up, in order:
++ *  - the per-component QP mapping table pointers (qp_dblk_x / qp_bd_x),
++ *  - tile column/row boundaries and the CTB-column -> tile-column index,
++ *  - tile_wpp_inter_disable,
++ *  - the raster-scan <-> tile-scan CTB address maps,
++ *  - per-CTB tile ids and ctb_ts_flags,
++ *  - per-tile size and starting tile-scan position.
++ *
++ * When pps->uniform_spacing_flag is set, column_width / row_height are
++ * computed here (and allocated if column_width is NULL). Otherwise they
++ * are used as found - NOTE(review): presumably filled in by the caller
++ * while parsing the PPS; confirm.
++ *
++ * Returns 0 on success or AVERROR(ENOMEM). Arrays allocated before a
++ * failure are left attached to pps; hevc_pps_free() releases them.
++ */
++static inline int setup_pps(AVCodecContext * const avctx,
++                            HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{
++    int pic_area_in_ctbs;
++    int i, j, x, y, ctb_addr_rs, tile_id;
++
++    // Inferred parameters
++
++    // qp_y -> qp_u/qp_v tables
++    // The tables have at least -24,+24 overrun after adding offset here
++    // which should allow for clipless offsetting
++
++    pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0;  // No offset for luma, but may be useful for general code
++    pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
++
++    if (sps->chroma_format_idc == 1) {
++        pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++        pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++    }
++    else
++    {
++        pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++        pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++    }
++
++    pps->col_bd   = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
++    pps->row_bd   = av_malloc_array(pps->num_tile_rows + 1,    sizeof(*pps->row_bd));
++    pps->col_idxX = av_malloc_array(sps->ctb_width,    sizeof(*pps->col_idxX));
++    if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
++        return AVERROR(ENOMEM);
++
++    if (pps->uniform_spacing_flag) {
++        if (!pps->column_width) {
++            pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++            pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
++        }
++        if (!pps->column_width || !pps->row_height)
++            return AVERROR(ENOMEM);
++
++        // Spread the CTB columns/rows as evenly as integer division allows
++        for (i = 0; i < pps->num_tile_columns; i++) {
++            pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
++                                   (i * sps->ctb_width) / pps->num_tile_columns;
++        }
++
++        for (i = 0; i < pps->num_tile_rows; i++) {
++            pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
++                                 (i * sps->ctb_height) / pps->num_tile_rows;
++        }
++    }
++
++    {
++        const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
++        pps->col_bd[0] = 0;
++        pps->tile_wpp_inter_disable = 0;
++        for (i = 0; i < pps->num_tile_columns; i++)
++        {
++            pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
++
++            // Avoid trying tile parallel if the columns don't fall on cache boundaries
++            // (this causes too much pain syncing flushes with the QPU)
++            // Ignore the final (RHS of pic) tile boundary
++            if ((pps->col_bd[i] & td_mask) != 0) {
++                pps->tile_wpp_inter_disable = 1;
++            }
++        }
++
++        // If we can start the next row before finishing the first line of
++        // this one then we must wait at the end of the tile
++        // * if this happens a lot then there are better but more complicated
++        //   conditions that we could apply
++        if (pps->tile_wpp_inter_disable) {
++            for (i = 0; i < pps->num_tile_rows; i++)
++            {
++                if (pps->row_height[i] <= RPI_MAX_JOBS) {
++                    pps->tile_wpp_inter_disable = 2;
++                    break;
++                }
++            }
++        }
++    }
++
++    pps->row_bd[0] = 0;
++    for (i = 0; i < pps->num_tile_rows; i++)
++        pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
++
++    // Map each CTB column to the index of the tile column containing it
++    for (i = 0, j = 0; i < sps->ctb_width; i++) {
++        if (i >= pps->col_bd[j + 1])
++            j++;
++        pps->col_idxX[i] = j;
++    }
++
++    /**
++     * 6.5
++     */
++    pic_area_in_ctbs     = sps->ctb_size;
++
++    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
++    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
++    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
++    pps->tile_size         = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
++    pps->tile_pos_ts       = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
++    pps->ctb_ts_flags      = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_ts_flags));
++    // ctb_ts_flags must be checked too: it is written by the memset below
++    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
++        !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL ||
++        !pps->ctb_ts_flags) {
++        return AVERROR(ENOMEM);
++    }
++
++    memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
++
++    // Build the raster-scan <-> tile-scan address maps
++    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
++        int tb_x   = ctb_addr_rs % sps->ctb_width;
++        int tb_y   = ctb_addr_rs / sps->ctb_width;
++        int tile_x = 0;
++        int tile_y = 0;
++        int val    = 0;
++
++        for (i = 0; i < pps->num_tile_columns; i++) {
++            if (tb_x < pps->col_bd[i + 1]) {
++                tile_x = i;
++                break;
++            }
++        }
++
++        for (i = 0; i < pps->num_tile_rows; i++) {
++            if (tb_y < pps->row_bd[i + 1]) {
++                tile_y = i;
++                break;
++            }
++        }
++
++        // CTBs in tiles to the left within this tile row...
++        for (i = 0; i < tile_x; i++)
++            val += pps->row_height[tile_y] * pps->column_width[i];
++        // ...plus CTBs in all tile rows above...
++        for (i = 0; i < tile_y; i++)
++            val += sps->ctb_width * pps->row_height[i];
++
++        // ...plus the raster position inside this tile
++        val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
++               tb_x - pps->col_bd[tile_x];
++
++        pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
++        pps->ctb_addr_ts_to_rs[val]         = ctb_addr_rs;
++    }
++
++    {
++        // Both cursors advance in tile-scan order, one tile line at a time
++        uint8_t * pflags = pps->ctb_ts_flags;
++        uint16_t * ptid = pps->tile_id;
++
++        for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
++        {
++            for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
++            {
++                const unsigned int tile_w = pps->column_width[i];
++
++                // First CTB of the tile
++                pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                // Every CTB on the first line of the tile
++                for (x = 0; x != tile_w; ++x) {
++                    pflags[x] |= CTB_TS_FLAGS_TOT;
++                }
++
++                for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
++                {
++                    // First CTB of this tile line
++                    pflags[0] |= CTB_TS_FLAGS_SOTL;
++
++                    if (pps->entropy_coding_sync_enabled_flag)
++                    {
++                        if (pps->column_width[i] != 1)
++                            pflags[1] |= CTB_TS_FLAGS_CSAVE;
++                        else
++                            pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                        if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
++                            pflags[0] |= CTB_TS_FLAGS_CLOAD;
++                    }
++
++                    for (x = 0; x != tile_w; ++x)
++                        *ptid++ = tile_id;
++
++                    // Step to the next tile line; [-1] is now the last CTB
++                    // of the line just processed
++                    pflags += tile_w;
++                    pflags[-1] |= CTB_TS_FLAGS_EOTL;
++                    if (i + 1 == pps->num_tile_columns)
++                        pflags[-1] |= CTB_TS_FLAGS_EOL;
++                }
++
++                // Last CTB of the tile
++                pflags[-1] |= CTB_TS_FLAGS_EOT;
++            }
++        }
++    }
++
++    {
++        // Size of each tile in CTBs and its first tile-scan address
++        unsigned int ts = 0;
++        for (j = 0; j < pps->num_tile_rows; j++)
++            for (i = 0; i < pps->num_tile_columns; i++)
++            {
++                const unsigned int size = pps->column_width[i] * pps->row_height[j];
++                pps->tile_size[j * pps->num_tile_columns + i] = size;
++                pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
++                ts += size;
++            }
++    }
++
++    return 0;
++}
++
++/* Parse a PPS NAL unit. On success the new PPS replaces ps->pps_list[pps_id] and
++ * 0 is returned; on any failure a negative AVERROR code is returned and ps is
++ * left unchanged. The SPS the PPS refers to must already be in ps->sps_list. */
++int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
++                               HEVCRpiParamSets * const ps)
++{
++    const HEVCRpiSPS *sps = NULL;
++    int i, ret = 0;
++    unsigned int pps_id = 0;
++    ptrdiff_t nal_size;
++    unsigned log2_parallel_merge_level_minus2;
++    AVBufferRef *pps_buf;
++    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
++
++    if (!pps)
++        return AVERROR(ENOMEM);
++
++    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), hevc_pps_free, NULL, 0);
++    if (!pps_buf) {
++        av_freep(&pps);
++        return AVERROR(ENOMEM);
++    }
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
++    // Keep a raw copy of the NAL payload, clipped to the capacity of pps->data
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size > sizeof(pps->data)) {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(pps->data));
++        pps->data_size = sizeof(pps->data);
++    } else {
++        pps->data_size = nal_size;
++    }
++    memcpy(pps->data, gb->buffer, pps->data_size);
++
++    // Default values
++    pps->loop_filter_across_tiles_enabled_flag = 1;
++    pps->num_tile_columns                      = 1;
++    pps->num_tile_rows                         = 1;
++    pps->uniform_spacing_flag                  = 1;
++    pps->disable_dbf                           = 0;
++    pps->beta_offset                           = 0;
++    pps->tc_offset                             = 0;
++    pps->log2_max_transform_skip_block_size    = 2;
++
++    // Coded parameters
++    pps_id = get_ue_golomb_long(gb);
++    if (pps_id >= HEVC_MAX_PPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %u\n", pps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->sps_id = get_ue_golomb_long(gb);
++    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    if (!ps->sps_list[pps->sps_id]) {
++        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
++
++    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
++    pps->output_flag_present_flag              = get_bits1(gb);
++    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
++    pps->sign_data_hiding_flag = get_bits1(gb);
++    pps->cabac_init_present_flag = get_bits1(gb);
++
++    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->pic_init_qp_minus26 = get_se_golomb(gb);
++    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "init_qp_minus26 %d is outside the valid range "
++               "[%d, %d].\n",
++               pps->pic_init_qp_minus26,
++               -(26 + sps->qp_bd_offset), 25);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->constrained_intra_pred_flag = get_bits1(gb);
++    pps->transform_skip_enabled_flag = get_bits1(gb);
++
++    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
++    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
++    if (pps->cu_qp_delta_enabled_flag)
++    {
++        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
++
++        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
++            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %u is invalid\n",
++                   diff_cu_qp_delta_depth);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
++    }
++
++    pps->cb_qp_offset = get_se_golomb(gb);
++    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
++               pps->cb_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->cr_qp_offset = get_se_golomb(gb);
++    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
++               pps->cr_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
++
++    pps->weighted_pred_flag   = get_bits1(gb);
++    pps->weighted_bipred_flag = get_bits1(gb);
++
++    pps->transquant_bypass_enable_flag    = get_bits1(gb);
++    pps->tiles_enabled_flag               = get_bits1(gb);
++    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
++
++    if (pps->tiles_enabled_flag) {
++        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
++        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
++        if (pps->num_tile_columns <= 0 ||
++            pps->num_tile_columns >= sps->width) {
++            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
++                   pps->num_tile_columns - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++        if (pps->num_tile_rows <= 0 ||
++            pps->num_tile_rows >= sps->height) {
++            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
++                   pps->num_tile_rows - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
++        if (!pps->column_width || !pps->row_height) {
++            ret = AVERROR(ENOMEM);
++            goto err;
++        }
++        pps->uniform_spacing_flag = get_bits1(gb);
++        if (!pps->uniform_spacing_flag) {
++            uint64_t sum = 0;
++            for (i = 0; i < pps->num_tile_columns - 1; i++) {
++                const unsigned int col_w = get_ue_golomb_long(gb) + 1; // as coded, before uint16_t narrowing
++                pps->column_width[i] = col_w;
++                sum += col_w; // a width that would wrap when narrowed still counts in full here
++            }
++            if (sum >= sps->ctb_width) {
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
++
++            sum = 0;
++            for (i = 0; i < pps->num_tile_rows - 1; i++) {
++                const unsigned int row_h = get_ue_golomb_long(gb) + 1; // as coded, before uint16_t narrowing
++                pps->row_height[i] = row_h;
++                sum += row_h; // same reasoning as for the column widths above
++            }
++            if (sum >= sps->ctb_height) {
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
++        }
++        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
++    }
++
++    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++
++    pps->deblocking_filter_control_present_flag = get_bits1(gb);
++    if (pps->deblocking_filter_control_present_flag) {
++        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
++        pps->disable_dbf                             = get_bits1(gb);
++        if (!pps->disable_dbf) {
++            int beta_offset_div2 = get_se_golomb(gb);
++            int tc_offset_div2   = get_se_golomb(gb);
++            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
++                       beta_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
++                       tc_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->beta_offset = 2 * beta_offset_div2;
++            pps->tc_offset   = 2 * tc_offset_div2;
++        }
++    }
++
++    pps->scaling_list_data_present_flag = get_bits1(gb);
++    if (pps->scaling_list_data_present_flag) {
++        set_default_scaling_list_data(&pps->scaling_list);
++        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
++        if (ret < 0)
++            goto err;
++    }
++    pps->lists_modification_present_flag = get_bits1(gb);
++    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
++    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
++        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %u\n",
++               log2_parallel_merge_level_minus2);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2;
++
++    pps->slice_header_extension_present_flag = get_bits1(gb);
++
++    if (get_bits1(gb)) { // pps_extension_present_flag
++        int pps_range_extensions_flag = get_bits1(gb);
++        skip_bits(gb, 7); // pps_extension_7bits
++        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
++            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
++                goto err;
++        }
++    }
++
++    ret = setup_pps(avctx, pps, sps);
++    if (ret < 0)
++        goto err;
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread PPS by %d bits\n", -get_bits_left(gb));
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    remove_pps(ps, pps_id);
++    ps->pps_list[pps_id] = pps_buf;
++
++    return 0;
++
++err:
++    av_buffer_unref(&pps_buf);
++    return ret;
++}
++
++/* Rebuild the full POC from the coded LSBs (poc_lsb), using pocTid0 as the previous POC. */
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
++{
++    const int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
++    // Mask, not '%': the LSB part must stay in [0, max_poc_lsb) even when pocTid0 < 0
++    const int prev_poc_lsb = pocTid0 & (max_poc_lsb - 1);
++    const int prev_poc_msb = pocTid0 - prev_poc_lsb;
++    int poc_msb;
++
++    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
++        poc_msb = prev_poc_msb + max_poc_lsb;
++    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
++        poc_msb = prev_poc_msb - max_poc_lsb;
++    else
++        poc_msb = prev_poc_msb;
++    // For BLA picture types, POCmsb is set to 0.
++    if (nal_unit_type == HEVC_NAL_BLA_W_LP ||
++        nal_unit_type == HEVC_NAL_BLA_W_RADL ||
++        nal_unit_type == HEVC_NAL_BLA_N_LP)
++        poc_msb = 0;
++    return poc_msb + poc_lsb;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.h
+@@ -0,0 +1,449 @@
++/*
++ * HEVC parameter set parsing
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PS_H
++#define AVCODEC_RPI_HEVC_PS_H
++
++#include <stdint.h>
++
++#include "libavutil/buffer.h"
++#include "libavutil/pixfmt.h"
++#include "libavutil/rational.h"
++
++#include "avcodec.h"
++#include "get_bits.h"
++#include "hevc.h"
++
++typedef struct ShortTermRPS { // short-term reference picture set; see ff_hevc_rpi_decode_short_term_rps()
++ unsigned int num_negative_pics;
++ int num_delta_pocs;
++ int rps_idx_num_delta_pocs;
++ int32_t delta_poc[32];
++ uint8_t used[32]; // presumably one flag per delta_poc[] entry - TODO confirm against the parser
++} ShortTermRPS;
++
++typedef struct LongTermRPS { // long-term reference pictures; embedded as RpiSliceHeader.long_term_rps
++ int poc[32];
++ uint8_t used[32];
++ uint8_t nb_refs; // presumably the number of valid entries in poc[]/used[] - TODO confirm
++} LongTermRPS;
++
++typedef struct RpiSliceHeader { // values held for the slice (segment) header currently being decoded
++ unsigned int pps_id;
++
++ /// address (in raster order) of the first block in the current slice segment
++ unsigned int slice_segment_addr;
++ /// address (in raster order) of the first block in the current slice
++ unsigned int slice_addr;
++
++ enum HEVCSliceType slice_type;
++
++ int pic_order_cnt_lsb;
++
++ uint8_t first_slice_in_pic_flag;
++ uint8_t dependent_slice_segment_flag;
++ uint8_t pic_output_flag;
++ uint8_t colour_plane_id;
++
++ /// RPS coded in the slice header itself is stored here (slice_rps below)
++ int short_term_ref_pic_set_sps_flag;
++ int short_term_ref_pic_set_size;
++ ShortTermRPS slice_rps;
++ const ShortTermRPS *short_term_rps;
++ int long_term_ref_pic_set_size;
++ LongTermRPS long_term_rps;
++ unsigned int list_entry_lx[2][32]; // per list: indices used to reorder the reference list when rpl_modification_flag is set
++
++ uint8_t rpl_modification_flag[2];
++ uint8_t no_output_of_prior_pics_flag;
++ uint8_t slice_temporal_mvp_enabled_flag;
++
++ unsigned int nb_refs[2]; // per list: number of entries wanted in the final reference list
++
++ uint8_t slice_sample_adaptive_offset_flag[3];
++ uint8_t mvd_l1_zero_flag;
++
++ uint8_t cabac_init_flag;
++ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
++ uint8_t slice_loop_filter_across_slices_enabled_flag;
++ uint8_t collocated_list;
++
++ uint8_t no_dblk_boundary_flags;
++
++ unsigned int collocated_ref_idx;
++
++ int slice_qp_delta;
++ int slice_cb_qp_offset; // -12, +12
++ int slice_cr_qp_offset; // -12, +12
++
++ uint8_t cu_chroma_qp_offset_enabled_flag;
++
++ int beta_offset; ///< beta_offset_div2 * 2
++ int tc_offset; ///< tc_offset_div2 * 2
++
++ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
++
++ unsigned *entry_point_offset;
++ int * offset;
++ int * size;
++ int num_entry_point_offsets;
++ int offsets_allocated;
++
++ uint8_t offload_wpp;
++ uint8_t offload_tiles;
++
++ int8_t slice_qp;
++
++ uint8_t luma_log2_weight_denom;
++ uint8_t chroma_log2_weight_denom;
++
++ int16_t luma_weight_l0[16]; // -128, +255
++ int16_t luma_offset_l0[16];
++ int16_t chroma_weight_l0[16][2];
++ int16_t chroma_offset_l0[16][2];
++
++ int16_t luma_weight_l1[16];
++ int16_t luma_offset_l1[16];
++ int16_t chroma_weight_l1[16][2];
++ int16_t chroma_offset_l1[16][2];
++
++} RpiSliceHeader;
++
++typedef struct HEVCRpiWindow { // four edge offsets; the SPS output_window is copied to AVFrame crop_* in ff_hevc_rpi_set_new_ref()
++ uint16_t left_offset;
++ uint16_t right_offset;
++ uint16_t top_offset;
++ uint16_t bottom_offset;
++} HEVCRpiWindow;
++
++typedef struct VUI { // video usability information; embedded as HEVCRpiSPS.vui
++ AVRational sar;
++
++ int overscan_info_present_flag;
++ int overscan_appropriate_flag;
++
++ int video_signal_type_present_flag;
++ int video_format;
++ int video_full_range_flag;
++ int colour_description_present_flag;
++ uint8_t colour_primaries;
++ uint8_t transfer_characteristic;
++ uint8_t matrix_coeffs;
++
++ int chroma_loc_info_present_flag;
++ int chroma_sample_loc_type_top_field;
++ int chroma_sample_loc_type_bottom_field;
++ int neutra_chroma_indication_flag; // sic: spelled "neutra"; renaming it would affect users outside this header
++
++ int field_seq_flag;
++ int frame_field_info_present_flag;
++
++ int default_display_window_flag;
++ HEVCRpiWindow def_disp_win;
++
++ int vui_timing_info_present_flag;
++ uint32_t vui_num_units_in_tick;
++ uint32_t vui_time_scale;
++ int vui_poc_proportional_to_timing_flag;
++ int vui_num_ticks_poc_diff_one_minus1;
++ int vui_hrd_parameters_present_flag;
++
++ int bitstream_restriction_flag;
++ int tiles_fixed_structure_flag;
++ int motion_vectors_over_pic_boundaries_flag;
++ int restricted_ref_pic_lists_flag;
++ int min_spatial_segmentation_idc;
++ int max_bytes_per_pic_denom;
++ int max_bits_per_min_cu_denom;
++ int log2_max_mv_length_horizontal;
++ int log2_max_mv_length_vertical;
++} VUI;
++
++typedef struct PTLCommon { // one profile/tier/level record; PTL holds a general one plus one per sub-layer
++ uint8_t profile_space;
++ uint8_t tier_flag;
++ uint8_t profile_idc; // compared against FF_PROFILE_HEVC_REXT by the PPS parser
++ uint8_t profile_compatibility_flag[32];
++ uint8_t level_idc;
++ uint8_t progressive_source_flag;
++ uint8_t interlaced_source_flag;
++ uint8_t non_packed_constraint_flag;
++ uint8_t frame_only_constraint_flag;
++} PTLCommon;
++
++typedef struct PTL { // profile/tier/level block; embedded in both HEVCRpiVPS and HEVCRpiSPS
++ PTLCommon general_ptl;
++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
++
++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
++} PTL;
++
++typedef struct HEVCRpiVPS { // video parameter set; instances are referenced from HEVCRpiParamSets.vps_list[]
++ uint8_t vps_temporal_id_nesting_flag;
++ int vps_max_layers;
++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
++
++ PTL ptl;
++ int vps_sub_layer_ordering_info_present_flag;
++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
++ int vps_max_layer_id;
++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
++ uint8_t vps_timing_info_present_flag;
++ uint32_t vps_num_units_in_tick;
++ uint32_t vps_time_scale;
++ uint8_t vps_poc_proportional_to_timing_flag;
++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
++ int vps_num_hrd_parameters;
++
++ uint8_t data[4096]; // presumably a raw copy of the NAL payload, as for HEVCRpiPPS.data - TODO confirm
++ int data_size;
++} HEVCRpiVPS;
++
++typedef struct ScalingList { // scaling list tables; one copy lives in the SPS and one in the PPS
++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
++ * and size ID 3 only has 2 arrays, not 6. */
++ uint8_t sl[4][6][64];
++ uint8_t sl_dc[2][6];
++} ScalingList;
++
++typedef struct HEVCRpiSPS { // sequence parameter set; the PPS parser reads it via ps->sps_list[sps_id]->data
++ unsigned vps_id;
++ uint8_t chroma_format_idc;
++ uint8_t separate_colour_plane_flag;
++
++ HEVCRpiWindow output_window;
++
++ HEVCRpiWindow pic_conf_win;
++
++ uint16_t wp_offset_half_range; // WpOffsetHalfRange
++
++ uint8_t bit_depth;
++
++// int bit_depth_chroma; // We only support luma_bit_depth = chroma_bit_depth
++ uint8_t pixel_shift;
++ enum AVPixelFormat pix_fmt;
++
++ unsigned int log2_max_poc_lsb;
++
++ int max_sub_layers;
++ struct {
++ int max_dec_pic_buffering;
++ int num_reorder_pics;
++ int max_latency_increase;
++ } temporal_layer[HEVC_MAX_SUB_LAYERS];
++ uint8_t temporal_id_nesting_flag;
++
++ uint8_t scaling_list_enable_flag;
++ ScalingList scaling_list;
++
++ unsigned int nb_st_rps;
++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
++
++ uint8_t amp_enabled_flag;
++ uint8_t sao_enabled;
++
++ uint8_t long_term_ref_pics_present_flag;
++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
++ uint8_t num_long_term_ref_pics_sps;
++
++ struct {
++ uint8_t bit_depth;
++ uint8_t bit_depth_chroma;
++ uint8_t log2_min_pcm_cb_size;
++ uint8_t log2_max_pcm_cb_size;
++ uint8_t loop_filter_disable_flag;
++ } pcm;
++ char sps_temporal_mvp_enabled_flag;
++// char sps_strong_intra_smoothing_enable_flag; -> intra_filters_disable
++
++ uint8_t log2_min_cb_size; // 3..6
++ uint8_t log2_diff_max_min_coding_block_size;
++ uint8_t log2_min_tb_size; // 2..5
++ uint8_t log2_max_trafo_size;
++ uint8_t log2_ctb_size; // 4..6
++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1)
++#define LOG2_MIN_PU_SIZE 2
++#define LOG2_MIN_CU_SIZE 3
++
++ uint8_t max_transform_hierarchy_depth_inter;
++ uint8_t max_transform_hierarchy_depth_intra;
++
++ char transform_skip_rotation_enabled_flag;
++ char transform_skip_context_enabled_flag;
++ char implicit_rdpcm_enabled_flag;
++ char explicit_rdpcm_enabled_flag;
++// char intra_smoothing_disabled_flag; -> intra_filters_disable
++ char high_precision_offsets_enabled_flag;
++ char persistent_rice_adaptation_enabled_flag;
++
++ uint8_t intra_filters_disable;
++
++ /// coded frame dimension in various units
++ int width;
++ int height;
++ int ctb_width;
++ int ctb_height;
++ int ctb_size; // Pic size in CTBs not size of a CTB
++ int min_cb_width;
++ int min_cb_height;
++ int min_tb_width;
++ int min_tb_height;
++ int min_pu_width;
++ int min_pu_height;
++ int pcm_width;
++ int pcm_height;
++ int tb_mask;
++
++ int hshift[3];
++ int vshift[3];
++
++ int qp_bd_offset;
++
++ uint8_t data[4096]; // presumably a raw copy of the NAL payload, as for HEVCRpiPPS.data - TODO confirm
++ int data_size;
++
++ VUI vui;
++ PTL ptl;
++} HEVCRpiSPS;
++
++#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line
++#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line
++#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line
++#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile
++#define CTB_TS_FLAGS_CSAVE (1U << 4) // With entropy_coding_sync: 2nd CTB of a tile line, when the tile is more than 1 CTB wide
++#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request
++#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile
++#define CTB_TS_FLAGS_CLOAD (1U << 7) // With entropy_coding_sync: 1st CTB of a tile line, unless CIREQ is set on it
++
++typedef struct HEVCRpiPPS { // picture parameter set, filled in by ff_hevc_rpi_decode_nal_pps()
++ unsigned int sps_id; ///< seq_parameter_set_id
++
++ uint8_t sign_data_hiding_flag;
++
++ uint8_t cabac_init_present_flag;
++
++ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
++ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
++ int pic_init_qp_minus26;
++
++ uint8_t constrained_intra_pred_flag;
++ uint8_t transform_skip_enabled_flag;
++
++ uint8_t cu_qp_delta_enabled_flag;
++ uint8_t log2_min_cu_qp_delta_size; // log2_ctb_size - diff_cu_qp_delta_depth; log2_ctb_size when cu_qp_delta is disabled
++ int cb_qp_offset; // -12..12
++ int cr_qp_offset; // -12..12
++ const uint8_t * qp_dblk_x[3];
++ const int8_t * qp_bd_x[3];
++
++ uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
++ uint8_t weighted_pred_flag;
++ uint8_t weighted_bipred_flag;
++ uint8_t output_flag_present_flag;
++ uint8_t transquant_bypass_enable_flag;
++
++ uint8_t dependent_slice_segments_enabled_flag;
++ uint8_t tiles_enabled_flag;
++ uint8_t entropy_coding_sync_enabled_flag;
++
++ uint8_t tile_wpp_inter_disable;
++ int num_tile_columns; ///< num_tile_columns_minus1 + 1
++ int num_tile_rows; ///< num_tile_rows_minus1 + 1
++ uint8_t uniform_spacing_flag;
++ uint8_t loop_filter_across_tiles_enabled_flag;
++
++ uint8_t seq_loop_filter_across_slices_enabled_flag;
++
++ uint8_t deblocking_filter_control_present_flag;
++ uint8_t deblocking_filter_override_enabled_flag;
++ uint8_t disable_dbf;
++ int beta_offset; ///< beta_offset_div2 * 2
++ int tc_offset; ///< tc_offset_div2 * 2
++
++ uint8_t scaling_list_data_present_flag;
++ ScalingList scaling_list;
++
++ uint8_t lists_modification_present_flag;
++ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
++ int num_extra_slice_header_bits;
++ uint8_t slice_header_extension_present_flag;
++ uint8_t log2_max_transform_skip_block_size; // the PPS parser defaults this to 2 before reading any extension
++ uint8_t cross_component_prediction_enabled_flag;
++ uint8_t chroma_qp_offset_list_enabled_flag;
++ uint8_t diff_cu_chroma_qp_offset_depth;
++ uint8_t chroma_qp_offset_list_len_minus1;
++ int8_t cb_qp_offset_list[6];
++ int8_t cr_qp_offset_list[6];
++ uint8_t log2_sao_offset_scale_luma;
++ uint8_t log2_sao_offset_scale_chroma;
++
++ // Inferred parameters
++ uint16_t *column_width; ///< ColumnWidth
++ uint16_t *row_height; ///< RowHeight
++ uint16_t *col_bd; ///< ColBd
++ uint16_t *row_bd; ///< RowBd
++ uint16_t *col_idxX;
++
++ // We can limit these to uint16_t given our other size limits
++ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
++ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
++ uint16_t *tile_id; ///< TileId
++ uint16_t *tile_pos_ts; ///< tile-scan address of each tile's first CTB, indexed by tile number
++ uint16_t *tile_size; ///< TileSize: column_width * row_height of each tile, indexed by tile number
++ uint8_t * ctb_ts_flags; // CTB_TS_FLAGS_* bits per CTB, written tile by tile in setup_pps()
++
++ uint8_t data[4096]; // raw copy of the PPS NAL payload, truncated to sizeof(data) if longer
++ int data_size;
++} HEVCRpiPPS;
++
++typedef struct HEVCRpiParamSets { // store of all parameter sets seen so far, indexed by their ids
++ /* currently active parameter sets */
++ const HEVCRpiVPS *vps;
++ const HEVCRpiSPS *sps;
++ const HEVCRpiPPS *pps;
++
++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; // ->data is read as an HEVCRpiSPS by the PPS parser
++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; // each buffer wraps an HEVCRpiPPS, released through hevc_pps_free
++} HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps); // NOTE(review): presumably stores into ps->vps_list[] like the PPS parser - confirm
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps, int apply_defdispwin); // NOTE(review): presumably stores into ps->sps_list[] - confirm
++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
++ HEVCRpiParamSets *ps); // stores the parsed PPS in ps->pps_list[]; returns 0 or a negative AVERROR
++
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
++
++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
++ uint8_t *buf, int buf_size);
++
++/**
++ * Compute POC of the current frame and return it.
++ */
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
++
++#endif /* AVCODEC_RPI_HEVC_PS_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_refs.c
+@@ -0,0 +1,485 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "internal.h"
++#include "thread.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
++{
++    /* Nothing to do for a slot with no AVFrame (context init failed) or no buffer */
++    if (frame->frame == NULL || frame->frame->buf[0] == NULL)
++        return;
++
++    /* Clear the requested usage bits; the slot stays allocated while any remain */
++    frame->flags &= ~flags;
++    if (frame->flags != 0)
++        return;
++
++    ff_thread_release_buffer(s->avctx, &frame->tf);
++    av_buffer_unref(&frame->col_mvf_buf); /* harmless when already NULL */
++    frame->col_mvf        = NULL;
++    frame->collocated_ref = NULL;
++}
++
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
++{
++    /* Strip both reference markings (short- and long-term) from every DPB slot */
++    const int ref_flags = HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF;
++    unsigned int n;
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++)
++        ff_hevc_rpi_unref_frame(s, s->DPB + n, ref_flags);
++}
++
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
++{
++    unsigned int n; /* ~0 clears every flag, so each slot holding a buffer is released */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++)
++        ff_hevc_rpi_unref_frame(s, s->DPB + n, ~0);
++}
++
++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
++{
++    HEVCRpiFrame *slot = NULL;
++    unsigned int n;
++
++    /* Take the first DPB entry that currently holds no buffer */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB) && slot == NULL; n++) {
++        if (s->DPB[n].frame->buf[0] == NULL)
++            slot = s->DPB + n;
++    }
++    if (slot == NULL) {
++        av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
++        return NULL;
++    }
++
++    if (ff_thread_get_buffer(s->avctx, &slot->tf, AV_GET_BUFFER_FLAG_REF) < 0)
++        return NULL;
++
++    /* A collocated-MV buffer is attached only when used_for_ref is set and is_irap is not */
++    slot->col_mvf     = NULL;
++    slot->col_mvf_buf = NULL;
++    if (s->used_for_ref && !s->is_irap) {
++        slot->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
++        if (slot->col_mvf_buf == NULL) {
++            ff_hevc_rpi_unref_frame(s, slot, ~0);
++            return NULL;
++        }
++        slot->col_mvf = (ColMvField *)slot->col_mvf_buf->data;
++    }
++
++    slot->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
++    slot->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
++
++    return slot;
++}
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
++{
++    const HEVCRpiWindow *win;
++    HEVCRpiFrame *cur;
++    unsigned int n;
++
++    /* A POC may appear only once among the buffered frames of the sequence being decoded */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        const HEVCRpiFrame * const f = s->DPB + n;
++        const int live = f->frame->buf[0] != NULL && f->sequence == s->seq_decode;
++        if (live && f->poc == poc) {
++            av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", poc);
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    cur = alloc_frame(s);
++    if (cur == NULL)
++        return AVERROR(ENOMEM);
++
++    s->ref = cur;
++    *frame = cur->frame;
++
++    /* Starts as a short-term reference; also queued for output when pic_output_flag is set */
++    cur->flags = HEVC_FRAME_FLAG_SHORT_REF;
++    if (s->sh.pic_output_flag)
++        cur->flags |= HEVC_FRAME_FLAG_OUTPUT;
++    cur->poc      = poc;
++    cur->sequence = s->seq_decode;
++
++    win = &s->ps.sps->output_window;
++    cur->frame->crop_left   = win->left_offset;
++    cur->frame->crop_right  = win->right_offset;
++    cur->frame->crop_top    = win->top_offset;
++    cur->frame->crop_bottom = win->bottom_offset;
++
++    return 0;
++}
++
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
++{ // Returns 1 when a frame was referenced into 'out', 0 when nothing is output, <0 if av_frame_ref() failed
++ do {
++ int nb_output = 0;
++ int min_poc = INT_MAX;
++ int i, min_idx, ret; // min_idx is only meaningful once nb_output != 0
++ // Cancel pending output of earlier pictures (not bumping, not the current POC) when the slice header asks for it
++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
++ frame->sequence == s->seq_output) {
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++ }
++ }
++ }
++ // Count the frames awaiting output in the current output sequence and remember the lowest POC
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ HEVCRpiFrame *frame = &s->DPB[i];
++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
++ frame->sequence == s->seq_output) {
++ nb_output++;
++ if (frame->poc < min_poc || nb_output == 1) {
++ min_poc = frame->poc;
++ min_idx = i;
++ }
++ }
++ }
++
++ /* wait for more frames before output */
++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
++ return 0;
++
++ if (nb_output) {
++ HEVCRpiFrame *frame = &s->DPB[min_idx];
++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
++ return 0;
++ // The OUTPUT (and BUMPING) flags are dropped even when av_frame_ref() failed; the error is returned afterwards
++ ret = av_frame_ref(out, frame->frame);
++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
++ else
++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++ if (ret < 0)
++ return ret;
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "Output frame with POC %d.\n", frame->poc);
++ return 1;
++ }
++ // Nothing left in this output sequence: move on to the next one (counter wraps at 8 bits) or stop
++ if (s->seq_output != s->seq_decode)
++ s->seq_output = (s->seq_output + 1) & 0xff;
++ else
++ break;
++ } while (1);
++
++ return 0;
++}
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
++{
++    const HEVCRpiSPS * const sps = s->ps.sps;
++    int lowest_poc = INT_MAX;
++    int held = 0;
++    unsigned int n;
++
++    /* Count the in-use frames of the output sequence other than the current picture */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        const HEVCRpiFrame * const f = s->DPB + n;
++
++        if (f->flags != 0 && f->sequence == s->seq_output && f->poc != s->poc)
++            held++;
++    }
++
++    /* No SPS, or still below max_dec_pic_buffering of the highest sub-layer: nothing to do */
++    if (sps == NULL ||
++        held < sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering)
++        return;
++
++    /* Lowest POC among those frames whose only remaining flag is OUTPUT */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        const HEVCRpiFrame * const f = s->DPB + n;
++
++        if (f->sequence != s->seq_output || f->poc == s->poc)
++            continue;
++        if (f->flags == HEVC_FRAME_FLAG_OUTPUT && f->poc < lowest_poc)
++            lowest_poc = f->poc;
++    }
++
++    /* Mark every output-pending frame of the sequence up to that POC as bumping */
++    for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++        HEVCRpiFrame * const f = s->DPB + n;
++        const int pending = (f->flags & HEVC_FRAME_FLAG_OUTPUT) != 0;
++
++        if (pending && f->sequence == s->seq_output && f->poc <= lowest_poc)
++            f->flags |= HEVC_FRAME_FLAG_BUMPING;
++    }
++}
++
++static int init_slice_rpl(HEVCRpiContext *s)
++{
++    /* Point s->refPicList at this slice's entry in rpl_tab; reject an index past the table */
++    if (s->slice_idx >= s->rpl_tab_size)
++        return AVERROR_INVALIDDATA;
++    s->refPicList = &s->rpl_tab[s->slice_idx].refPicList[0];
++    return 0;
++}
++
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
++{ // Builds s->refPicList[0..nb_list-1] for the current slice; returns 0 or a negative AVERROR
++ RpiSliceHeader *sh = &s->sh;
++
++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; // B slices use two lists, other slice types one
++ uint8_t list_idx;
++ int i, j, ret;
++
++ ret = init_slice_rpl(s);
++ if (ret < 0)
++ return ret;
++ // Without any candidate the concatenation loop below could never make progress
++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
++ s->rps[LT_CURR].nb_refs)) {
++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ for (list_idx = 0; list_idx < nb_list; list_idx++) {
++ RefPicList rpl_tmp = { { 0 } };
++ RefPicList *rpl = &s->refPicList[list_idx];
++
++ /* The order of the elements is
++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
++ list_idx ? ST_CURR_BEF : ST_CURR_AFT,
++ LT_CURR };
++
++ /* concatenate the candidate lists for the current frame */
++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { // the candidates are repeated until enough entries exist
++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
++ RefPicList *rps = &s->rps[cand_lists[i]];
++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j];
++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; // index 2 of cand_lists is LT_CURR
++ rpl_tmp.nb_refs++;
++ }
++ }
++ }
++
++ /* reorder the references if necessary */
++ if (sh->rpl_modification_flag[list_idx]) {
++ for (i = 0; i < sh->nb_refs[list_idx]; i++) {
++ int idx = sh->list_entry_lx[list_idx][i];
++
++ if (idx >= rpl_tmp.nb_refs) {
++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ rpl->list[i] = rpl_tmp.list[idx];
++ rpl->ref[i] = rpl_tmp.ref[idx];
++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
++ rpl->nb_refs++; // NOTE(review): assumes rpl->nb_refs is 0 on entry; it is not reset in this function - confirm
++ }
++ } else {
++ memcpy(rpl, &rpl_tmp, sizeof(*rpl));
++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
++ }
++ // Record the collocated picture on the current frame when this is the collocated list and the index is valid
++ if (sh->collocated_list == list_idx &&
++ sh->collocated_ref_idx < rpl->nb_refs)
++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
++ }
++
++ return 0;
++}
++
++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
++{
++    const int lsb_mask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
++    unsigned int pass, n;
++
++    /* Two scans over the buffered frames of the sequence being decoded: the first
++     * accepts only a match on the POC LSBs, the second also accepts the full POC,
++     * so an LSB match anywhere in the DPB wins over a full-POC match. */
++    for (pass = 0; pass != 2; pass++) {
++        for (n = 0; n != FF_ARRAY_ELEMS(s->DPB); n++) {
++            HEVCRpiFrame * const cand = s->DPB + n;
++
++            if (cand->frame->buf[0] == NULL || cand->sequence != s->seq_decode)
++                continue;
++            if ((cand->poc & lsb_mask) == poc || (pass != 0 && cand->poc == poc))
++                return cand;
++        }
++    }
++
++    /* A missing reference is not logged for CRA and BLA pictures */
++    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Could not find ref with POC %d\n", poc);
++
++    return NULL;
++}
++
++/**
++ * Replace the reference marking of a frame.
++ *
++ * The long-term and short-term reference bits are cleared and the bits given
++ * in flag are then set; all other flag bits are left untouched.
++ */
++static void mark_ref(HEVCRpiFrame *frame, int flag)
++{
++    const int ref_bits = HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF;
++
++    frame->flags = (frame->flags & ~ref_bits) | flag;
++}
++
++/**
++ * Build a stand-in frame for a reference picture that could not be found.
++ *
++ * A fresh frame is obtained from alloc_frame() and every sample is set to
++ * 1 << (bit_depth - 1).  The frame is tagged with the requested POC and the
++ * current decode sequence, its flags are cleared, and
++ * ff_hevc_rpi_progress_set_all_done() is called on it before it is returned.
++ *
++ * @param s   decoder context (supplies the active SPS and seq_decode)
++ * @param poc POC to assign to the generated frame
++ * @return the new frame, or NULL if alloc_frame() failed
++ */
++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
++{
++ HEVCRpiFrame *frame;
++ int i, x, y;
++
++ frame = alloc_frame(s);
++ if (!frame)
++ return NULL;
++
++ if (!s->ps.sps->pixel_shift) {
++ /* pixel_shift == 0: fill every attached buffer in one go with memset */
++ for (i = 0; frame->frame->buf[i]; i++)
++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
++ frame->frame->buf[i]->size);
++ } else {
++ /* otherwise write one 16-bit value per sample, plane by plane, using the
++ * per-plane subsampling shifts from the SPS */
++ for (i = 0; frame->frame->data[i]; i++)
++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
++ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
++ /* NOTE(review): the row stride is taken from frame_stride1(frame, 1)
++ * for every plane i, not from a per-plane value -- confirm this is
++ * correct for the frame layouts used here */
++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
++ 1 << (s->ps.sps->bit_depth - 1));
++ }
++ }
++
++ frame->poc = poc;
++ frame->sequence = s->seq_decode;
++ frame->flags = 0;
++
++ ff_hevc_rpi_progress_set_all_done(frame);
++
++ return frame;
++}
++
++/**
++ * Append the frame with the given POC to a candidate list and mark it in
++ * the DPB.
++ *
++ * The frame is looked up with find_ref_idx(); if it is missing a substitute
++ * is created with generate_missing_ref().
++ *
++ * @param s        decoder context
++ * @param list     candidate list that receives the entry
++ * @param poc      POC of the wanted reference
++ * @param ref_flag reference flag passed on to mark_ref()
++ * @return 0 on success, AVERROR_INVALIDDATA if the lookup yields the current
++ *         frame (s->ref) or the list is already full, AVERROR(ENOMEM) if a
++ *         substitute frame could not be created
++ */
++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
++                             int poc, int ref_flag)
++{
++    HEVCRpiFrame *frame = find_ref_idx(s, poc);
++    int slot;
++
++    if (frame == s->ref)
++        return AVERROR_INVALIDDATA;
++    if (list->nb_refs >= HEVC_MAX_REFS)
++        return AVERROR_INVALIDDATA;
++
++    if (!frame) {
++        frame = generate_missing_ref(s, poc);
++        if (!frame)
++            return AVERROR(ENOMEM);
++    }
++
++    slot = list->nb_refs;
++    list->list[slot] = frame->poc;
++    list->ref[slot]  = frame;
++    list->nb_refs    = slot + 1;
++
++    mark_ref(frame, ref_flag);
++    return 0;
++}
++
++/**
++ * Build the reference picture set candidate lists for the current frame.
++ *
++ * Without a short-term RPS in the slice header only the first two lists are
++ * emptied and the function returns at once.  Otherwise the reference marking
++ * of every DPB frame except the current one is cleared, all lists are
++ * emptied, and the short-term and then long-term entries are added through
++ * add_candidate_ref().  Whether or not an entry fails, every DPB slot is
++ * finally passed to ff_hevc_rpi_unref_frame() with flags 0.
++ *
++ * @param s decoder context
++ * @return 0 on success, otherwise the negative error code of the first
++ *         add_candidate_ref() call that failed
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
++{
++    const ShortTermRPS *st = s->sh.short_term_rps;
++    const LongTermRPS  *lt = &s->sh.long_term_rps;
++    RefPicList *rps = s->rps;
++    int n, err = 0;
++
++    if (!st) {
++        rps[1].nb_refs = 0;
++        rps[0].nb_refs = 0;
++        return 0;
++    }
++
++    /* drop the reference marking of every DPB frame but the current one */
++    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
++        if (&s->DPB[n] != s->ref)
++            mark_ref(&s->DPB[n], 0);
++    }
++
++    for (n = 0; n < NB_RPS_TYPE; n++)
++        rps[n].nb_refs = 0;
++
++    /* short-term entries; the loop stops after the first failure */
++    for (n = 0; err >= 0 && n < st->num_delta_pocs; n++) {
++        int which;
++
++        if (!st->used[n])
++            which = ST_FOLL;
++        else
++            which = n < st->num_negative_pics ? ST_CURR_BEF : ST_CURR_AFT;
++
++        err = add_candidate_ref(s, &rps[which], s->poc + st->delta_poc[n],
++                                HEVC_FRAME_FLAG_SHORT_REF);
++    }
++
++    /* long-term entries, skipped entirely if a short-term entry failed */
++    for (n = 0; err >= 0 && n < lt->nb_refs; n++) {
++        const int which = lt->used[n] ? LT_CURR : LT_FOLL;
++
++        err = add_candidate_ref(s, &rps[which], lt->poc[n],
++                                HEVC_FRAME_FLAG_LONG_REF);
++    }
++
++    /* release any frames that are now unused (also on the error path) */
++    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++)
++        ff_hevc_rpi_unref_frame(s, &s->DPB[n], 0);
++
++    return err;
++}
++
++/**
++ * Count the reference pictures flagged as used by the current slice.
++ *
++ * @param s decoder context; the short-term RPS pointer may be NULL, in which
++ *          case only the long-term entries are counted
++ * @return number of short-term plus long-term entries whose used flag is set
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
++{
++    const ShortTermRPS *st = s->sh.short_term_rps;
++    const LongTermRPS *lt  = &s->sh.long_term_rps;
++    int count = 0;
++    int n = 0;
++
++    if (st) {
++        /* entries up to num_negative_pics first ... */
++        while (n < st->num_negative_pics) {
++            if (st->used[n])
++                count++;
++            n++;
++        }
++        /* ... then whatever remains up to num_delta_pocs */
++        while (n < st->num_delta_pocs) {
++            if (st->used[n])
++                count++;
++            n++;
++        }
++    }
++
++    /* the long-term RPS is embedded in the slice header, so it always exists */
++    for (n = 0; n < lt->nb_refs; n++) {
++        if (lt->used[n])
++            count++;
++    }
++
++    return count;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.c
+@@ -0,0 +1,368 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "golomb.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++
++/**
++ * Parse a decoded picture hash SEI payload.
++ *
++ * An 8-bit hash type is read, followed by one hash per component for three
++ * components (the chroma format is not consulted).  Type 0 stores 16 bytes
++ * per component in s->md5 and sets s->is_md5; types 1 and 2 skip 16 and 32
++ * bits per component respectively; any other type consumes nothing more.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
++{
++    const uint8_t hash_type = get_bits(gb, 8);
++    int plane, n;
++
++    for (plane = 0; plane < 3; plane++) {
++        switch (hash_type) {
++        case 0: /* MD5 digest, kept */
++            s->is_md5 = 1;
++            for (n = 0; n < 16; n++)
++                s->md5[plane][n] = get_bits(gb, 8);
++            break;
++        case 1: /* picture_crc, ignored */
++            skip_bits(gb, 16);
++            break;
++        case 2: /* picture_checksum, ignored */
++            skip_bits(gb, 32);
++            break;
++        default:
++            break;
++        }
++    }
++    return 0;
++}
++
++/**
++ * Parse a mastering display colour volume SEI payload.
++ *
++ * Reads three pairs of 16-bit primaries, a 16-bit white point pair and two
++ * 32-bit luminance values (max, then min) into s.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
++{
++    int n;
++
++    /* display primaries, two coordinates each */
++    for (n = 0; n < 3; n++) {
++        uint16_t *xy = s->display_primaries[n];
++
++        xy[0] = get_bits(gb, 16);
++        xy[1] = get_bits(gb, 16);
++    }
++
++    /* white point (x, y) */
++    for (n = 0; n < 2; n++)
++        s->white_point[n] = get_bits(gb, 16);
++
++    /* luminance range of the mastering display: max first, then min */
++    s->max_luminance = get_bits_long(gb, 32);
++    s->min_luminance = get_bits_long(gb, 32);
++
++    /* The SEI arrives before the first frame that references it, so the
++     * flag starts at 2 and is decremented on IRAP access units; that keeps
++     * it alive for the coded video sequence (e.g. between two IRAPs). */
++    s->present = 2;
++    return 0;
++}
++
++/**
++ * Parse a content light level SEI payload: two 16-bit values, the maximum
++ * content light level followed by the maximum picture-average light level.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
++{
++    /* The SEI arrives before the first frame that references it, so the
++     * flag starts at 2 and is decremented on IRAP access units; that keeps
++     * it alive for the coded video sequence (e.g. between two IRAPs).
++     * Setting it does not depend on the values read below. */
++    s->present = 2;
++
++    s->max_content_light_level     = get_bits_long(gb, 16);
++    s->max_pic_average_light_level = get_bits_long(gb, 16);
++
++    return 0;
++}
++
++/**
++ * Parse a frame packing arrangement SEI payload.
++ *
++ * The arrangement id is read and discarded.  The next bit is inverted into
++ * s->present; only when present is non-zero are the arrangement type,
++ * quincunx flag, content interpretation type and current_frame_is_frame0
++ * flag stored, with the remaining fields skipped.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
++{
++ get_ue_golomb_long(gb); // frame_packing_arrangement_id
++ // the bit read here is inverted: a set bit means "not present"
++ s->present = !get_bits1(gb);
++
++ if (s->present) {
++ s->arrangement_type = get_bits(gb, 7);
++ s->quincunx_subsampling = get_bits1(gb);
++ s->content_interpretation_type = get_bits(gb, 6);
++
++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
++ skip_bits(gb, 3);
++ s->current_frame_is_frame0_flag = get_bits1(gb);
++ // frame0_self_contained_flag, frame1_self_contained_flag
++ skip_bits(gb, 2);
++
++ // grid positions are only coded without quincunx sampling and for types other than 5
++ if (!s->quincunx_subsampling && s->arrangement_type != 5)
++ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte
++ skip_bits1(gb); // frame_packing_arrangement_persistence_flag
++ }
++ // read unconditionally, whether or not the arrangement is present
++ skip_bits1(gb); // upsampled_aspect_ratio_flag
++ return 0;
++}
++
++/**
++ * Parse a display orientation SEI payload.
++ *
++ * The first bit is inverted into s->present.  When present is non-zero the
++ * horizontal and vertical flip flags and a 16-bit anticlockwise rotation
++ * are stored and one trailing flag bit is skipped.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
++{
++    s->present = !get_bits1(gb);
++    if (!s->present)
++        return 0;
++
++    s->hflip = get_bits1(gb); /* hor_flip */
++    s->vflip = get_bits1(gb); /* ver_flip */
++    s->anticlockwise_rotation = get_bits(gb, 16);
++
++    skip_bits1(gb); /* display_orientation_persistence_flag */
++    return 0;
++}
++
++/**
++ * Parse a picture timing SEI payload.
++ *
++ * The SPS selected by s->active_seq_parameter_set_id decides whether the
++ * frame/field information byte is present.  If so, pic_struct is mapped to
++ * a top-field / bottom-field / unknown picture structure and that byte is
++ * consumed.  Whatever is left of the payload is skipped.
++ *
++ * @param size payload size in bytes
++ * @return 0 on success, AVERROR(ENOMEM) if the active SPS is not available
++ *         (NOTE(review): no allocation happens here; the code is kept as is)
++ */
++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
++                                     void *logctx, int size)
++{
++    HEVCSEIPictureTiming *timing = &s->picture_timing;
++    HEVCRpiSPS *active_sps;
++
++    if (!ps->sps_list[s->active_seq_parameter_set_id])
++        return AVERROR(ENOMEM);
++    active_sps = (HEVCRpiSPS *)ps->sps_list[s->active_seq_parameter_set_id]->data;
++
++    if (active_sps->vui.frame_field_info_present_flag) {
++        const int pic_struct = get_bits(gb, 4);
++
++        switch (pic_struct) {
++        case 2:
++        case 10:
++        case 12:
++            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
++            timing->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
++            break;
++        case 1:
++        case 9:
++        case 11:
++            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
++            timing->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
++            break;
++        default:
++            timing->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
++            break;
++        }
++
++        get_bits(gb, 2); /* source_scan_type */
++        get_bits(gb, 1); /* duplicate_flag */
++        skip_bits1(gb);  /* pads the fields above to a whole byte */
++        size--;
++    }
++
++    skip_bits_long(gb, 8 * size);
++    return 0;
++}
++
++/**
++ * Parse closed-caption user data and append the caption bytes to s.
++ *
++ * If the 8-bit type code is 0x3 and the process flag is set, cc_count
++ * triplets of bytes are appended to s->a53_caption (the buffer is grown with
++ * av_reallocp()).  For any other type code, size - 1 bytes are skipped.
++ *
++ * @param size number of payload bytes available to this function
++ * @return 0 on success, AVERROR(EINVAL) if size is below 3 or the grown
++ *         buffer would exceed INT_MAX, or the av_reallocp() error code
++ */
++static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
++ int size)
++{
++ int flag;
++ int user_data_type_code;
++ int cc_count;
++
++ if (size < 3)
++ return AVERROR(EINVAL);
++
++ user_data_type_code = get_bits(gb, 8);
++ if (user_data_type_code == 0x3) {
++ skip_bits(gb, 1); // reserved
++
++ flag = get_bits(gb, 1); // process_cc_data_flag
++ if (flag) {
++ skip_bits(gb, 1);
++ cc_count = get_bits(gb, 5);
++ skip_bits(gb, 8); // reserved
++ // NOTE(review): three bytes have been consumed at this point but size is
++ // reduced by two -- confirm the accounting is intended
++ size -= 2;
++
++ if (cc_count && size >= cc_count * 3) {
++ // computed in 64 bits so the INT_MAX check below cannot itself overflow
++ const uint64_t new_size = (s->a53_caption_size + cc_count
++ * UINT64_C(3));
++ int i, ret;
++
++ if (new_size > INT_MAX)
++ return AVERROR(EINVAL);
++
++ /* Allow merging of the cc data from two fields. */
++ ret = av_reallocp(&s->a53_caption, new_size);
++ if (ret < 0)
++ return ret;
++
++ // each caption entry is three bytes, appended after the existing data
++ for (i = 0; i < cc_count; i++) {
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++ }
++ skip_bits(gb, 8); // marker_bits
++ }
++ // NOTE(review): when the flag is clear or the count check fails, the rest
++ // of the payload is not skipped here
++ }
++ } else {
++ int i;
++ // unknown type: skip the remainder (the type code byte is already consumed)
++ for (i = 0; i < size - 1; i++)
++ skip_bits(gb, 8);
++ }
++
++ return 0;
++}
++
++/**
++ * Parse a "user data registered by ITU-T T.35" SEI payload.
++ *
++ * The header is a country code byte (plus one extension byte when the
++ * country code is 0xFF), two further bytes that are skipped, and a 32-bit
++ * user identifier.  Payloads identified as 'GA94' are handed to the
++ * closed-caption parser; everything else is skipped.
++ *
++ * @param size payload size in bytes
++ * @return 0 on success, AVERROR(EINVAL) if the payload is shorter than the
++ *         7-byte minimum header, AVERROR_INVALIDDATA if the country code
++ *         announces an extension byte the payload has no room for, or the
++ *         result of the closed-caption parser
++ */
++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
++                                                         int size)
++{
++    uint32_t country_code;
++    uint32_t user_identifier;
++
++    /* country code + 2 skipped bytes + 4-byte identifier */
++    if (size < 7)
++        return AVERROR(EINVAL);
++    size -= 7;
++
++    country_code = get_bits(gb, 8);
++    if (country_code == 0xFF) {
++        /* The extension byte is not part of the 7 bytes accounted for
++         * above.  Without this check size would drop to -1 and the default
++         * branch below would call skip_bits_long() with a negative count. */
++        if (size < 1)
++            return AVERROR_INVALIDDATA;
++        skip_bits(gb, 8);
++        size--;
++    }
++
++    /* two bytes preceding the identifier, not interpreted here */
++    skip_bits(gb, 8);
++    skip_bits(gb, 8);
++
++    user_identifier = get_bits_long(gb, 32);
++
++    switch (user_identifier) {
++    case MKBETAG('G', 'A', '9', '4'):
++        return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
++    default:
++        skip_bits_long(gb, size * 8);
++        break;
++    }
++    return 0;
++}
++
++/**
++ * Parse an active parameter sets SEI payload.
++ *
++ * Only the first active SPS id is kept (in s->active_seq_parameter_set_id);
++ * the leading fixed-length fields and any further SPS ids are read and
++ * discarded.
++ *
++ * @return 0 on success, AVERROR_INVALIDDATA if num_sps_ids_minus1 is outside
++ *         0..15 or the SPS id is not below HEVC_MAX_SPS_COUNT
++ */
++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
++{
++    int num_sps_ids_minus1;
++    int i;
++    unsigned active_seq_parameter_set_id;
++
++    get_bits(gb, 4); // active_video_parameter_set_id
++    get_bits(gb, 1); // self_contained_cvs_flag
++    get_bits(gb, 1); // no_parameter_set_update_flag
++    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
++
++    /* the "< 0" half catches values that wrapped when stored in an int */
++    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
++        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
++        return AVERROR_INVALIDDATA;
++    }
++
++    active_seq_parameter_set_id = get_ue_golomb_long(gb);
++    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
++        /* the id is unsigned: print it with %u so large values are not
++         * reported as negative numbers */
++        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %u invalid\n", active_seq_parameter_set_id);
++        return AVERROR_INVALIDDATA;
++    }
++    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
++
++    /* remaining ids are consumed but not stored */
++    for (i = 1; i <= num_sps_ids_minus1; i++)
++        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
++
++    return 0;
++}
++
++/**
++ * Parse an alternative transfer characteristics SEI payload: a single
++ * 8-bit preferred transfer characteristics value.
++ *
++ * @return always 0
++ */
++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
++{
++    const int preferred = get_bits(gb, 8);
++
++    s->preferred_transfer_characteristics = preferred;
++    s->present = 1;
++    return 0;
++}
++
++/**
++ * Dispatch one prefix SEI message to the parser for its payload type.
++ *
++ * Payload types without a parser are logged at debug level and skipped.
++ *
++ * @param type payload type
++ * @param size payload size in bytes
++ * @return the result of the selected parser, or 0 for skipped payloads
++ */
++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
++                                 int type, int size)
++{
++    int ret = 0;
++
++    switch (type) {
++    case 256: /* Mismatched value from HM 8.1 */
++        ret = decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++        break;
++    case HEVC_SEI_TYPE_FRAME_PACKING:
++        ret = decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
++        break;
++    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
++        ret = decode_nal_sei_display_orientation(&s->display_orientation, gb);
++        break;
++    case HEVC_SEI_TYPE_PICTURE_TIMING:
++        ret = decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
++        break;
++    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
++        ret = decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
++        break;
++    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
++        ret = decode_nal_sei_content_light_info(&s->content_light, gb);
++        break;
++    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
++        ret = decode_nal_sei_active_parameter_sets(s, gb, logctx);
++        break;
++    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
++        ret = decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
++        break;
++    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
++        ret = decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
++        break;
++    default:
++        /* no parser for this payload type: step over it */
++        av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
++        skip_bits_long(gb, 8 * size);
++        break;
++    }
++
++    return ret;
++}
++
++/**
++ * Dispatch one suffix SEI message.  Only the decoded picture hash is
++ * parsed; any other payload type is logged at debug level and skipped.
++ *
++ * @param type payload type
++ * @param size payload size in bytes
++ * @return the result of the hash parser, or 0 for skipped payloads
++ */
++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                                 int type, int size)
++{
++    if (type == HEVC_SEI_TYPE_DECODED_PICTURE_HASH)
++        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++
++    av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
++    skip_bits_long(gb, 8 * size);
++    return 0;
++}
++
++/**
++ * Decode one SEI message: read its payload type and size, then dispatch to
++ * the prefix or suffix handler according to the NAL unit type.
++ *
++ * Type and size are each coded as a run of bytes that are summed; the run
++ * continues for as long as the byte just read is 0xFF.
++ *
++ * @param nal_unit_type HEVC_NAL_SEI_PREFIX selects the prefix handler, any
++ *                      other value the suffix handler
++ * @return result of the handler, or AVERROR_INVALIDDATA if the header is
++ *         truncated, the type would overflow, or the declared payload size
++ *         exceeds the data left in the bitstream
++ */
++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
++                                  const HEVCRpiParamSets * const ps, const int nal_unit_type)
++{
++    int payload_type = 0;
++    int payload_size = 0;
++    int byte = 0xFF;
++    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
++
++    while (byte == 0xFF) {
++        /* need this byte plus at least one size byte; keep the sum in range */
++        if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
++            return AVERROR_INVALIDDATA;
++        byte = get_bits(gb, 8);
++        payload_type += byte;
++    }
++    byte = 0xFF;
++    while (byte == 0xFF) {
++        /* bounding the running size by the remaining data also keeps
++         * payload_size from overflowing */
++        if (get_bits_left(gb) < 8 + 8LL*payload_size)
++            return AVERROR_INVALIDDATA;
++        byte = get_bits(gb, 8);
++        payload_size += byte;
++    }
++    /* The loop only validated the size accumulated before the last byte was
++     * added; make sure the complete payload really fits in what is left so
++     * the handlers never work from a size that points past the data. */
++    if (get_bits_left(gb) < 8LL * payload_size)
++        return AVERROR_INVALIDDATA;
++
++    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
++        return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
++    } else { /* nal_unit_type == NAL_SEI_SUFFIX */
++        return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
++    }
++}
++
++/**
++ * Tell whether another SEI message follows: there must be bits left and the
++ * next byte must not be 0x80.
++ *
++ * @return non-zero if more data follows, 0 otherwise
++ */
++static int more_rbsp_data(GetBitContext *gb)
++{
++    if (get_bits_left(gb) <= 0)
++        return 0;
++
++    return show_bits(gb, 8) != 0x80;
++}
++
++/**
++ * Decode every SEI message contained in one SEI NAL unit.
++ *
++ * At least one message is always decoded; further messages are decoded for
++ * as long as more_rbsp_data() reports more data.
++ *
++ * @param type NAL unit type, forwarded to decode_nal_sei_message()
++ * @return 1 on success, or the negative error code of the first message
++ *         that failed
++ */
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                               const HEVCRpiParamSets *ps, int type)
++{
++    for (;;) {
++        const int err = decode_nal_sei_message(gb, logctx, s, ps, type);
++
++        if (err < 0)
++            return err;
++        if (!more_rbsp_data(gb))
++            break;
++    }
++
++    return 1;
++}
++
++/**
++ * Discard the caption data collected while parsing SEI messages: the
++ * buffer is freed (and its pointer cleared by av_freep()) and the stored
++ * size is reset to zero.
++ */
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
++{
++    HEVCSEIA53Caption *cc = &s->a53_caption;
++
++    av_freep(&cc->a53_caption);
++    cc->a53_caption_size = 0;
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.h
+@@ -0,0 +1,135 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_SEI_H
++#define AVCODEC_RPI_HEVC_SEI_H
++
++#include <stdint.h>
++
++#include "libavutil/md5.h"
++
++#include "get_bits.h"
++
++/**
++ * SEI message types
++ *
++ * These are the payload type values that the prefix/suffix dispatchers in
++ * rpi_hevc_sei.c compare against.  Types without a dedicated parser there
++ * are logged and skipped.
++ */
++typedef enum {
++ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
++ HEVC_SEI_TYPE_PICTURE_TIMING = 1,
++ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
++ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
++ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
++ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
++ HEVC_SEI_TYPE_RECOVERY_POINT = 6,
++ HEVC_SEI_TYPE_SCENE_INFO = 9,
++ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
++ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
++ HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
++ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
++ HEVC_SEI_TYPE_FRAME_PACKING = 45,
++ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
++ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
++ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
++ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
++ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
++ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132,
++ HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
++ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
++ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
++ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
++ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
++} HEVC_SEI_Type;
++
++/** Decoded picture hash SEI state. */
++typedef struct HEVCSEIPictureHash {
++ uint8_t md5[3][16]; /* one 16-byte digest per component index (0..2) */
++ uint8_t is_md5; /* set to 1 by the parser when the hash type is 0 */
++} HEVCSEIPictureHash;
++
++/** Frame packing arrangement SEI state. */
++typedef struct HEVCSEIFramePacking {
++ int present; /* inverse of the flag bit that follows the arrangement id */
++ int arrangement_type; /* 7-bit value from the bitstream */
++ int content_interpretation_type; /* 6-bit value from the bitstream */
++ int quincunx_subsampling; /* 1-bit flag */
++ int current_frame_is_frame0_flag; /* 1-bit flag */
++} HEVCSEIFramePacking;
++
++/** Display orientation SEI state. */
++typedef struct HEVCSEIDisplayOrientation {
++ int present; /* inverse of the first payload bit */
++ int anticlockwise_rotation; /* 16-bit value from the bitstream */
++ int hflip, vflip; /* 1-bit horizontal / vertical flip flags */
++} HEVCSEIDisplayOrientation;
++
++/** Picture timing SEI state. */
++typedef struct HEVCSEIPictureTiming {
++ int picture_struct; /* one of the AV_PICTURE_STRUCTURE_* values set by the parser */
++} HEVCSEIPictureTiming;
++
++/** Closed-caption bytes accumulated from registered user data SEI messages. */
++typedef struct HEVCSEIA53Caption {
++ int a53_caption_size; /* number of valid bytes in a53_caption */
++ uint8_t *a53_caption; /* grown with av_reallocp(), released by ff_hevc_rpi_reset_sei() */
++} HEVCSEIA53Caption;
++
++/** Mastering display SEI state; all values are stored as read from the bitstream. */
++typedef struct HEVCSEIMasteringDisplay {
++ int present; /* set to 2 by the parser when the SEI is received */
++ uint16_t display_primaries[3][2]; /* three primaries, two coordinates each */
++ uint16_t white_point[2]; /* white point (x, y) */
++ uint32_t max_luminance;
++ uint32_t min_luminance;
++} HEVCSEIMasteringDisplay;
++
++/** Content light level SEI state. */
++typedef struct HEVCSEIContentLight {
++ int present; /* set to 2 by the parser when the SEI is received */
++ uint16_t max_content_light_level; /* 16-bit value from the bitstream */
++ uint16_t max_pic_average_light_level; /* 16-bit value from the bitstream */
++} HEVCSEIContentLight;
++
++/** Alternative transfer characteristics SEI state. */
++typedef struct HEVCSEIAlternativeTransfer {
++ int present; /* set to 1 by the parser when the SEI is received */
++ int preferred_transfer_characteristics; /* 8-bit value from the bitstream */
++} HEVCSEIAlternativeTransfer;
++
++/**
++ * Aggregate of everything the SEI parser retains: one sub-structure per
++ * parsed payload type, plus the SPS id taken from the active parameter sets
++ * SEI.
++ */
++typedef struct HEVCSEIContext {
++ HEVCSEIPictureHash picture_hash;
++ HEVCSEIFramePacking frame_packing;
++ HEVCSEIDisplayOrientation display_orientation;
++ HEVCSEIPictureTiming picture_timing;
++ HEVCSEIA53Caption a53_caption;
++ HEVCSEIMasteringDisplay mastering_display;
++ HEVCSEIContentLight content_light;
++ /* validated to be below HEVC_MAX_SPS_COUNT; used to index the SPS list when parsing picture timing */
++ int active_seq_parameter_set_id;
++ HEVCSEIAlternativeTransfer alternative_transfer;
++} HEVCSEIContext;
++
++struct HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++ const struct HEVCRpiParamSets *ps, int type);
++
++/**
++ * Reset SEI values that are stored on the Context.
++ * e.g. Caption data that was extracted during NAL
++ * parsing.
++ *
++ * @param s HEVCSEIContext whose stored SEI values are reset.
++ */
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
++
++#endif /* AVCODEC_RPI_HEVC_SEI_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.c
+@@ -0,0 +1,1537 @@
++#include "rpi_hevc_shader.h"
++
++#ifdef _MSC_VER
++ #include <stdint.h>
++ /* cast through uintptr_t to avoid warnings */
++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
++#else
++ #define POINTER_TO_UINT(X) ((unsigned int)(X))
++#endif
++
++#ifdef __cplusplus
++extern "C" { /* the types are probably wrong... */
++#endif
++#ifdef __cplusplus
++}
++#endif
++
++#ifdef _MSC_VER
++__declspec(align(8))
++#elif defined(__GNUC__)
++__attribute__((aligned(8)))
++#endif
++unsigned int ff_hevc_rpi_shader[] = {
++// ::mc_setup_c_q0
++// ::mc_start
++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c_qn
++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
++/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++// :1
++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_sync_q0
++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q1
++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q2
++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q3
++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q4
++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q5
++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q6
++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q7
++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q8
++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q9
++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q10
++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q11
++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y_q0
++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y_qn
++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
++/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++// ::mc_filter_y_pxx
++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_bxx
++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_p00
++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_b00
++/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_b
++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++// :1
++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
++/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_sync10_q0
++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q1
++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q2
++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q3
++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q4
++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q5
++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q6
++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q7
++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q8
++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q9
++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q10
++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync10_q11
++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_q0
++// ::mc_exit_y10_q0
++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_qn
++// ::mc_exit_y10_qn
++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y10_q0
++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y10_qn
++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_10
++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++// ::mc_filter_y10_pxx
++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_p00
++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_bxx
++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_b00
++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_end
++};
++#ifdef __HIGHC__
++#pragma Align_to(8, ff_hevc_rpi_shader)
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.h
+@@ -0,0 +1,63 @@
++#ifndef rpi_hevc_shader_H
++#define rpi_hevc_shader_H
++
++extern unsigned int ff_hevc_rpi_shader[]; /* Assembled QPU code; each instruction occupies two array elements */
++
++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) /* Entry-point offsets are in array elements (listing byte address / 4) */
++#define mc_start (ff_hevc_rpi_shader + 0)
++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
++#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
++#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
++#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
++#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
++#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
++#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
++#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
++#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
++#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
++#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
++#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
++#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
++#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
++#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
++#define mc_end (ff_hevc_rpi_shader + 2860) /* One past the last instruction */
++
++#endif /* rpi_hevc_shader_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1850 @@
++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++# All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions are met:
++# * Redistributions of source code must retain the above copyright
++# notice, this list of conditions and the following disclaimer.
++# * Redistributions in binary form must reproduce the above copyright
++# notice, this list of conditions and the following disclaimer in the
++# documentation and/or other materials provided with the distribution.
++# * Neither the name of the copyright holder nor the
++# names of its contributors may be used to endorse or promote products
++# derived from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++# Written by Peter de Rivaz, John Cox
++
++
++
++# Inter pred asm
++#
++# Logic here should be good to 14 bits without modification
++# but only 8 & 10 are currently instantiated & tested
++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
++# in _p00 & _b00
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# Number limits in P/B calculation
++#
++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
++# we offset our intermediates s.t. they always end up +ve before the next
++# multiply (may be -ve whilst summing but that doesn't matter).
++#
++# Range calc for up to 14 bits (Y-B pred):
++#
++# denom: [0, 7]
++# bmax = (1 << bits) - 1
++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
++#
++# wt_mul: [-128, 255]
++# wt_off = off * 2 + 1: [-bmax, bmax]
++#
++# pel: [0, bmax]
++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
++#
++# This all looks good and is mostly bit depth independent - and as we manage
++# to do unsigned multiplies everywhere (now) this should be good for any bit
++# depth up to 14 (we could probably do 16 - but that requires a few tweaks
++# to the shifts we don't currently have logic for)
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# As the test for read-next is in the main part of the Luma loop (rather than
++# the preload FIFO part) we are limited to min_luma_height - 1
++# Min_luma_height is 4 so we can only have a preload of 3
++# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
++# in chroma without abandoning preload pretty much entirely (which would be bad)
++#
++# Timing tests vs preload of 4 suggests this doesn't hurt us much
++# Could have preread 4 for Chroma but when tested it didn't help
++
++.set PREREAD, 3
++
++# Offset added (effectively) at the exit of the H FIR filter
++# This is enough to force the result +ve
++# Is good if it is a power of 2 as that allows for >> without loss
++#
++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
++# Round up to next power of 2
++
++.set FIR_OFFSET, 0x4000
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# Value to add to the weight multiplier to convert it into an unsigned value
++# Should be power of two for convenience
++
++.set LOG2_MUL_ADD, 14
++.set MUL_ADD, (1 << LOG2_MUL_ADD)
++
++# Fixed denom (max that it can be set to)
++.set DENOM, 7
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-11
++# V FIFO / temp / free
++
++# -- free -- ra12
++
++# -- free -- ra13
++
++# -- free -- ra14
++
++# -- free -- ra15
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff800100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k128, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
++# 2nd byte but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# --free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Misc offsets
++.set ra_fir_off_val_wt_den_p7, ra25
++.set ra_wt_den_p7, ra25.8a
++# -- free -- ra25.8b
++.set ra_fir_off_val, ra25.16b
++
++# As it happens these constants are the same
++.if FIR_OFFSET == MUL_ADD
++# Weight multiplier unsigned add
++.set ra_kmul_add, ra_fir_off_val
++.else
++.error "FIR_OFFSET != MUL_ADD: Need new register & init"
++.endif
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set ra_dma0, ra27
++
++# Loop: destination address
++.set ra_dest, ra28
++
++# Setup: Dup of rb_ef
++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
++# (top bits are ignored by mul24)
++.set ra_ef, ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++# Duped into ra_ef as sometimes that is easier to use
++.set rb_ef, rb3
++
++# rb4-11
++# Loop: V filter FIFO or V filter coeff
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# -- free -- rb13
++
++# -- free -- rb14
++
++# Loop: src frame base (L1)
++.set rb_base2, rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# These 3 consts each save 1 instruction in Y loop setup
++# so whilst they are worthwhile they should be the 1st to die if we need
++# another b reg
++.set rb_y_coeffs_2, rb21 # 0x050b0a00
++.set rb_y_coeffs_3, rb22 # 0x11283a40
++.set rb_y_coeffs_5, rb23 # 0x0a0b0500
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb24
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb25
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# Setup: FIR H offset
++.set rb_fir_off_h, rb31
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16, -16
++.set i_shift21, -11
++.set i_shift23, -9
++.set i_shift30, -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
++ mov r2, qpu_num # r2 = this QPU's number; macro derives a per-QPU VPM write setup (r_vpm) and DMA store setup (r_dma). Clobbers r0-r2
++.if v_bit_depth <= 8
++ # 8 bit version: r0 = (qpu_num & 3) | ((qpu_num >> 2) << 6)
++ asr r1, r2, 2
++ shl r1, r1, 6
++ and r0, r2, 3
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5 # Move the per-QPU offset into place for the vdw_setup_0 word
++
++.else
++ # 16 bit version: r0 = (qpu_num & 1) | ((qpu_num >> 1) << (4 or 5, by block height))
++ # Limited to 8 QPUs if blk height > 8
++ asr r1, r2, 1
++.if v_blk_height <= 8
++ shl r1, r1, 4
++.else
++ shl r1, r1, 5
++.endif
++ and r0, r2, 1
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
++ add r_vpm, r0, r1 # VPM 16bit storage
++
++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++ shl r0, r0, 6
++.endif
++ add r_dma, r0, r1 # DMA out: vdw_setup_0 template + shifted per-QPU offset
++.endm
++
++
++.macro m_setup_q0
++ srel -, 12 # Semaphore release on semaphore 12; mc_setup_c_q0 expands this and then falls through into mc_setup_c_qn. NOTE(review): presumably paired with an acquire in the mc_sync code - confirm
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_c
++#
++# typedef struct qpu_mc_pred_c_s_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint32_t pic_cw; // C Width (== Y width / 2)
++# uint32_t pic_ch; // C Height (== Y Height / 2)
++# uint32_t stride2;
++# uint32_t stride1;
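++# uint32_t wdenom; // NOTE(review): m_setup_c reads 9 uniforms and takes the 7th as x2/y2, so no word is consumed for this field - check against ff_hevc_rpi_shader_cmd.h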
++# int16_t y2;
++# int16_t x2;
++# uint32_t base2;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_s_t;
++
++.macro m_setup_c, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_pmask, 0xff
++.set v_blk_height, C_BLK_HEIGHT_8
++.else
++.set v_x_shift, 2
++.set v_pmask, 0xffff
++.set v_blk_height, C_BLK_HEIGHT_16
++.endif
++
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] # Per-element 2-bit values, shifted to the top of the word below to form rb_ef
++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base
++
++# Read image dimensions
++ sub r0, unif, 1 # pic c width
++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
++ sub rb_max_y, unif, 1 # pic c height
++
++# load constants
++ mov ra_kff800100, 0xff800100 # Bytes d..a = 0xff, 0x80, 0x01, 0x00 -> ra_k255, ra_k128, ra_k1, ra_k0 (low half 0x0100 = ra_k256)
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) # Low half = ra_pmax, byte c = ra_blk_height
++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) # High half = ra_fir_off_val, byte a = ra_wt_den_p7
++
++# get source pitch
++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2
++ mov rb_pitch, unif # stride1
++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++
++ and r0, 1, elem_num # r0 = elem_num & 1
++ nop ; mul24 r0, r0, 5 # r0 = 5 * (elem_num & 1)
++.if v_bit_depth <= 8
++ add rb_elem_x, r0, elem_num # rb_elem_x = elem_num + 5 * (elem_num & 1)
++.else
++ add r0, r0, elem_num
++ add rb_elem_x, r0, r0 # 16 bit: twice the 8 bit value
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
++ min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.else
++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch # r1 = -rb_pitch, used as a mask on x
++ and r1, r0, r1 # r1 = x & -pitch
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch # r0 = x with the masked bits cleared ; r1 = masked bits * xpitch
++ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2
++ add ra_base, ra_base, r0
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift
++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2
++ min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask bottom bits of xs
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1 ; mov r3, PREREAD
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r2, ra_y2
++ add rb_base2, rb_base2, r0 ; mov r0, ra_y
++
++# Do preloads
++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
++
++:1
++ sub.setf r3, r3, 1 # Count down the preload requests; flags feed the brr.anynz below
++ max r1, r0, 0
++ min r1, r1, rb_max_y # Clamp y to [0, rb_max_y]
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0 # Issue TMU0 request for the clamped row ; store y + 1
++
++ max r1, r2, 0
++ brr.anynz -, r:1b # Loop until r3 reaches 0; the instructions up to the '>>>' marker fill the branch delay slots
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, rb_base2, r1 ; mov ra_y2, r2 # Issue TMU1 request for the clamped row ; store y2 + 1
++# >>> .anynz 1b
++
++ mov ra_link, unif # link
++# touch registers to keep simulator happy (and fills in delay slots)
++ mov ra4, 0 ; mov rb4, 0
++ bra -, ra_link
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
++# >>> ra_link
++.endm
++
++::mc_setup_c_q0
++ m_setup_q0
++::mc_setup_c_qn
++ m_setup_c 8
++
++################################################################################
++#
++# mc_filter_c_p
++#
++# typedef struct qpu_mc_pred_c_p_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint16_t h;
++# uint16_t w;
++# uint32_t coeffs_x;
++# uint32_t coeffs_y;
++# uint32_t wo_u;
++# uint32_t wo_v;
++# uint32_t dst_addr_c;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_p_t;
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++# v_tmu: 0 => fetch via t0s/ldtmu0 using ra_y/ra_base, else via t1s/ldtmu1 using ra_y2/rb_base2
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_x_mul, 2
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_x_mul, 4
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, rb_base2
++.set vr_txs, t1s
++.endif
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
++ add vrx_base_next, r3, r0 ; mov r1, ra_height
++
++# set up VPM write
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# Misc final setup...
++
++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr
++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2)
++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4)
++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link
++
++# r5 = -4 (loop counter)
++# ra_wt_mul_l0 = weight L0 + 128 (now unsigned)
++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5)
++# rb31 = FIR value offset
++
++# FIFO: rb4, ra5, rb6, ra7
++# Coeffs in ra3.8a, ra3.8b, rb10, rb11
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
++# We fetch (after shift)
++# C0 : C3 : C1 : C4 : C2 : C5 : ...
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay]
++.endif
++
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, rb_max_y ; mov.ifnc r0, r2
++
++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++.if v_tmu == 0
++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes
++.else
++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes
++.endif
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
++# We would like to save the r5->r4 shift but we need a delay slot
++# for both r7 & r6 which we can't find anything to put in if we have
++# already multiplied r4 & r5!
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post
++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post
++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++# >>> .anyn 1b
++
++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay]
++ sub r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0
++
++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop
++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop
++ brr.anyn -, r:1b
++ asr r1, r1, i_wt_den_p6
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_p
++ m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++ m_filter_c_p 1, 8
++
++################################################################################
++#
++# mc_filter_c_b
++#
++# typedef struct qpu_mc_pred_c_b_s {
++# int16_t y;
++# int16_t x;
++# uint32_t base;
++# uint16_t h;
++# uint16_t w;
++# uint32_t coeffs_x1;
++# uint32_t coeffs_y1;
++# int16_t weight_u1;
++# int16_t weight_v1;
++# int16_t y2;
++# int16_t x2;
++# uint32_t base2;
++# uint32_t coeffs_x2;
++# uint32_t coeffs_y2;
++# uint32_t wo_u2;
++# uint32_t wo_v2;
++# uint32_t dst_addr_c;
++# uint32_t next_fn;
++# } qpu_mc_pred_c_b_t;
++
++.macro m_filter_c_b, v_bit_depth
++# Chroma bi-pred filter: L0 is fetched via TMU0 (ldtmu0/t0s), L1 via TMU1 (ldtmu1/t1s)
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs
++
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs)
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
++
++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x2
++
++# L1 - uniform layout could possibly be optimized
++
++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x2<<shift ; L1 H filter coeffs
++ add r0, r0, rb_elem_x ; mov ra3, unif # ; L1 V filter coeffs
++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++ max r0, r0, r5 ; mov ra9, rb_max_y
++ min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++ and r1, r0, r1 ; mov r5rep, -4
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dst_addr
++ add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++
++ add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++ add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++ add r0, r0, r1 ; mov r1, ra_wt_off_l1 # ; L0 off unset
++ shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++ sub rb_wt_off, r1, r0 ; mov ra_link, unif # ; link
++
++ mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++
++# r5 loop counter (-4)
++# ra0 H coeffs L0
++# ra1 H coeffs L1
++# ra2 V coeffs L0
++# ra3 V coeffs L1
++# ra9 rb_max_y alias
++# ra10 rb_xshift2 alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++ shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next # [ra_y delay]
++ add ra_y, 1, ra_y ; mov r3, ra_y
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # ; masks bytes
++
++# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
++
++ and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++ add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++
++ shr r2, r4, ra10 ; mov rb5, rb6
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
++ shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 # [r1 << delay]
++
++ add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax # ; masks bytes
++
++# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
++
++ add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++ sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++ mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++ shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++# >>> .anyn 1b
++
++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0
++ sub.setf -, r5, rb_lcount ; mov r0, ra4
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ add r1, r1, r0 ; mul24 r0, ra7, rb7
++
++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1
++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1
++ sub r2, r2, r0
++
++ shr r1, r1, 6
++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop
++
++ brr.anyn -, r:1b
++ asr r1, r1, ra_wt_den_p7
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_b
++ m_filter_c_b 8
++
++################################################################################
++# Exit code used by both Luma & Chroma so place between them to avoid I-cache
++# conflicts
++
++.macro m_exit_drain
++.if PREREAD == 2
++# PREREAD == 2: discard 2 results from each TMU unrolled (a loop is wasteful), then read vw_wait
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ nop ; nop ; ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1
++.else
++ mov.setf r3, PREREAD - 1
++:1
++ brr.anynz -, r:1b
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ sub.setf r3, r3, 1
++ # >>> .anynz 1b
++ mov -, vw_wait
++.endif
++.endm
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# lockup otherwise)
++#
++# There is some, currently ill defined, potential lockup if we have the VDM active
++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart. Unsure why, may have been bug, but this works
++# and we currently have both the memory & sems to support it.
++.macro m_sync_q, n_qpu, n_quads
++# Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++ mov ra_link, unif # Can only branch to an 'a' file register (ra_*), not r0
++ mov -, vw_wait # [ra_link delay]
++
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++# First QPU of a quad: one sacq per srel of n_sem_sync done by the other 3 QPUs, then quad-in sacq and the srels after the bra
++.set n_sem_quad_in, 12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ bra -, ra_link
++ sacq -, n_sem_quad_in
++ srel -, n_sem_out
++ srel -, n_sem_quad_out
++
++.else
++ bra -, ra_link
++ srel -, n_sem_sync
++ sacq -, n_sem_in
++.if n_sem_out % 4 != 0
++ srel -, n_sem_out
++.else
++ nop
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++
++::mc_sync_q0
++ m_sync_q 0, v_quads8
++::mc_sync_q1
++ m_sync_q 1, v_quads8
++::mc_sync_q2
++ m_sync_q 2, v_quads8
++::mc_sync_q3
++ m_sync_q 3, v_quads8
++::mc_sync_q4
++ m_sync_q 4, v_quads8
++::mc_sync_q5
++ m_sync_q 5, v_quads8
++::mc_sync_q6
++ m_sync_q 6, v_quads8
++::mc_sync_q7
++ m_sync_q 7, v_quads8
++::mc_sync_q8
++ m_sync_q 8, v_quads8
++::mc_sync_q9
++ m_sync_q 9, v_quads8
++::mc_sync_q10
++ m_sync_q 10, v_quads8
++::mc_sync_q11
++ m_sync_q 11, v_quads8
++
++# mc_exit()
++# Chroma & Luma the same now
++
++.macro m_exit_qn
++ m_exit_drain
++ nop ; nop ; thrend
++ nop
++ nop
++# >>> thrend <<< (the two nops above follow the thrend signal)
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++ m_exit_qn
++
++
++
++# mc_interrupt_exit12()
++
++.macro m_exit_q0
++ m_exit_drain
++ sacq -, 12 # Re-acquire sem 12 so every sem is back at 0 on exit
++ nop ; nop ; thrend
++ mov interrupt, 1 # Raise the interrupt; issued after the thrend signal
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++ m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
++
++
++################################################################################
++# mc_setup
++#
++# typedef struct qpu_mc_pred_y_s_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t pic_h;
++# uint16_t pic_w;
++# uint32_t stride2;
++# uint32_t stride1;
++# uint32_t wdenom;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++# Luma setup: load constants and picture limits, compute both source bases, issue PREREAD requests per TMU
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_pmask, 0xff
++.set v_blk_height, Y_BLK_HEIGHT_8
++.else
++.set v_x_shift, 1
++.set v_pmask, 0xffff
++.set v_blk_height, Y_BLK_HEIGHT_16
++.endif
++
++
++ # Need to save these because we need to know the frame dimensions before computing texture coordinates
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++ mov ra9, unif # ref_y_base
++ mov ra1, unif # x2_y2
++
++
++# load constants
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base
++
++ mov ra_kff800100, 0xff800100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++ mov rb_y_coeffs_2, 0x050b0a00
++ mov rb_y_coeffs_3, 0x11283a40
++ mov rb_y_coeffs_5, 0x0a0b0500
++
++# Compute part of VPM to use
++
++# Read image dimensions
++ mov ra3, unif # width_height
++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2
++.if v_x_shift == 0
++ sub rb_max_x, ra3.16b, 1
++.else
++ sub r0, ra3.16b, 1
++ shl rb_max_x, r0, v_x_shift
++.endif
++ sub rb_max_y, ra3.16a, 1
++ mov r3, elem_num ; mov rb_pitch, unif # stride1
++
++# get destination pitch
++ mov r1, vdw_setup_1(0) # [rb_pitch delay]
++ or rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++ add r0, ra0.16b, r3 # Load x + elem_num
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base, ra9, r0
++
++ # r3 still contains elem_num
++ add r0, ra1.16b, r3 # Load x2 + elem_num
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++
++ # r2 still contains mask
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add rb_base2, ra11, r0
++
++# Do preloads
++ nop ; mov r0, ra0.16a # ; r0 = y
++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, rb_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++ mov ra_link, unif # Next fn
++
++# touch vertical context to keep simulator happy
++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay]
++ bra -, ra_link
++ mov ra9, 0 ; mov rb9, 0
++ mov ra10, 0 ; mov rb10, 0
++ mov ra11, 0 ; mov rb11, 0
++# >>> ra_link
++.endm
++
++::mc_setup_y_q0
++ m_setup_q0
++::mc_setup_y_qn
++ m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block-setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t h;
++# uint16_t w;
++# uint32_t mymx21;
++# uint32_t wo1;
++# uint32_t wo2;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how... (brr writes the return address to ra_link)
++.if v_bit_depth == 8
++ brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++ brr ra_link, r:per_block_setup_10
++.endif
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next # r0 = x + elem_num
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++# Entered from m_luma_setup, whose 3 branch-delay instructions set ra0, r3, r5 and r0; returns via ra_link
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++ add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = width in bytes
++ add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++ add rb_lcount, r0, (7-8)
++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val
++ add r0, r0, r1 # Combine width and height of destination area
++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++ shl ra8, r0, 3 ; mov rb5, ra_k255
++
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++
++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
++# but I can't see a way of doing that that is cheap enough to be worth it
++
++# Picked out in a slightly random order to space out uniform loads
++
++ # 1
++ mov r1, 0x01040400 # [ra8 delay]
++ ror ra2.8b, r1, ra8.8d
++ ror ra0.8b, r1, ra8.8c
++ # 2
++ ror ra2.8c, rb_y_coeffs_2, ra8.8d
++ ror ra0.8c, rb_y_coeffs_2, ra8.8c
++ # 0
++ mov r1,0x00010100 # -ve [ra8 delay]
++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset
++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++ # 7
++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address
++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++ # 3
++ ror ra2.8d, rb_y_coeffs_3, ra8.8d
++ ror ra0.8d, rb_y_coeffs_3, ra8.8c
++ # 5
++ ror ra3.8b, rb_y_coeffs_5, ra8.8d
++ ror ra1.8b, rb_y_coeffs_5, ra8.8c
++ # 6
++ mov r1,0x04040100
++ ror ra3.8c, r1, ra8.8d
++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val
++
++ bra -, ra_link
++ # 4
++ mov r1,0x3a281100
++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val
++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++# >>> branch ra_link
++
++# r5 = -8
++# r2 = fir_off_val
++# r3 = 128
++.endm
++
++:per_block_setup_8
++ m_per_block_setup 8
++
++
++
++################################################################################
++#
++# mc_filter_y_pxx
++#
++# Setup (& therefore uniform struct) shared with _bxx
++# Struct in m_luma_setup
++#
++# We can have 2 separate P reqs here as long as they mate to generate a
++# rectangular output block (i.e. h0 = h1, w0 = 8)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++# Luma P filter: shared per-block setup via m_luma_setup, then the filter loop with single (L0) weighting
++# denom shift values
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++ m_luma_setup v_bit_depth
++
++ shl r1, ra_wt_off_l0, i_wt_den_p5
++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# This loop is identical to the B loop from here --->
++:1
++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++
++ max r2, ra_y, 0 ; mov r1, 0
++ min r2, r2, rb_max_y ; mov r3, ra_k1
++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++ add t0s, ra_base, r2 ; mov rb5, rb6
++ shr r0, r4, ra_xshift ; mov rb6, rb7
++
++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
++ shr r1, r4, rb_xshift2 ; mov rb7, ra8
++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++ add t1s, rb_base2, r2 ; mov ra8, ra9
++
++# apply horizontal filter
++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++ # >>> .anyn 1b (r5 + r5)
++
++ # apply vertical filter and write to VPM
++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb8
++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++ add r1, r1, r0 ; mul24 r0, ra11, rb11
++# <--- to here
++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++
++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
++
++ brr.anyn -, r:1b
++ asr r1, r1, i_wt_den_p6
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++ m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# Bi-predicted luma block: 8-tap horizontal then 8-row vertical filter, weighted and written out via VPM + VDW
++# Setup (& therefore uniform struct) shared with _pxx
++# Struct in m_luma_setup
++#
++# L0 calc in els 0-7, L1 in 8-15
++# Only els 0-7 write data that is stored back to ram (els 8-15 may write garbage)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_bxx, v_bit_depth
++
++# denom shift values (NOTE(review): i_wt_den_p5 is not referenced in the visible body of this macro - confirm it is still needed)
++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
++
++ m_luma_setup v_bit_depth
++
++ shl r1, ra_wt_off_l0, i_wt_den_p6
++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++
++# This loop is identical to the P loop from here --->
++:1
++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++
++ max r2, ra_y, 0 ; mov r1, 0
++ min r2, r2, rb_max_y ; mov r3, ra_k1
++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++ add t0s, ra_base, r2 ; mov rb5, rb6
++ shr r0, r4, ra_xshift ; mov rb6, rb7
++
++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
++ shr r1, r4, rb_xshift2 ; mov rb7, ra8
++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++ add t1s, rb_base2, r2 ; mov ra8, ra9
++
++# apply horizontal filter (8 taps: ra0.8a-8d then ra1.8a-8d; the .ifn multiplies use the r1 sample instead of r0)
++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ brr.anyn -, r:1b
++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++ # >>> .anyn 1b (r5 + r5)
++
++ # apply vertical filter and write to VPM
++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb8
++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++ add r1, r1, r0 ; mul24 r0, ra11, rb11
++# <--- to here
++ sub r1, r1, ra4
++ sub r1, r1, r0 ; mov r2, rb_wt_off
++
++ asr r1, r1, 6
++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++ add r1, r1, r2 ; mov r0, r1 << 8
++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
++
++ brr.anyn -, r:1b
++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link (ra_height - remaining height)
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_bxx
++ m_filter_y_bxx 8
++
++################################################################################
++# Single-reference luma block with no FIR filter (weight + offset only); uniforms are read in the order of this struct:
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++.macro m_filter_y_p00, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++ mov ra0, unif ; mov r0, elem_num # y_x
++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++
++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height
++ min r0, r0, rb_max_x ; mov ra_width_height, unif
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++
++# get width,height of block (unif load above)
++# Compute vdw_setup1(dst_pitch-width)
++ shl r1, ra_width, v_x_shift
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++ add r0, r0, r1 # Combine width and height of destination area
++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link
++ add ra_dma0, r0, rb_dma0_base
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, DENOM + 8
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3 = ra_blk_height) last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_p00
++ m_filter_y_p00 8
++
++################################################################################
++
++.macro m_filter_y_b00, v_bit_depth
++# Bi-predicted luma block with no FIR filter. Luma setup does a fair bit more than we need calculating filter coeffs
++# that we will never use but it saves I-cache to use it (also simple!)
++ m_luma_setup v_bit_depth
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r2, 1
++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want
++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++
++:1
++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8min (comment used to say v8subs) - NOTE(review): presumably masks out all but the wanted bytes; confirm
++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3 = ra_blk_height) last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ brr -, r:1b
++ add rb_lcount, rb_lcount, r0
++ add ra_dma0, ra_dma0, r1
++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_b00
++ m_filter_y_b00 8
++
++################################################################################
++################################################################################
++# 10 BIT
++
++::mc_setup_c10_q0
++ m_setup_q0
++::mc_setup_c10_qn
++ m_setup_c 10
++
++::mc_filter_c10_p
++ m_filter_c_p 0, 10
++
++::mc_filter_c10_p_l1
++ m_filter_c_p 1, 10
++
++
++::mc_filter_c10_b
++ m_filter_c_b 10
++
++# Even if these fns are the same as for other bit depths we want our own copy
++# to keep the code we are using in a single lump to avoid (direct map) cache
++# thrashing
++.set v_quads10, N_QPU_16 / 4
++
++::mc_sync10_q0
++ m_sync_q 0, v_quads10
++::mc_sync10_q1
++ m_sync_q 1, v_quads10
++::mc_sync10_q2
++ m_sync_q 2, v_quads10
++::mc_sync10_q3
++ m_sync_q 3, v_quads10
++::mc_sync10_q4
++ m_sync_q 4, v_quads10
++::mc_sync10_q5
++ m_sync_q 5, v_quads10
++::mc_sync10_q6
++ m_sync_q 6, v_quads10
++::mc_sync10_q7
++ m_sync_q 7, v_quads10
++::mc_sync10_q8
++ m_sync_q 8, v_quads10
++::mc_sync10_q9
++ m_sync_q 9, v_quads10
++::mc_sync10_q10
++ m_sync_q 10, v_quads10
++::mc_sync10_q11
++ m_sync_q 11, v_quads10
++
++::mc_exit_y10_q0
++::mc_exit_c10_q0
++ m_exit_q0
++
++::mc_exit_y10_qn
++::mc_exit_c10_qn
++ m_exit_qn
++
++::mc_setup_y10_q0
++ m_setup_q0
++::mc_setup_y10_qn
++ m_setup_y 10
++
++:per_block_setup_10
++ m_per_block_setup 10
++
++::mc_filter_y10_pxx
++ m_filter_y_pxx 10
++
++::mc_filter_y10_p00
++ m_filter_y_p00 10
++
++::mc_filter_y10_bxx
++ m_filter_y_bxx 10
++
++::mc_filter_y10_b00
++ m_filter_y_b00 10
++
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_cmd.h
+@@ -0,0 +1,165 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++
++#pragma pack(push, 4)
++
++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++// If mixed then we are just confused and get a lot of warnings....
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s // Position of one source block to fetch
++{
++ int16_t y; // Top row; may lie outside the picture (the C emulation clamps and replicates edges)
++ int16_t x; // Left column in pixels (the C emulation scales it by PW to get bytes); may lie outside the picture
++ qpu_mc_src_addr_t base; // Base address of the source picture
++} qpu_mc_src_t;
++
++
++typedef struct qpu_mc_pred_c_p_s { // Single-reference chroma command (c_pxx / c_pxx_l1 links in the C emulation)
++ qpu_mc_src_t next_src; // Source for the FOLLOWING command; this command's own source came from the previous command
++ uint16_t h; // Block height in rows
++ uint16_t w; // Block width in pixels
++ uint32_t coeffs_x; // Packed horizontal filter terms; fctom() recovers the fractional move from bits 11-14
++ uint32_t coeffs_y; // Packed vertical filter terms, decoded the same way as coeffs_x
++ uint32_t wo_u; // U weight/offset word: wweight() takes the low 16 bits, woff_p() the upper bits
++ uint32_t wo_v; // V weight/offset word, same packing as wo_u
++ qpu_mc_dst_addr_t dst_addr_c; // Destination address
++ uint32_t next_fn; // Link: code address that selects how the FOLLOWING command is interpreted
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s { // NOTE(review): presumably the bi-predicted chroma command; its consumer is not visible here - confirm
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x1;
++ uint32_t coeffs_y1;
++ int16_t weight_u1;
++ int16_t weight_v1;
++ qpu_mc_src_t next_src2;
++ uint32_t coeffs_x2;
++ uint32_t coeffs_y2;
++ uint32_t wo_u2;
++ uint32_t wo_v2;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn; // Link: last 32-bit word of the command (see qpu_mc_link_set)
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s { // Chroma setup command: decoded when the link equals the queue's code_setup (always so for the first command)
++ qpu_mc_src_t next_src1; // First source for the FOLLOWING command
++ uint32_t pic_cw; // C Width (== Y width / 2)
++ uint32_t pic_ch; // C Height (== Y Height / 2)
++ uint32_t stride2; // Copied into shader_track_t.stride2 by the C emulation
++ uint32_t stride1; // Copied into shader_track_t.stride1 by the C emulation
++ qpu_mc_src_t next_src2; // Second source for the FOLLOWING command
++ uint32_t next_fn; // Link: code address that selects how the FOLLOWING command is interpreted
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s { // Overlay of every chroma command layout; the previous command's link says which member applies
++ union {
++ qpu_mc_pred_c_p_t p; // single-reference prediction
++ qpu_mc_pred_c_b_t b;
++ qpu_mc_pred_c_s_t s; // setup
++ };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s { // Luma command layout shared by the y_pxx, y_bxx and y_b00 links
++ qpu_mc_src_t next_src1; // First source for the FOLLOWING command
++ qpu_mc_src_t next_src2; // Second source for the FOLLOWING command
++ uint16_t h; // Block height in rows
++ uint16_t w; // Block width in pixels; y_pxx treats it as a first part of up to 8 plus a remainder fed from the second source
++ uint32_t mymx21; // Fractional mvs, one per byte from the LSB: mx, my (first source), mx2, my2 (second source)
++ uint32_t wo1; // Weight/offset word for the first source: weight in the low 16 bits (wweight)
++ uint32_t wo2; // Weight/offset word for the second source, same packing as wo1
++ qpu_mc_dst_addr_t dst_addr; // Destination address
++ uint32_t next_fn; // Link: code address that selects how the FOLLOWING command is interpreted
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s { // Unfiltered single-reference luma command; m_filter_y_p00 reads the fields as uniforms in this order
++ qpu_mc_src_t next_src1; // Source for the FOLLOWING command (y_x then base)
++ uint16_t h; // Block height in rows
++ uint16_t w; // Block width in pixels
++ uint32_t wo1; // Weight/offset word: weight in the low 16 bits (wweight), offset from the upper bits (woff_p)
++ qpu_mc_dst_addr_t dst_addr; // Destination address
++ uint32_t next_fn; // Link: code address that selects how the FOLLOWING command is interpreted
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s { // Luma setup command: decoded when the link equals the queue's code_setup (always so for the first command)
++ qpu_mc_src_t next_src1; // First source for the FOLLOWING command
++ qpu_mc_src_t next_src2; // Second source for the FOLLOWING command
++ uint16_t pic_h; // Picture height in rows
++ uint16_t pic_w; // Picture width in pixels (the C emulation multiplies it by PW)
++ uint32_t stride2; // Copied into shader_track_t.stride2 by the C emulation
++ uint32_t stride1; // Copied into shader_track_t.stride1 by the C emulation
++ uint32_t next_fn; // Link: code address that selects how the FOLLOWING command is interpreted
++} qpu_mc_pred_y_s_t;
++
++typedef struct qpu_mc_pred_sync_s { // Command that carries nothing but the link to whatever follows it
++ uint32_t next_fn; // Link: last (and only) 32-bit word of the command
++} qpu_mc_pred_sync_t;
++
++// Only a useful structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s { // Overlay of every luma command layout; the previous command's link says which member applies
++ union {
++ qpu_mc_pred_y_p_t p; // y_pxx, y_bxx and y_b00
++ qpu_mc_pred_y_p00_t p00; // y_p00
++ qpu_mc_pred_y_s_t s; // setup
++ };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u { // Any command in a queue; commands are packed back to back with each one's link as its last word
++ qpu_mc_pred_y_t y; // luma layouts
++ qpu_mc_pred_c_t c; // chroma layouts
++ qpu_mc_pred_sync_t sync; // link-only command
++} qpu_mc_pred_cmd_t;
++
++static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
++{
++    // The link (next_fn) is the last 32-bit word of the previous cmd, i.e. the word immediately before cmd
++    ((uint32_t *)cmd)[-1] = fn;
++}
++
++#define QPU_MC_PRED_N_Y8 12
++#define QPU_MC_PRED_N_C8 12
++
++#define QPU_MC_PRED_N_Y10 12
++#define QPU_MC_PRED_N_C10 12
++
++#define QPU_MC_DENOM 7
++
++#pragma pack(pop)
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s // Per-queue state kept by the C emulation of the QPU shaders
++{
++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; // Command to resume from; NULL means start at the queue's qpu_mc_base
++ const struct qpu_mc_src_s *last_l0; // First source descriptor left by the previous command, used for the current fetch
++ const struct qpu_mc_src_s *last_l1; // Second source descriptor left by the previous command
++ uint32_t width; // pic_width * PW
++ uint32_t height; // Picture height in rows, from the setup command
++ uint32_t stride2; // From the setup command
++ uint32_t stride1; // From the setup command; also passed as the destination stride of the dsp calls
++} shader_track_t;
++
++static int wtoidx(const unsigned int w) // Block width -> first index of the hevcdsp function tables
++{
++    // Widths that are not listed map to 0. There is no bounds check: callers must pass w <= 64.
++    static const uint8_t idx_of_width[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; return idx_of_width[w];
++}
++
++static int fctom(uint32_t x)
++{
++    // Returns the fractional move encoded by the packed filter coefficients x.
++    // As it happens we can take the 2nd filter term & divide it by 8
++    // (dropping fractions) to get the fractional move
++    const int rv = 8 - (int)((x >> 11) & 0xf);  // bits 11-14 of x == (byte 1 of x) / 8, masked to 4 bits
++    av_assert2(rv >= 0 && rv <= 7);
++    return rv;
++}
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++{
++    return (int32_t)((uint32_t)x << shl) >> shr;  // extract a signed field: shift left as unsigned (no signed-overflow UB), then shift right as signed
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x) // Offset part of a weight/offset word for the *_uni_w dsp calls
++{
++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); // signed right shift by 17, plus one more per bit of depth above 8
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x) // Offset part of a weight/offset word for the *_bi_w dsp calls
++{
++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); // subtract 1 from the upper half, then signed right shift by 16 + (bit_depth - 8)
++}
++
++static inline int wweight(int32_t x) // Weight part of a weight/offset word
++{
++ return ext(x, 16, 16); // low 16 bits of x, sign-extended
++}
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,49 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,502 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    // For each of h rows: fill w bytes at dst with copies of the single pixel found at src
++    for (unsigned int rows = h; rows != 0; --rows, dst += stride, src += stride) {
++        const pixel fill = *(const pixel *)src;
++        pixel * out = (pixel *)dst;
++        for (unsigned int done = 0; done < w; done += PW)
++            *out++ = fill;
++    }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    // Copy the one w-byte row at src into h successive rows starting at dst
++    for (unsigned int rows = h; rows != 0; --rows, dst += stride)
++        memcpy(dst, src, w);
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++    uint8_t * dst, const unsigned int dst_stride,
++    const qpu_mc_src_t *src,
++    unsigned int _w, unsigned int _h)
++{   // Fetch a _w x _h luma patch at src into dst, replicating picture edges where the patch lies outside
++    int x = src->x * PW;             // left edge in bytes
++    int y = src->y;
++    int w = _w * PW;                 // width in bytes
++    int h = _h;
++    int dl = 0, dr = 0;              // bytes to replicate left / right of the fetched area
++    int dt = 0, db = 0;              // rows to replicate above / below the fetched area
++    const int width = st->width;     // signed copies (as in get_patch_c) so the clamps compare int with int
++    const int height = st->height;
++
++    if (x < 0) {
++        if (-x >= w)
++            x = PW - w;
++        dl = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > width) {
++        if (x >= width)
++            x = width - PW;
++        dr = (x + w) - width;
++        w = width - x;
++    }
++
++    // Y: the same clamping with a unit of one row
++    if (y < 0) {
++        if (-y >= h)
++            y = 1 - h;
++        dt = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > height) {
++        if (y >= height)
++            y = height - 1;
++        db = (y + h) - height;
++        h = height - y;
++    }
++
++    dst += dl + dt * dst_stride;
++    FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup: left/right first so that the rows copied up/down are already full width
++    if (dl != 0)
++        FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++    if (dr != 0)
++        FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++    w += dl + dr;
++    dst -= dl;
++
++    if (dt != 0)
++        FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++    if (db != 0)
++        FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++    uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++    const qpu_mc_src_t *src,
++    unsigned int _w, unsigned int _h)
++{   // Fetch a _w x _h chroma patch at src into separate U and V planes, replicating picture edges
++    const int pic_w = st->width;
++    const int pic_h = st->height;
++    int x = src->x * PW;
++    int y = src->y;
++    int w = _w * PW;
++    int h = _h;
++    int pad_l = 0;
++    int pad_r = 0;
++    int pad_t = 0;
++    int pad_b = 0;
++
++    // Horizontal: trim the request to the picture and remember how much must be replicated
++    if (x < 0) {
++        if (w <= -x)
++            x = PW - w;
++        pad_l = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > pic_w) {
++        if (x >= pic_w)
++            x = pic_w - PW;
++        pad_r = (x + w) - pic_w;
++        w = pic_w - x;
++    }
++
++    // Vertical: the same again with a unit of one row
++    if (y < 0) {
++        if (h <= -y)
++            y = 1 - h;
++        pad_t = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > pic_h) {
++        if (y >= pic_h)
++            y = pic_h - 1;
++        pad_b = (y + h) - pic_h;
++        h = pic_h - y;
++    }
++
++    // Step past the padding and fetch the in-picture part of both planes
++    dst_u += pad_l + pad_t * dst_stride;
++    dst_v += pad_l + pad_t * dst_stride;
++    FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup
++    if (pad_l != 0) {
++        FUNC(dup_lr)(dst_u - pad_l, dst_u, pad_l, h, dst_stride);
++        FUNC(dup_lr)(dst_v - pad_l, dst_v, pad_l, h, dst_stride);
++    }
++    if (pad_r != 0) {
++        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, pad_r, h, dst_stride);
++        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, pad_r, h, dst_stride);
++    }
++    // Rows now span the full padded width, starting pad_l bytes earlier
++    w += pad_l + pad_r;
++    dst_u -= pad_l;
++    dst_v -= pad_l;
++
++    // Copy the first / last fetched row into the rows above / below
++    if (pad_t != 0) {
++        FUNC(dup_tb)(dst_u - pad_t * dst_stride, dst_u, w, pad_t, dst_stride);
++        FUNC(dup_tb)(dst_v - pad_t * dst_stride, dst_v, w, pad_t, dst_stride);
++    }
++    if (pad_b != 0) {
++        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, pad_b, dst_stride);
++        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, pad_b, dst_stride);
++    }
++}
++
++// Debug dump of a w x h area at (x, y) as hex; x, y, w, h in pixels (x and w are doubled for interleaved chroma when is_c is set)
++// stride1 is the stripe width in bytes (used as a power-of-two mask); one stripe spans stride1 * stride2 bytes; stride2 == 0 => no striping
++void FUNC(rpi_sand_dump)(const char * const name,
++    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++    const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++    if (is_c) {
++        x *= 2;
++        w *= 2;
++    }
++    for (int i = y; i != y + h; ++i) {
++        for (int j = x; j != x + w; ++j) {
++            const char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++            if (j < 0 || i < 0) {
++#if PW == 1
++                printf("..%c", sep);
++#else
++                printf("...%c", sep);
++#endif
++            } else {
++                const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;  // only formed for in-range coordinates
++#if PW == 1
++                printf("%02x%c", *(const pixel*)p, sep);
++#else
++                printf("%03x%c", *(const pixel*)p, sep);
++#endif
++            }
++        }
++        printf("\n");
++    }
++}
++
++
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++ const HEVCRpiInterPredEnv *const ipe_y,
++ const HEVCRpiInterPredEnv *const ipe_c)
++{
++ for (int c_idx = 0; c_idx < 2; ++c_idx)
++ {
++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
++ unsigned int exit_n = 0;
++
++ if (ipe == NULL || !ipe->used) {
++ continue;
++ }
++
++ do {
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ const HEVCRpiInterPredQ * const q = ipe->q + i;
++ shader_track_t * const st = tracka + i;
++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
++
++ for (;;) {
++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
++
++ if (link == q->code_setup) {
++ if (c_idx == 0) {
++ // Luma
++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++ st->height = c->pic_h;
++ st->width = c->pic_w * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else {
++ // Chroma
++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++ st->height = c->pic_ch;
++ st->width = c->pic_cw * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ }
++ else if (link == s->qpu.y_pxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++ const int w1 = FFMIN(c->w, 8);
++ const int w2 = c->w - w1;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ if (w2 > 0) {
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++ }
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++ if (w2 > 0) {
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++ }
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_bxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_p00) {
++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++ st->last_l0 = &c->next_src1;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_b00) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ av_assert0(c->w <= 16 && c->h <= 64);
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++ patch_y3, patch_y1, PATCH_STRIDE,
++ c->h, 0, 0, c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), 0, 0, c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx_l1) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l1 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_bxx) {
++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++ const int mx1 = fctom(c->coeffs_x1);
++ const int my1 = fctom(c->coeffs_y1);
++ const int mx2 = fctom(c->coeffs_x2);
++ const int my2 = fctom(c->coeffs_y2);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72];
++ uint8_t patch_v1[PATCH_STRIDE * 72];
++ uint8_t patch_u2[PATCH_STRIDE * 72];
++ uint8_t patch_v2[PATCH_STRIDE * 72];
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == q->code_sync) {
++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++ break;
++ }
++ else if (link == q->code_exit) {
++ // We expect exit to occur without other sync
++ av_assert0(i == exit_n);
++ ++exit_n;
++ break;
++ }
++ else {
++ av_assert0(0);
++ }
++ }
++
++ st->qpu_mc_curr = cmd;
++ }
++ } while (exit_n == 0);
++ }
++}
++
++#undef FUNC
++#undef pixel
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,444 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
++.set USE_STACK, 0
++
++# Lines that fail to assemble start with #:
++# The script insert_magic_opcodes.sh inserts the machine code directly for these.
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
++
++
++hevc_trans_16x16:
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32 (kept for hevc_trans_32x32, which reads it back into r1)
++ mov r15,r4 # num32 (kept for hevc_trans_32x32, which reads it back into r2)
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which matrix we are working on.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ sub sp,sp,64+16*16*2 # Reserve the unpack buffers on the stack; sp is moved so an interrupt using the stack cannot overwrite them
++
++ add r11,sp,64 # Space for 32 bytes before, and rounding
++ lsr r11,5
++ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
++
++ lsr r10, r2, 16 # Number of compressed blocks stored in top short
++ extu r2,16 # Low short: number of 16x16 transforms to do
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++ # r0 VRF location of current block
++ # r1 address of current block
++ # r2 number of 16*16 transforms to do
++ # r3 Stride of coefficients (==32)
++ # r4 TRANS_RND1 (64)
++ # r5 TRANS_RND2
++ # r6 temporary used inside col_trans16
++ # r7 16*16*2 total bytes in block
++ # r8 64*16 VRF switch locations
++ # r9 temporary in unpack16x16 holding the unpacked value
++ # r10 number of 16x16 transforms using compression
++ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
++ # r12 temporary counter in unpack16x16
++ # r13 unused
++ # r14 Save information for 32 bit transform (coeffs location)
++ # r15 Save information for 32 bit transform (number of transforms)
++ cmp r2,0
++ beq done16x16s
++block_loop:
++ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
++ cmp r10,0
++ mov r6, r1
++ beq not_compressed
++ sub r10, 1
++ bl unpack16x16
++not_compressed:
++ #mov r6,r1 # DEBUG without compress
++ vldh HX(0++,0)+r0,(r6 += r3) REP 16
++ #eor r0,r8
++ #add r1,r7
++ # Prefetch the next block
++ #bl unpack16x16
++ #vldh HX(0++,0)+r0,(r6 += r3) REP 16
++ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG
++ #eor r0,r8
++ #sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on second-pass rounding (r5 = TRANS_RND2); the shift below is TRANS_ASL2 = 16 - TRANS_SHIFT
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++done16x16s:
++
++ add sp,sp,64+16*16*2 # Release the stack space reserved for the unpack buffers above
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc # Not reached: the unconditional branch above exits via hevc_trans_32x32, which performs this pop
++# Unpacks one compressed block into the buffer at r11 and returns r6 = r11, the address to load the data from.
++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
++unpack16x16:
++# Clear out destination (16 rows of the unpacked buffer at r11, stride r3)
++ vmov HX(0,0)+r0,0
++ mov r6, r11
++ vsth HX(0,0)+r0,(r6 += r3) REP 16
++ mov r5, r1 # Moving pointer to input coefficients
++unpack_outer_loop:
++ # Loop until we find the end
++ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous?
++ sub r6,r11,32 # Packed data buffer sits 32 bytes below the unpacked buffer
++ #add r6,pc,packed_data-$ # Packed data
++ vsth HX(0,0)+r0,(r6) # Store into packed data
++ mov r12,0
++unpack_loop:
++ ld r4,(r6) # Next packed 32-bit entry
++ add r6,r6,4
++ lsr r9,r4,16 # r9 is destination value
++ cmp r4,0 # {value,index}; an all-zero entry ends the packed list
++ extu r4,8 # Keep the low 8 bits as the index; NOTE(review): beq below relies on extu leaving the cmp flags intact
++ beq done_unpack
++ sth r9,(r11, r4)
++ addcmpblt r12,1,8,unpack_loop # 8 entries per 32-byte fetch
++# # Read next 16
++ add r5,32
++ b unpack_outer_loop
++done_unpack:
++# # Set new load location
++ mov r6, r11
++ #add r6,pc,unpacked_data-$
++# # Restore constants (r4 and r5 were used as temporaries above)
++ mov r4,64
++ mov r5,TRANS_RND2
++# pop r6-r15, pc
++ b lr
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0) is the 16x16 matrix of transform coefficients (r6 is clobbered: it holds the loop end value)
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop (r6 is clobbered)
++col_trans_odd_16_loop:
++ # First compute partial products for a single column, using the odd matrix held at HX(32,32)
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# r1/r10 input pointer
++# r0,r4,r5,r6,r8 are clobbered (r8 is the loop counter here; the caller reloads it from r9)
++# r9 output storage
++# NOTE(review): the prebuilt rpi_hevc_transform*.h blobs presumably need regenerating after any change here
++# Store packed coefficients at r9-32
++# Store unpacked at r9+32*32 shorts (because transform works on even/odd rows on input, but writes all rows)
++unpack32x32:
++# Clear out destination
++ vmov HX(0,0),0
++ add r0, r9, 32*32*2 # Unpacked buffer
++ mov r4, 32
++ vsth HX(0,0),(r0 += r4) REP 64
++unpack_outer_loop32:
++ # Loop until we find the end
++ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous?
++ sub r6,r9,32
++ #add r6,pc,packed_data-$ # Packed data
++ vsth HX(0,0),(r6) # Store into packed data
++ mov r8,0
++unpack_loop32:
++ ld r4,(r6) # Next packed 32-bit entry
++ add r6,r6,4
++ lsr r5,r4,16 # r5 is destination value
++ cmp r4,0 # {value,index}; an all-zero entry ends the packed list
++ extu r4,10 # Keep the low 10 bits as the index
++ beq done_unpack32 # Leave through this routine's own exit, not the 16x16 epilogue
++ sth r5,(r0, r4)
++ addcmpblt r8,1,8,unpack_loop32 # 8 entries per 32-byte fetch
++# # Read next 16
++ add r1,32
++ b unpack_outer_loop32
++done_unpack32:
++ b lr
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) - reached by a branch from hevc_trans_16x16 with coeffs in r14 and num in r15
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done in low 16, number of packed in high 16
++#
++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
++hevc_trans_32x32:
++ mov r1,r14 # coeffs (saved from r3 by hevc_trans_16x16)
++ mov r2,r15 # num (saved from r4 by hevc_trans_16x16)
++ lsr r15,r15,16 # Number that are packed
++ extu r2,16 # Total number
++
++ # Fetch odd transform matrix (disabled: hevc_trans_16x16 has already loaded both matrices)
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++ #add r0, 16*16*2
++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++ mov r7, 16*16*2 # Total block size
++
++.if USE_STACK
++ # Stack base allocation
++ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
++ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
++ add r8,sp,63
++ lsr r8,5
++ lsl r8,5
++.else
++#:version r8
++ .half 0x00e8 #AUTOINSERTED
++ btst r8,16 # NOTE(review): bit 16 of the version word presumably identifies which VPU is running - confirm
++#:add r8,pc,intermediate_results-$
++ .half 0xbfe8
++ .half intermediate_results-($-2)
++ beq on_vpu1
++ add r8,r8,32*32*2*2+16*2 # Move to secondary storage (skips intermediate + unpacked buffers and the second packed buffer)
++on_vpu1:
++.endif
++ mov r9,r8 # Backup of the temporary storage
++ mov r10,r1 # Backup of the coefficient buffer
++
++ cmp r2,0
++ beq done32x32s
++block_loop32:
++
++ # Transform the first 16 columns
++ mov r1,r10 # Input Coefficient buffer
++ mov r8,r9 # Output temporary storage
++ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
++ cmp r2,r15
++ bgt not_compressed_32
++ bl unpack32x32
++ add r1,r9,32*32*2 # Uncompressed into temporary storage
++ mov r8,r9 # Transform into here
++not_compressed_32:
++ # COLUMN TRANSFORM
++ mov r4, 64 # Constant used for rounding first pass
++ mov r5, 9 # left shift used for rounding first pass
++
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ # ROW TRANSFORM
++ mov r4, TRANS_RND2 # Constant used for rounding second pass
++ mov r5, TRANS_ASL2 # left shift used for rounding second pass
++
++ mov r1,r9 # Input temporary storage
++ mov r8,r10 # Output Coefficient buffer
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ add r10, 32*32*2 # move onto next block of coefficients
++ addcmpbgt r2,-1,0,block_loop32
++done32x32s:
++
++.if USE_STACK
++ add sp,sp,32*32*4+64# Restore stack
++.endif
++
++ pop r6-r15, pc # Matches the push at the top of hevc_trans_16x16
++
++trans32:
++ push lr # Saved because this routine itself calls col_trans_16 / col_trans_odd_16
++ # We can no longer afford the VRF space to do prefetching when doing 32x32
++ # Fetch the even rows
++ vldh HX(0++,0),(r1 += r3) REP 16
++ # Fetch the odd rows
++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++ # Transform the even rows using even matrix
++ mov r0, 0 # Even rows
++ bl col_trans_16
++
++ # Now transform the odd rows using odd matrix
++ mov r0, 64*16 # Odd rows
++ bl col_trans_odd_16
++
++ # Now apply butterfly to compute the first 16 results
++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding (r4, set by caller),
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift left by r5 (9 or TRANS_ASL2, set by caller)
++ # 16bit results now in HX(48,32)
++ mov r0,r8 # Output pointer supplied by caller
++ mov r6,32*2 # Output row stride in bytes (32 shorts)
++ vsth VX(48,32++),(r0+=r6) REP 16
++
++ # Now apply butterfly to compute the second 16 results (in reverse order)
++ vsub HY(63,0),HY(0 ,0),HY(16,0)
++ vsub HY(62,0),HY(1 ,0),HY(17,0)
++ vsub HY(61,0),HY(2 ,0),HY(18,0)
++ vsub HY(60,0),HY(3 ,0),HY(19,0)
++ vsub HY(59,0),HY(4 ,0),HY(20,0)
++ vsub HY(58,0),HY(5 ,0),HY(21,0)
++ vsub HY(57,0),HY(6 ,0),HY(22,0)
++ vsub HY(56,0),HY(7 ,0),HY(23,0)
++ vsub HY(55,0),HY(8 ,0),HY(24,0)
++ vsub HY(54,0),HY(9 ,0),HY(25,0)
++ vsub HY(53,0),HY(10,0),HY(26,0)
++ vsub HY(52,0),HY(11,0),HY(27,0)
++ vsub HY(51,0),HY(12,0),HY(28,0)
++ vsub HY(50,0),HY(13,0),HY(29,0)
++ vsub HY(49,0),HY(14,0),HY(30,0)
++ vsub HY(48,0),HY(15,0),HY(31,0)
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding (r4, set by caller),
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift left by r5 (9 or TRANS_ASL2, set by caller)
++ add r0,r8,32 # Second half of each output row (16 shorts in)
++ vsth VX(48,32++),(r0+=r6) REP 16
++ pop pc
++
++.if USE_STACK == 0
++ .balign 32
++
++# .space directives generate 0's in the bin so avoid unnecessary padding by
++# just setting to appropriate value
++.equ intermediate_results, $+16*2
++
++# Layout goes:
++#
++#packed_buffer:
++# .space 16*2
++#intermediate_results:
++# .space 32*32*2
++#unpacked_buffer:
++# .space 32*32*2
++#
++#packed_buffer2:
++# .space 16*2
++#intermediate_results2:
++# .space 32*32*2
++#unpacked_buffer2:
++# .space 32*32*2
++.endif
++
++
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform10.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform10 [] = {
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030
++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090
++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8
++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform8.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform8 [] = {
++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030
++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090
++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8
++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
++};
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.c
+@@ -0,0 +1,6134 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Wassim Hamidouche
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++#include "libavutil/display.h"
++#include "libavutil/internal.h"
++#include "libavutil/mastering_display_metadata.h"
++#include "libavutil/md5.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/stereo3d.h"
++
++#include "decode.h"
++#include "bswapdsp.h"
++#include "bytestream.h"
++#include "golomb.h"
++#include "hevc.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_parse.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++#include "profiles.h"
++#include "hwconfig.h"
++
++#include "rpi_zc_frames.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "pthread.h"
++#include <stdatomic.h>
++
++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
++
++// Pack two values into one word: hi shifted into bits 16 and up, lo masked to
++// its low 16 bits
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++// Fallback used only when libavutil does not already provide av_mod_uintp2.
++// Returns the low p bits of a (a modulo 2^p).
++// The mask is built from an unsigned 1 so that p == 31 does not shift a bit
++// into the sign position of a signed int (undefined behaviour in C).
++// p must be smaller than the width of unsigned.
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{
++    return a & ((1U << p) - 1);
++}
++# define av_mod_uintp2 av_mod_uintp2_c
++#endif
++
++// Sparse lookup: indices 2,4,6,8,12,16,24,32,48,64 map to 0..9; every other
++// entry is implicitly zero. (Index is presumably a block width in pels - confirm)
++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++// Forward declaration (static, so the definition lives elsewhere in this file)
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
++
++// Dummy X/Y coordinates (both -32)
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
++
++// UV & Y both have min 4x4 pred (no 2x2 chroma)
++// Allow for even spread +1 for setup, +1 for rounding
++// As we have load sharing this can (in theory) be exceeded so we have to
++// check after each CTU, but it is a good base size
++
++// Worst case (all 4x4) commands per CTU
++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
++#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
++
++// HEVC_RPI_MAX_WIDTH expressed in units of 64, rounded up
++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
++
++// Number of QPU groups, and CTUs per group (rounded up) across one line
++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
++
++// Extra per-queue headroom: half of a worst-case CTU
++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
++
++// Total cmds to allocate - allow for slack & setup
++// = worst case for a full line of CTUs + (1 setup + slack) for each of QPU_N_MAX queues
++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
++
++// Sync command counts, scaled by QPU_N_MAX
++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
++
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++// Pack four coefficients, each masked to 8 bits, into one 32-bit word with c0
++// in the least significant byte and c3 in the most significant
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++// Eight packed 4-tap coefficient sets (magnitudes only; see ENCODE_COEFFS for
++// the byte layout). Every row sums to 64 once the signs above are applied.
++// Presumably indexed by fractional sample position 0..7 - confirm
++static const uint32_t rpi_filter_coefs[8] = {
++ ENCODE_COEFFS( 0, 64, 0, 0),
++ ENCODE_COEFFS( 2, 58, 10, 2),
++ ENCODE_COEFFS( 4, 54, 16, 2),
++ ENCODE_COEFFS( 6, 46, 28, 4),
++ ENCODE_COEFFS( 4, 36, 36, 4),
++ ENCODE_COEFFS( 4, 28, 46, 6),
++ ENCODE_COEFFS( 2, 16, 54, 4),
++ ENCODE_COEFFS( 2, 10, 58, 2)
++};
++
++// Function arrays by QPU
++// Each table has 12 entries, one per QPU index. In the setup and exit tables
++// entry 0 uses the _q0 variant and all other entries share the _qn variant;
++// the sync tables have a distinct entry for every QPU.
++// Tables with "10" in the name reference the *10* code variants.
++
++// Chroma setup
++static const int * const inter_pred_setup_c_qpu[12] = {
++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
++};
++
++static const int * const inter_pred_setup_c10_qpu[12] = {
++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
++};
++
++// Luma setup
++static const int * const inter_pred_setup_y_qpu[12] = {
++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
++};
++
++static const int * const inter_pred_setup_y10_qpu[12] = {
++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
++};
++
++// Sync - one distinct entry per QPU
++static const int * const inter_pred_sync_qpu[12] = {
++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
++};
++
++static const int * const inter_pred_sync10_qpu[12] = {
++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
++};
++
++// Chroma exit
++static const int * const inter_pred_exit_c_qpu[12] = {
++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
++};
++
++static const int * const inter_pred_exit_c10_qpu[12] = {
++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
++};
++
++// Luma exit
++static const int * const inter_pred_exit_y_qpu[12] = {
++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
++};
++
++static const int * const inter_pred_exit_y10_qpu[12] = {
++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
++};
++
++// Per-channel inter-prediction setup description
++typedef struct ipe_chan_info_s
++{
++ const uint8_t bit_depth; // Bit depth this entry is for
++ const uint8_t n; // Number of queues; the fn tables below are indexed 0..n-1
++ const int * const * setup_fns; // Per-queue setup code
++ const int * const * sync_fns; // Per-queue sync code
++ const int * const * exit_fns; // Per-queue exit code
++} ipe_chan_info_t;
++
++// Luma + chroma channel descriptions for one bit depth
++typedef struct ipe_init_info_s
++{
++ ipe_chan_info_t luma;
++ ipe_chan_info_t chroma;
++} ipe_init_info_t;
++
++// Fill a square of (1 << ln) x (1 << ln) bytes with the byte value a.
++// b      top-left byte of the square (must be suitably aligned for the
++//        2/4/8-byte stores used)
++// stride distance in bytes between successive rows
++// ln     log2 of the side; 1..4 fill a square, anything else writes one byte
++// a      fill value (expected to fit in 8 bits)
++static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
++{
++    unsigned int row;
++
++    if (ln == 1) {
++        // 2 rows of 2 bytes
++        const uint16_t pair = (uint16_t)(a | (a << 8));
++        for (row = 0; row != 2; ++row, b += stride)
++            *(uint16_t *)b = pair;
++    }
++    else if (ln == 2) {
++        // 4 rows of 4 bytes
++        a |= a << 8;
++        a |= a << 16;
++        for (row = 0; row != 4; ++row, b += stride)
++            *(uint32_t *)b = a;
++    }
++    else if (ln == 3 || ln == 4) {
++        // 8 rows of 8 bytes or 16 rows of 16 bytes
++        const unsigned int rows = 1U << ln;
++        uint64_t octet;
++
++        a |= a << 8;
++        a |= a << 16;
++        octet = ((uint64_t)a << 32) | a;
++
++        for (row = 0; row != rows; ++row, b += stride) {
++            *(uint64_t *)b = octet;
++            if (ln == 4)
++                *(uint64_t *)(b + 8) = octet;
++        }
++    }
++    else {
++        // normally 0 - a single byte
++        *b = a;
++    }
++}
++
++// Write (1 << ln) copies of the byte a to both b_u and b_l.
++// Callers are expected to pass ln = (log2_cb_size - 3), i.e. -1..3; ln == 4
++// is handled although not required. Any ln outside 1..4 (so 0 and -1)
++// writes a single byte to each destination.
++// Destinations must be suitably aligned for the 2/4-byte stores used.
++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
++{
++    if (ln == 1) {
++        const uint16_t pair = (uint16_t)(a | (a << 8));
++        *(uint16_t *)b_u = pair;
++        *(uint16_t *)b_l = pair;
++    }
++    else if (ln >= 2 && ln <= 4) {
++        // 4, 8 or 16 bytes written as 1, 2 or 4 32-bit words
++        const unsigned int words = 1U << (ln - 2);
++        unsigned int w;
++
++        a |= a << 8;
++        a |= a << 16;
++
++        for (w = 0; w != words; ++w)
++            *(uint32_t *)(b_u + w * 4) = a;
++        for (w = 0; w != words; ++w)
++            *(uint32_t *)(b_l + w * 4) = a;
++    }
++    else {
++        // 0 or -1
++        *b_u = a;
++        *b_l = a;
++    }
++}
++
++// Zero (1 << ln) bytes at b for ln = 1..3; any other ln clears one byte.
++// b must be suitably aligned for the 2/4-byte stores used.
++static void zap_cabac_stash(uint8_t * b, const int ln)
++{
++    if (ln == 3) {
++        uint32_t * const w = (uint32_t *)b;
++        w[0] = 0;
++        w[1] = 0;
++    }
++    else if (ln == 2) {
++        *(uint32_t *)b = 0;
++    }
++    else if (ln == 1) {
++        *(uint16_t *)b = 0;
++    }
++    else {
++        // 0
++        *b = 0;
++    }
++}
++
++
++
++// Set a small square block of bits in a bitmap
++// Bits must be aligned on their size boundary (which will be true of all split CBs)
++// f      bitmap base for the first row
++// x      bit position within the row
++// stride bytes between successive rows
++// ln     log2 of the square side: 1..3 set a 2x2 / 4x4 / 8x8 square,
++//        anything else sets a single bit
++static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
++{
++    const unsigned int bit0 = (x & 7);
++    uint8_t * const row0 = f + (x >> 3);
++    unsigned int r;
++
++    av_assert2(ln <= 3);
++    av_assert2((x & ((1 << ln) - 1)) == 0);
++
++    if (ln == 3) {
++        // 0xff * 8 - alignment means each row is exactly one whole byte
++        for (r = 0; r != 8; ++r)
++            row0[r * stride] = 0xff;
++    }
++    else if (ln == 1 || ln == 2) {
++        // 3 * 2 or 0xf * 4
++        const unsigned int side = 1U << ln;
++        const unsigned int mask = ((1U << side) - 1) << bit0;
++
++        for (r = 0; r != side; ++r)
++            row0[r * stride] |= mask;
++    }
++    else {
++        // 1
++        row0[0] |= 1 << bit0;
++    }
++}
++
++// Inter-pred channel setup, one slot per bit depth: slot 0 is 8 bits, slot 2 is 10 bits.
++// Only 8 and 10 bit are populated; slot 1 (9 bit) is explicitly zero and the
++// remaining slots are implicitly zero-initialised (so their n is 0).
++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
++ { // 8
++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++ },
++ { // 9
++ .luma = {0},
++ .chroma = {0}
++ },
++ { // 10
++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++ }
++
++};
++
++// Configure an inter-pred environment from a channel description.
++// The command buffer (ipe->gptr) is split into ici->n equal slices, one per
++// queue, and each queue gets its setup/sync/exit code addresses.
++// ici->n must be non-zero (it is used as a divisor).
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
++{
++    const unsigned int n_q = ici->n;
++    // Bytes per queue, rounded down to a whole word
++    const unsigned int bytes_per_q = (ipe->gptr.numbytes / n_q) & ~3;
++    unsigned int idx = 0;
++
++    ipe->n = n_q;
++    ipe->max_fill = bytes_per_q - ipe->min_gap;
++
++    while (idx < n_q) {
++        HEVCRpiInterPredQ * const pq = &ipe->q[idx];
++        qpu_mc_pred_cmd_t * const slice_base =
++            (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + idx * bytes_per_q);
++
++        // Queue starts empty: current == base
++        pq->qpu_mc_base = slice_base;
++        pq->qpu_mc_curr = slice_base;
++
++        pq->code_setup = qpu_fn(ici->setup_fns[idx]);
++        pq->code_sync = qpu_fn(ici->sync_fns[idx]);
++        pq->code_exit = qpu_fn(ici->exit_fns[idx]);
++
++        ++idx;
++    }
++}
++
++// Select the QPU function set in s->qpu for the given bit depth.
++// Aborts (av_assert0) if bit_depth is outside 8..16.
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++ av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
++// Unsigned Trivial MOD
++// Cheap x % n that is only correct for x < 2 * n: subtracts n at most once
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++    if (x < n)
++        return x;
++    return x - n;
++}
++
++// returns pq->job_n++
++// (post-increment that wraps back to 0 on reaching RPI_MAX_JOBS)
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++    const unsigned int cur = pq->job_n;
++    const unsigned int nxt = cur + 1;
++
++    pq->job_n = (nxt >= RPI_MAX_JOBS) ? nxt - RPI_MAX_JOBS : nxt;
++    return cur;
++}
++
++// Initialise one pass queue: record its owning context, worker function,
++// output semaphore and pass number, clear its state flags and create its
++// input semaphore with a count of 0.
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++    // Identity / wiring
++    pq->context = s;
++    pq->worker = worker;
++    pq->psem_out = psem_out;
++    pq->pass_n = n;
++
++    // Run state
++    pq->job_n = 0;
++    pq->started = 0;
++    pq->terminate = 0;
++
++    // Process-private semaphore, nothing queued yet
++    sem_init(&pq->sem_in, 0, 0);
++}
++
++// Destroy the input semaphore created by pass_queue_init
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++ sem_destroy(&pq->sem_in);
++}
++
++// sem_wait that retries when interrupted by a signal.
++// Any failure other than EINTR aborts via av_assert0.
++static inline void rpi_sem_wait(sem_t * const sem)
++{
++    for (;;) {
++        if (sem_wait(sem) == 0)
++            return;
++        av_assert0(errno == EINTR);
++    }
++}
++
++// Signal the queue's input semaphore once (pass_worker waits on it)
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++ sem_post(&pq->sem_in);
++}
++
++// Run every pass on jb, in pass order, on the calling thread.
++// Uses the same worker functions as the pass threads.
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    HEVCRpiPassQueue * const pqs = s->passq;
++    unsigned int pass_no;
++
++    for (pass_no = 0; pass_no < RPI_PASSES; pass_no++)
++        (pqs + pass_no)->worker(s, jb);
++}
++
++
++// Debug helper, compiled out: prints the offload in/out indices and the
++// current value of sem_out for a job control block
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++ int x;
++ sem_getvalue((sem_t *)&jbc->sem_out, &x);
++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++// Obtain a job for the calling local context lc.
++// Under jbg->lock, tries in order:
++//  1. the single job cached locally in jbc->jb1
++//  2. the head of the global free chain jbg->free1
++// If neither yields a job, lc is linked onto both the global wait queue and
++// this jbc's local wait queue; once the lock is dropped the caller blocks on
++// lc->jw_sem and then picks the job up from lc->jw_job.
++// Global wait queue order: an lc with last_progress_good set is placed
++// directly after the last "good" waiter (jbg->wait_good), ahead of the
++// others; any other lc is appended at the tail.
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJob * jb;
++ HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++ pthread_mutex_lock(&jbg->lock);
++ // Check local 1st
++ if ((jb = jbc->jb1) != NULL)
++ {
++ // Only 1 - very easy :-)
++ jbc->jb1 = NULL;
++ }
++ else
++ {
++ // Now look for global free chain
++ if ((jb = jbg->free1) != NULL)
++ {
++ // Found one - unlink it
++ jbg->free1 = jb->next;
++ jb->next = NULL;
++ }
++ else
++ {
++ // Out of places to look - wait for one to become free - add to Qs
++
++ // Global
++ // If "good" lc then add after the last "good" el in the chain
++ // otherwise add to the tail
++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++ {
++ // Add to end as we had to wait last time or wait Q empty
++ if ((lc->jw_prev = jbg->wait_tail) == NULL)
++ jbg->wait_head = lc;
++ else
++ lc->jw_prev->jw_next = lc;
++ lc->jw_next = NULL;
++ jbg->wait_tail = lc;
++ }
++ else
++ {
++ // This is a "good" lc that we need to poke into the middle
++ // of the Q
++ // We know that the Q isn't empty and there is at least one
++ // !last_progress_good el in it from the previous test
++
++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++ if (p == NULL)
++ {
++ // No current good els - add to head
++ lc->jw_next = jbg->wait_head;
++ jbg->wait_head = lc;
++ }
++ else
++ {
++ lc->jw_next = p->jw_next;
++ p->jw_next = lc;
++ }
++
++ // lc->jw_next cannot be NULL here: the tail is not "good" so
++ // there is always an element after the insertion point
++ lc->jw_next->jw_prev = lc;
++ lc->jw_prev = p;
++ }
++
++ // If "good" then we are now the last good waiting el
++ if (lc->last_progress_good)
++ jbg->wait_good = lc;
++
++ // Local
++ // Always appended to the tail of this jbc's own wait list
++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++ jbc->lcw_head = lc;
++ else
++ lc->ljw_prev->ljw_next = lc;
++ lc->ljw_next = NULL;
++ jbc->lcw_tail = lc;
++ }
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++
++ // jb is still NULL only if we queued ourselves above
++ if (jb == NULL) // Need to wait
++ {
++ rpi_sem_wait(&lc->jw_sem);
++ jb = lc->jw_job; // Set by free code
++ }
++
++ return jb;
++}
++
++
++// Release a job, handing it straight to a waiter if there is one.
++// All work is done under jbg->lock.
++//  - If jb->jbc_local is set the job is local to that jbc: it goes to the head
++//    of that jbc's local wait list or, if nobody is waiting, is parked in
++//    jbc->jb1.
++//  - Otherwise it goes to the head of the global wait queue or, if that is
++//    empty, is pushed onto the global free chain.
++// A chosen waiter is unlinked from both the global and the local wait lists,
++// receives the job in lc->jw_job and is woken via lc->jw_sem.
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
++{
++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
++ HEVCRpiJobCtl * jbc = jb->jbc_local;
++ HEVCRpiLocalContext * lc = NULL;
++
++ pthread_mutex_lock(&jbg->lock);
++
++ if (jbc != NULL)
++ {
++ av_assert1(jbc->jb1 == NULL);
++
++ // Release to Local if nothing waiting there
++ if ((lc = jbc->lcw_head) == NULL)
++ jbc->jb1 = jb;
++ }
++ else
++ {
++ // Release to global if nothing waiting there
++ if ((lc = jbg->wait_head) == NULL)
++ {
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++ else
++ {
++ // ? seems somehow mildly ugly...
++ // Need the waiter's own jbc so its local list can be unlinked below
++ jbc = lc->context->jbc;
++ }
++ }
++
++ if (lc != NULL)
++ {
++ // Something was waiting
++
++ // Unlink
++ // Global
++ if (lc->jw_next == NULL)
++ jbg->wait_tail = lc->jw_prev;
++ else
++ lc->jw_next->jw_prev = lc->jw_prev;
++
++ if (lc->jw_prev == NULL)
++ jbg->wait_head = lc->jw_next;
++ else
++ lc->jw_prev->jw_next = lc->jw_next;
++
++ // Local
++ if (lc->ljw_next == NULL)
++ jbc->lcw_tail = lc->ljw_prev;
++ else
++ lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++ if (lc->ljw_prev == NULL)
++ jbc->lcw_head = lc->ljw_next;
++ else
++ lc->ljw_prev->ljw_next = lc->ljw_next;
++
++ // Update good if required
++ // The "last good" marker moves back to the previous element (or NULL)
++ if (jbg->wait_good == lc)
++ jbg->wait_good = lc->jw_prev;
++
++ // Prod
++ // Publish the job before posting so the woken waiter finds it
++ lc->jw_job = jb;
++ sem_post(&lc->jw_sem);
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++}
++
++// Destroy the job-wait semaphore created by job_lc_init
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++ sem_destroy(&lc->jw_sem);
++}
++
++// Put a local context into the "not waiting for a job" state: no links in
++// either wait list, no job handed over, and a job-wait semaphore with count 0.
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++    // Process-private semaphore, initially unavailable
++    sem_init(&lc->jw_sem, 0, 0);
++    lc->jw_job = NULL;
++
++    // Global wait list links
++    lc->jw_prev = NULL;
++    lc->jw_next = NULL;
++
++    // Local wait list links
++    lc->ljw_prev = NULL;
++    lc->ljw_next = NULL;
++}
++
++// Returns:
++// 0 if we have waited for MV or expect to wait for recon
++// 1 if we haven't waited for MV & do not need to wait for recon
++//
++// "Expect to wait" means some DPB entry that the job has a progress
++// requirement on (progress_req[i] >= 0) has not yet reached that value.
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++    unsigned int idx;
++
++    if (jb->waited) // reset by rpi_begin
++        return 0;
++
++    for (idx = 0; idx < FF_ARRAY_ELEMS(jb->progress_req); idx++)
++    {
++        // No requirement on this frame
++        if (jb->progress_req[idx] < 0)
++            continue;
++        // No progress tracking on this frame
++        if (s->DPB[idx].tf.progress == NULL)
++            continue;
++        // Frame has not got far enough yet
++        if (((volatile int *)(s->DPB[idx].tf.progress->data))[0] < jb->progress_req[idx])
++            return 0;
++    }
++    return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++// Records whether progress was "good" for later scheduling, detaches the job
++// from lc and then either queues it for the pass threads (offload) or runs
++// all passes inline.
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJob * const jb = lc->jb0;
++    HEVCRpiJobCtl *const jbc = s->jbc;
++
++    av_assert1(jb != NULL);
++
++    // Not full yet - keep filling
++    if (jb->ctu_ts_last < 0)
++        return;
++
++    lc->last_progress_good = progress_good(s, jb);
++    jb->waited = !lc->last_progress_good;
++    lc->jb0 = NULL;
++
++    if (!s->offload_recon)
++    {
++        pass_queue_do_all(s, jb); // Consumes job before return
++        return;
++    }
++
++    // Append to the circular offload queue
++    pthread_mutex_lock(&jbc->in_lock);
++    jbc->offloadq[jbc->offload_in] = jb;
++    jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
++    pthread_mutex_unlock(&jbc->in_lock);
++
++    pass_queue_submit_job(s->passq + 0); // Consumes job eventually
++}
++
++
++// Call worker_pass0_ready to make sure lc has a job (lc->jb0) to fill,
++// waiting for one to become available if need be.
++//
++// Now safe against multiple callers - needed for tiles
++// "normal" and WPP will only call here one at a time
++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl * const jbc = s->jbc;
++
++    // It is legit for us to already have a job allocated - only act if we don't
++    if (lc->jb0 == NULL)
++    {
++        // This sem will stop this frame grabbing too much
++        if (s->offload_recon)
++            rpi_sem_wait(&jbc->sem_out);
++
++        lc->jb0 = job_alloc(jbc, lc);
++
++        rpi_begin(s, lc->jb0, lc->ts);
++    }
++}
++
++// Free up a job without submission
++// No-op if lc holds no job. In offload mode the sem_out count taken when the
++// job was acquired is given back as well.
++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJob * const held = lc->jb0;
++    HEVCRpiJobCtl * const jbc = s->jbc;
++
++    if (held != NULL)
++    {
++        lc->jb0 = NULL;
++        job_free(jbc, held);
++
++        // If offload then poke sem_out too
++        if (s->offload_recon)
++            sem_post(&jbc->sem_out);
++    }
++}
++
++
++// Call this to wait for all jobs to have completed at the end of a frame
++// Slightly icky as there is no clean way to wait for a sem to count up:
++// we take all RPI_MAX_JOBS counts from sem_out and then give them back.
++// Not reentrant - call on main thread only
++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl * const jbc = s->jbc;
++    int avail = 0;
++    int k;
++
++    // We shouldn't reach here with an unsubmitted job
++    av_assert1(lc->jb0 == NULL);
++
++    // If no offload then there can't be anything to wait for
++    if (!s->offload_recon)
++        return;
++
++    // Nothing outstanding (or the count can't be read) - done
++    if (sem_getvalue(&jbc->sem_out, &avail) != 0 || avail >= RPI_MAX_JOBS)
++        return;
++
++    for (k = 0; k < RPI_MAX_JOBS; k++)
++        rpi_sem_wait(&jbc->sem_out);
++
++    for (k = 0; k < RPI_MAX_JOBS; k++)
++        sem_post(&jbc->sem_out);
++}
++
++// Thread entry point for one pass queue (arg is the HEVCRpiPassQueue).
++// Each wake-up on sem_in either terminates the thread (pq->terminate set) or
++// runs this pass's worker on the next job in the circular offload queue and
++// then posts the queue's output semaphore.
++// Always returns NULL.
++static void * pass_worker(void *arg)
++{
++    HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
++    HEVCRpiContext *const s = pq->context;
++
++    for (;;)
++    {
++        HEVCRpiJob * jb;
++
++        rpi_sem_wait(&pq->sem_in);
++
++        if (pq->terminate)
++            return NULL;
++
++        jb = s->jbc->offloadq[pass_queue_inc_job_n(pq)];
++        pq->worker(s, jb);
++        // * should really set jb->passes_done here
++
++        sem_post(pq->psem_out);
++    }
++}
++
++// Start one pass_worker thread per pass queue and mark each queue as started.
++// Thread creation failure aborts via av_assert0
++static void pass_queues_start_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ // NOTE(review): the thread is created inside the assert expression; this
++ // relies on av_assert0 always evaluating its argument - confirm
++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
++ pqs[i].started = 1;
++ }
++}
++
++// Stop all pass threads.
++// Done in three sweeps so that every queue is flagged before any thread is
++// woken, and every thread is woken before any is joined.
++static void pass_queues_term_all(HEVCRpiContext *const s)
++{
++    HEVCRpiPassQueue * const pqs = s->passq;
++    unsigned int n;
++
++    // 1: flag
++    for (n = 0; n < RPI_PASSES; n++)
++        pqs[n].terminate = 1;
++
++    // 2: wake the threads that are actually running
++    for (n = 0; n < RPI_PASSES; n++)
++    {
++        if (!pqs[n].started)
++            continue;
++        sem_post(&pqs[n].sem_in);
++    }
++
++    // 3: collect them
++    for (n = 0; n < RPI_PASSES; n++)
++    {
++        HEVCRpiPassQueue * const pq = pqs + n;
++
++        if (!pq->started)
++            continue;
++        pthread_join(pq->thread, NULL);
++        pq->started = 0;
++    }
++}
++
++// Destroy the input semaphore of every pass queue
++static void pass_queues_kill_all(HEVCRpiContext *const s)
++{
++    HEVCRpiPassQueue * pq = s->passq;
++    unsigned int remaining = RPI_PASSES;
++
++    while (remaining != 0)
++    {
++        sem_destroy(&pq->sem_in);
++        ++pq;
++        --remaining;
++    }
++}
++
++
++// Release the coefficient buffers of one job and zero its coeff environment.
++// The two backing allocations are not of the same kind: buffer 0 lives in
++// ordinary memory (mptr), buffer 2 in GPU memory (gptr).
++static void worker_pic_free_one(HEVCRpiJob * const jb)
++{
++    HEVCRpiCoeffsEnv * const env = &jb->coeffs;
++    const int have_arm = (env->s[0].buf != NULL);
++    const int have_gpu = (env->s[2].buf != NULL);
++
++    if (have_arm)
++        av_freep(&env->mptr);
++    if (have_gpu)
++        gpu_free(&env->gptr);
++
++    memset(env, 0, sizeof(*env));
++}
++
++// Allocate the coefficient buffers for one job.
++// Buffers 2 and 3 share one cached GPU allocation of coeff_count + 32*32
++// entries; buffer 3 starts coeff_count entries in.
++// Buffer 0 is an ordinary allocation aligned up to 64 bytes.
++// Returns 0 on success; on failure logs, frees anything allocated and returns -1.
++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
++{
++    HEVCRpiCoeffsEnv * const env = &jb->coeffs;
++
++    if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(env->s[2].buf[0]), &env->gptr) == 0)
++    {
++        env->s[2].buf = (int16_t *)env->gptr.arm;
++        env->s[3].buf = env->s[2].buf + coeff_count;
++
++        // Must be 64 byte aligned for our zero zapping code so over-allocate &
++        // round
++        env->mptr = av_malloc(coeff_count * sizeof(env->s[0].buf[0]) + 63);
++        if (env->mptr != NULL)
++        {
++            env->s[0].buf = (void *)(((intptr_t)env->mptr + 63) & ~63);
++            return 0;
++        }
++    }
++
++    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++    worker_pic_free_one(jb);
++    return -1;
++}
++
++// Mark all four coefficient buffers as empty (allocations are untouched)
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++    HEVCRpiCoeffEnv * env = cf->s;
++    HEVCRpiCoeffEnv * const env_end = cf->s + 4;
++
++    for (; env != env_end; ++env) {
++        env->n = 0;
++#if RPI_COMPRESS_COEFFS
++        env->packed = 1;
++        env->packed_n = 0;
++#endif
++    }
++}
++
++// Reserve n coefficients in buffer buf_no of the job and return their address.
++// Buffers other than 3 grow upwards from their base; buffer 3 grows downwards
++// from its base. No bounds checking is done here.
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++    HEVCRpiCoeffEnv * const env = &jb->coeffs.s[buf_no];
++    int16_t * res;
++
++    if (buf_no == 3)
++        res = env->buf - (env->n + n);
++    else
++        res = env->buf + env->n;
++
++    env->n += n;
++    return res;
++}
++
++// Block until the progress value of ref for the given field reaches val.
++// Returns immediately if ref has no progress buffer or is already far enough.
++// Otherwise the job's wait record is queued on the owning context's progress
++// state for that field and the caller sleeps on its semaphore until
++// ff_hevc_rpi_progress_signal_field releases it. jb->waited is set whenever
++// a wait actually had to be queued.
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCRpiFrame * const ref, const int val, const int field)
++{
++ // Unlocked fast-path check
++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++ sem_t * sem = NULL;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ // Re-check under the lock: progress may have advanced since the test above
++ if (((volatile int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++ av_assert1(pwait->req == -1 && pwait->next == NULL);
++ jb->waited = 1; // Remember that we had to wait for later scheduling
++
++ // Append our wait record to the tail of the list
++ pwait->req = val;
++ pwait->next = NULL;
++ if (pstate->first == NULL)
++ pstate->first = pwait;
++ else
++ pstate->last->next = pwait;
++ pstate->last = pwait;
++ sem = &pwait->sem;
++ }
++ pthread_mutex_unlock(&pstate->lock);
++
++ // Sleep outside the lock
++ if (sem != NULL) {
++ rpi_sem_wait(sem);
++ }
++ }
++}
++
++// Publish a new progress value for the current frame (s->ref) and field, then
++// wake every queued waiter whose requirement is now met (req <= val).
++// Waiters that still need more progress stay on the list in their original order.
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++ // Written before taking the lock; waiters re-check it under the same lock
++ ((int *)s->ref->tf.progress->data)[field] = val;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ {
++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++ HEVCRpiFrameProgressWait * pwait;
++
++ while ((pwait = *ppwait) != NULL) {
++ if (pwait->req > val)
++ {
++ // Not satisfied yet - keep it; it is the last kept element so far
++ ppwait = &pwait->next;
++ pstate->last = pwait;
++ }
++ else
++ {
++ // Satisfied - unlink, reset the record to its idle state and wake the waiter
++ *ppwait = pwait->next;
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_post(&pwait->sem);
++ }
++ }
++ // NOTE: if nothing was kept pstate->last is stale, but first is NULL and
++ // the wait code sets both first and last when the list is empty
++ }
++ pthread_mutex_unlock(&pstate->lock);
++}
++
++// Set up a progress state: default-attribute mutex and an empty wait list
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++    pthread_mutex_init(&pstate->lock, NULL);
++    pstate->last = NULL;
++    pstate->first = NULL;
++}
++
++// Set up a wait record in its idle state (req == -1, not linked) with a
++// semaphore whose count is 0
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++    sem_init(&pwait->sem, 0, 0);
++    pwait->next = NULL;
++    pwait->req = -1;
++}
++
++// Destroy a progress state's mutex. The wait list is expected to be empty.
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++ av_assert1(pstate->first == NULL);
++ pthread_mutex_destroy(&pstate->lock);
++}
++
++// Destroy the semaphore of a wait record
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo corresponds to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++ */
++
++/**
++ * Section 5.7
++ */
++
++// Realloc the entry point arrays
++// Ensures sh->entry_point_offset, sh->offset and sh->size can each hold at
++// least n entries. Existing contents are NOT preserved when a reallocation
++// happens. Calling with n == 0 frees all three arrays.
++// Returns 0 on success or AVERROR(ENOMEM); on failure all three arrays are
++// released so the header is never left with a partial set of tables.
++static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
++{
++    if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
++    {
++        // Round up alloc to multiple of 32
++        const int a = (n + 31) & ~31;
++
++        // We don't care about the previous contents so probably fastest to simply discard
++        av_freep(&sh->entry_point_offset);
++        av_freep(&sh->offset);
++        av_freep(&sh->size);
++
++        if (a != 0)
++        {
++            sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
++            sh->offset = av_malloc_array(a, sizeof(int));
++            sh->size = av_malloc_array(a, sizeof(int));
++
++            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
++                // Drop whichever allocations did succeed so the pointers
++                // agree with offsets_allocated == 0
++                av_freep(&sh->entry_point_offset);
++                av_freep(&sh->offset);
++                av_freep(&sh->size);
++                sh->num_entry_point_offsets = 0;
++                sh->offsets_allocated = 0;
++                return AVERROR(ENOMEM);
++            }
++        }
++
++        sh->offsets_allocated = a;
++    }
++
++    return 0;
++}
++
++/* free everything allocated by pic_arrays_init() */
++// Safe on a partially initialised context: every pointer is reset to NULL.
++// Pointers that merely alias part of another allocation are cleared, not freed.
++static void pic_arrays_free(HEVCRpiContext *s)
++{
++    // Per-CTB tables
++    av_freep(&s->deblock);
++    av_freep(&s->sao);
++    av_freep(&s->filter_slice_edges);
++    av_freep(&s->tab_slice_address);
++    av_freep(&s->qp_y_tab);
++
++    // CABAC stash: "left" lives inside the "up" allocation
++    av_freep(&s->cabac_stash_up);
++    s->cabac_stash_left = NULL;
++
++    // MV fields
++    av_freep(&s->mvf_left);
++    av_freep(&s->mvf_up);
++
++    // PCM / intra maps (is_intra is only an alias)
++    av_freep(&s->is_intra_store);
++    s->is_intra = NULL;
++    av_freep(&s->is_pcm);
++
++    // Ref pic lists
++    av_freep(&s->rpl_tab);
++    s->rpl_tab_size = 0;
++    av_freep(&s->rpl_left);
++    av_freep(&s->rpl_up);
++
++    // Boundary strengths: vertical lives inside the horizontal allocation
++    av_freep(&s->bs_horizontal);
++    s->bs_vertical = NULL;
++    av_freep(&s->bsf_stash_up);
++    av_freep(&s->bsf_stash_left);
++
++    // Slice header entry point arrays (n == 0 frees them)
++    alloc_entry_points(&s->sh, 0);
++
++    av_buffer_pool_uninit(&s->col_mvf_pool);
++}
++
++/* allocate arrays that depend on frame dimensions */
++// Returns 0 on success or AVERROR(ENOMEM). On failure everything allocated so
++// far is released through pic_arrays_free(), so the context is left clean.
++// Every allocation is checked before use, including the mvf_up / mvf_left
++// arrays, and pointers that alias part of another allocation
++// (cabac_stash_left, bs_vertical) are only derived once the base allocation
++// is known to have succeeded.
++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
++{
++    const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
++    const unsigned int width = sps->width;
++    const unsigned int height = sps->height;
++    const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) *
++                                        ((height >> log2_min_cb_size) + 1);
++    const unsigned int ctb_count = sps->ctb_size;
++    // Dimensions rounded up to max ctb size
++    const unsigned int width64 = (width + 63) & ~63;
++    const unsigned int height64 = (height + 63) & ~63;
++
++    {
++        unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
++        unsigned int h = ((height + 15) & ~15);
++
++        s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
++        s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
++    }
++
++    s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
++    s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
++    if (!s->sao || !s->deblock)
++        goto fail;
++
++    // One allocation holds both stashes: "up" first, "left" directly after it
++    s->cabac_stash_up = av_malloc((width64 >> 3) + (height64 >> 3));
++    if (s->cabac_stash_up == NULL)
++        goto fail;
++    s->cabac_stash_left = s->cabac_stash_up + (width64 >> 3);
++
++    s->mvf_up = av_malloc((width64 >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++    // * Only needed if we have H tiles
++    // NOTE(review): element size is taken from mvf_up as in the original code;
++    // assumes mvf_left has the same element type - confirm
++    s->mvf_left = av_malloc((height64 >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++    if (s->mvf_up == NULL || s->mvf_left == NULL)
++        goto fail;
++
++    // We can overread by 1 line & one byte in deblock so alloc & zero
++    // We don't need to zero the extra @ start of frame as it will never be
++    // written
++    s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++    s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++    if (s->is_pcm == NULL || s->is_intra_store == NULL)
++        goto fail;
++
++    s->filter_slice_edges = av_mallocz(ctb_count);
++    s->tab_slice_address = av_malloc_array(ctb_count,
++                                           sizeof(*s->tab_slice_address));
++    s->qp_y_tab = av_malloc_array(pic_size_in_cb,
++                                  sizeof(*s->qp_y_tab));
++    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
++        goto fail;
++
++    // One allocation holds both strength maps: horizontal first, then vertical
++    s->bs_horizontal = av_mallocz(s->bs_size * 2);
++    if (s->bs_horizontal == NULL)
++        goto fail;
++    s->bs_vertical = s->bs_horizontal + s->bs_size;
++
++    s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
++    s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
++    if (s->rpl_left == NULL || s->rpl_up == NULL)
++        goto fail;
++
++    if ((s->bsf_stash_left = av_mallocz(height64 >> 4)) == NULL ||
++        (s->bsf_stash_up = av_mallocz(width64 >> 4)) == NULL)
++        goto fail;
++
++    s->col_mvf_stride = (width + 15) >> 4;
++    s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
++                                          av_buffer_allocz);
++    if (s->col_mvf_pool == NULL)
++        goto fail;
++
++    return 0;
++
++fail:
++    pic_arrays_free(s);
++    return AVERROR(ENOMEM);
++}
++
++// Fill the slice header with the "no weighting" prediction table: both
++// log2 denominators 0, every active L0/L1 reference given a luma and chroma
++// weight of (1 << QPU_MC_DENOM) and an offset of 0.
++static void default_pred_weight_table(HEVCRpiContext * const s)
++{
++    const unsigned int unity = 1 << QPU_MC_DENOM;
++    unsigned int ref;
++    unsigned int c;
++
++    s->sh.luma_log2_weight_denom = 0;
++    s->sh.chroma_log2_weight_denom = 0;
++
++    // List 0
++    for (ref = 0; ref < s->sh.nb_refs[L0]; ++ref) {
++        s->sh.luma_weight_l0[ref] = unity;
++        s->sh.luma_offset_l0[ref] = 0;
++        for (c = 0; c != 2; ++c)
++            s->sh.chroma_weight_l0[ref][c] = unity;
++        for (c = 0; c != 2; ++c)
++            s->sh.chroma_offset_l0[ref][c] = 0;
++    }
++
++    // List 1
++    for (ref = 0; ref < s->sh.nb_refs[L1]; ++ref) {
++        s->sh.luma_weight_l1[ref] = unity;
++        s->sh.luma_offset_l1[ref] = 0;
++        for (c = 0; c != 2; ++c)
++            s->sh.chroma_weight_l1[ref][c] = unity;
++        for (c = 0; c != 2; ++c)
++            s->sh.chroma_offset_l1[ref][c] = 0;
++    }
++}
++
++// Parse the explicit prediction weights for one reference list.
++// s      decoder context (the log2 weight denominators in s->sh must already be set)
++// gb     bit reader positioned at the luma weight flags
++// refs   number of references in the list; 0 reads nothing
++// luma_weight / luma_offset      outputs, one entry per reference
++// chroma_weight / chroma_offset  outputs, two entries per reference
++// Weights are rescaled from the stream's denominator to the fixed
++// QPU_MC_DENOM; references without a flag get the unity weight and offset 0.
++// Returns 0 on success or AVERROR_INVALIDDATA if a value is out of range.
++//
++// Scaling of signed values is done by multiplication rather than by left
++// shift: left-shifting a negative value is undefined behaviour in C.
++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
++                       const unsigned int refs,
++                       int16_t * luma_weight, int16_t * luma_offset,
++                       int16_t * chroma_weight, int16_t * chroma_offset)
++{
++    unsigned int luma_flags;
++    unsigned int chroma_flags;
++    unsigned int i;
++    const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
++    const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
++    const unsigned int luma_weight_base = 1 << QPU_MC_DENOM;
++    const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM;
++    const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
++    const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
++
++    if (refs == 0)
++        return 0;
++
++    // One flag bit per reference, first reference in the most significant bit
++    luma_flags = get_bits(gb, refs);
++    chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
++    i = 1 << (refs - 1);
++
++    do
++    {
++        if ((luma_flags & i) != 0)
++        {
++            const int delta_weight = get_se_golomb(gb);
++            const int offset = get_se_golomb(gb);
++            if (delta_weight < -128 || delta_weight > 127 ||
++                offset < -wp_offset_half_range || offset >= wp_offset_half_range)
++            {
++                return AVERROR_INVALIDDATA;
++            }
++            *luma_weight++ = luma_weight_base + delta_weight * (1 << luma_weight_shift);
++            *luma_offset++ = offset * (1 << wp_offset_bd_shift);
++        }
++        else
++        {
++            *luma_weight++ = luma_weight_base;
++            *luma_offset++ = 0;
++        }
++
++        if ((chroma_flags & i) != 0)
++        {
++            unsigned int j;
++            for (j = 0; j != 2; ++j)
++            {
++                const int delta_weight = get_se_golomb(gb);
++                const int delta_offset = get_se_golomb(gb);
++
++                if (delta_weight < -128 || delta_weight > 127 ||
++                    delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
++                {
++                    return AVERROR_INVALIDDATA;
++                }
++
++                *chroma_weight++ = chroma_weight_base + delta_weight * (1 << chroma_weight_shift);
++                // Offset is derived from the delta and the weight, then
++                // clipped to the legal offset range before bit depth scaling
++                *chroma_offset++ = av_clip(
++                    wp_offset_half_range + delta_offset -
++                        ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
++                    -wp_offset_half_range, wp_offset_half_range - 1) * (1 << wp_offset_bd_shift);
++            }
++        }
++        else
++        {
++            *chroma_weight++ = chroma_weight_base;
++            *chroma_weight++ = chroma_weight_base;
++            *chroma_offset++ = 0;
++            *chroma_offset++ = 0;
++        }
++    } while ((i >>= 1) != 0);
++
++    return 0;
++}
++
++ /**
++  * Parse the slice-level prediction weight table.
++  *
++  * Reads the luma log2 weight denominator and, when ctx_cfmt(s) is non-zero,
++  * the chroma delta.  Both denominators must be <= 7.  They are stored in
++  * s->sh and the per-list weights are then read with get_weights(), L0 first
++  * and L1 only if L0 succeeded.
++  *
++  * @return 0 on success, otherwise a negative AVERROR code
++  */
++ static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
++ {
++     int rv;
++     const unsigned int denom_y = get_ue_golomb_long(gb);
++     unsigned int denom_c = 0;
++
++     // The chroma delta is only coded when chroma is present.  A negative sum
++     // wraps to a large unsigned value and is rejected by the range test below.
++     if (ctx_cfmt(s) != 0)
++         denom_c = denom_y + get_se_golomb(gb);
++
++     if (denom_y > 7 || denom_c > 7)
++     {
++         av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
++                denom_y, denom_c);
++         return AVERROR_INVALIDDATA;
++     }
++
++     s->sh.luma_log2_weight_denom = denom_y;
++     s->sh.chroma_log2_weight_denom = denom_c;
++
++     rv = get_weights(s, gb, s->sh.nb_refs[L0],
++                      s->sh.luma_weight_l0, s->sh.luma_offset_l0,
++                      s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0]);
++     if (rv == 0)
++     {
++         rv = get_weights(s, gb, s->sh.nb_refs[L1],
++                          s->sh.luma_weight_l1, s->sh.luma_offset_l1,
++                          s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0]);
++     }
++
++     if (rv != 0)
++     {
++         av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
++         return rv;
++     }
++
++     return 0;
++ }
++
++ /**
++  * Parse the long-term reference picture set of a slice header into *rps.
++  *
++  * rps->nb_refs is reset to 0 first and stays 0 when the SPS does not enable
++  * long-term reference pictures.  The first entries are taken from the SPS
++  * candidate list, the rest are coded directly in the slice header.
++  *
++  * @return 0 on success, AVERROR_INVALIDDATA when the counts are out of range
++  *         or a reconstructed POC does not fit in 32 bits
++  */
++ static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
++ {
++     const HEVCRpiSPS * const sps = s->ps.sps;
++     int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
++     unsigned int n_from_sps = 0;
++     unsigned int n_from_slice;
++     int last_delta_msb = 0;
++     int idx;
++
++     rps->nb_refs = 0;
++     if (!sps->long_term_ref_pics_present_flag)
++         return 0;
++
++     if (sps->num_long_term_ref_pics_sps > 0)
++         n_from_sps = get_ue_golomb_long(gb);
++     n_from_slice = get_ue_golomb_long(gb);
++
++     // The sum is formed in 64 bits so that two large counts cannot wrap
++     if (n_from_sps > sps->num_long_term_ref_pics_sps ||
++         n_from_slice + (uint64_t)n_from_sps > FF_ARRAY_ELEMS(rps->poc))
++         return AVERROR_INVALIDDATA;
++
++     rps->nb_refs = n_from_slice + n_from_sps;
++
++     for (idx = 0; idx < rps->nb_refs; idx++) {
++         uint8_t has_delta_msb;
++         int64_t delta_msb;
++         int64_t full_poc;
++
++         if (idx >= n_from_sps) {
++             // Entry coded explicitly in the slice header
++             rps->poc[idx]  = get_bits(gb, sps->log2_max_poc_lsb);
++             rps->used[idx] = get_bits1(gb);
++         } else {
++             // Entry selected from the SPS candidate list
++             uint8_t lt_idx_sps = 0;
++
++             if (sps->num_long_term_ref_pics_sps > 1)
++                 lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
++
++             rps->poc[idx]  = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
++             rps->used[idx] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
++         }
++
++         has_delta_msb = get_bits1(gb);
++         if (!has_delta_msb)
++             continue;
++
++         delta_msb = get_ue_golomb_long(gb);
++
++         // The delta accumulates, except on the first entry of each group
++         if (idx && idx != n_from_sps)
++             delta_msb += last_delta_msb;
++
++         full_poc = rps->poc[idx] + s->poc - delta_msb * max_poc_lsb - s->sh.pic_order_cnt_lsb;
++         if (full_poc != (int32_t)full_poc)
++             return AVERROR_INVALIDDATA;
++
++         rps->poc[idx]  = full_poc;
++         last_delta_msb = delta_msb;
++     }
++
++     return 0;
++ }
++
++ /**
++  * Copy stream-level properties from an SPS (and the VPS it refers to) into
++  * the codec context: pixel format, coded and cropped dimensions, reorder
++  * depth, profile/level, sample aspect ratio, colour description and, when
++  * timing information is present, the frame rate.
++  */
++ static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
++                                  const HEVCRpiSPS *sps)
++ {
++     const HEVCRpiVPS * const vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
++     const HEVCRpiWindow * const win = &sps->output_window;
++     unsigned int units_in_tick = 0;
++     unsigned int time_scale = 0;
++
++     avctx->pix_fmt      = sps->pix_fmt;
++     avctx->coded_width  = sps->width;
++     avctx->coded_height = sps->height;
++     // Visible size is the coded size minus the output (cropping) window
++     avctx->width        = sps->width  - win->left_offset - win->right_offset;
++     avctx->height       = sps->height - win->top_offset  - win->bottom_offset;
++     avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
++     avctx->profile      = sps->ptl.general_ptl.profile_idc;
++     avctx->level        = sps->ptl.general_ptl.level_idc;
++
++     ff_set_sar(avctx, sps->vui.sar);
++
++     // Limited (MPEG) range unless the VUI explicitly signals full range
++     avctx->color_range =
++         (sps->vui.video_signal_type_present_flag && sps->vui.video_full_range_flag) ?
++             AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
++
++     if (!sps->vui.colour_description_present_flag) {
++         avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
++         avctx->color_trc       = AVCOL_TRC_UNSPECIFIED;
++         avctx->colorspace      = AVCOL_SPC_UNSPECIFIED;
++     } else {
++         avctx->color_primaries = sps->vui.colour_primaries;
++         avctx->color_trc       = sps->vui.transfer_characteristic;
++         avctx->colorspace      = sps->vui.matrix_coeffs;
++     }
++
++     // VPS timing takes precedence over VUI timing
++     if (vps->vps_timing_info_present_flag) {
++         units_in_tick = vps->vps_num_units_in_tick;
++         time_scale    = vps->vps_time_scale;
++     } else if (sps->vui.vui_timing_info_present_flag) {
++         units_in_tick = sps->vui.vui_num_units_in_tick;
++         time_scale    = sps->vui.vui_time_scale;
++     }
++
++     if (units_in_tick != 0 && time_scale != 0)
++         av_reduce(&avctx->framerate.den, &avctx->framerate.num,
++                   units_in_tick, time_scale, 1 << 30);
++ }
++
++ /**
++  * Negotiate the output pixel format for an SPS.
++  *
++  * Only the software format of the SPS is offered (no hardware formats).
++  *
++  * @return the format chosen by ff_thread_get_format(), or AV_PIX_FMT_NONE
++  *         when the SPS itself carries no format
++  */
++ static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++ {
++     enum AVPixelFormat candidates[4];
++
++     // Admit to no h/w formats: a single entry followed by the terminator
++     candidates[0] = sps->pix_fmt;
++     candidates[1] = AV_PIX_FMT_NONE;
++
++     if (candidates[0] == AV_PIX_FMT_NONE)
++         return AV_PIX_FMT_NONE;
++
++     return ff_thread_get_format(s->avctx, candidates);
++ }
++
++ /**
++  * Tell whether this decoder can handle the given SPS: the pixel format must
++  * be a sand format and the picture must not exceed the maximum dimensions.
++  *
++  * @return 1 if supported, 0 otherwise
++  */
++ static int is_sps_supported(const HEVCRpiSPS * const sps)
++ {
++     if (!av_rpi_is_sand_format(sps->pix_fmt))
++         return 0;
++     if (sps->width > HEVC_RPI_MAX_WIDTH)
++         return 0;
++     return sps->height <= HEVC_RPI_MAX_HEIGHT;
++ }
++
++ /**
++  * Make `sps` the active SPS of the decoder.
++  *
++  * Frees the per-picture arrays and clears the active SPS/VPS first.  With a
++  * non-NULL, supported SPS it then re-allocates the picture arrays, exports
++  * the stream parameters to the codec context, initialises the prediction,
++  * DSP and QPU function tables for the bit depth and (re)allocates the SAO
++  * line buffers.
++  *
++  * @param s        decoder context
++  * @param sps      SPS to activate, or NULL to just deactivate the current one
++  * @param pix_fmt  pixel format to store in the codec context
++  * @return 0 on success, AVERROR_DECODER_NOT_FOUND if the SPS is unsupported,
++  *         AVERROR(ENOMEM) if a SAO buffer cannot be allocated, or the error
++  *         from pic_arrays_init()
++  */
++ static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++                    const enum AVPixelFormat pix_fmt)
++ {
++     int ret;
++
++     pic_arrays_free(s);
++     s->ps.sps = NULL;
++     s->ps.vps = NULL;
++
++     if (sps == NULL)
++         return 0;
++
++     if (!is_sps_supported(sps))
++         return AVERROR_DECODER_NOT_FOUND;
++
++     ret = pic_arrays_init(s, sps);
++     if (ret < 0)
++         goto fail;
++
++     export_stream_params(s->avctx, &s->ps, sps);
++
++     // Overrides the format export_stream_params() copied from the SPS
++     s->avctx->pix_fmt = pix_fmt;
++
++     ff_hevc_rpi_pred_init(&s->hpc,     sps->bit_depth);
++     ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++
++     // * We don't support cross_component_prediction_enabled_flag but as that
++     //   must be 0 unless we have 4:4:4 there is no point testing for it as we
++     //   only deal with sand which is never 4:4:4
++     //   [support wouldn't be hard]
++
++     rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++     av_freep(&s->sao_pixel_buffer_h[0]);
++     av_freep(&s->sao_pixel_buffer_v[0]);
++
++     if (sps->sao_enabled)
++     {
++         // NOTE(review): ctx_cfmt()/ctx_hshift()/ctx_vshift() are called here
++         // while s->ps.sps is still NULL - presumably they do not dereference
++         // it; confirm against their definitions
++         const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
++         unsigned int c_idx;
++         size_t vsize[3] = {0};
++         size_t hsize[3] = {0};
++
++         for(c_idx = 0; c_idx < c_count; c_idx++) {
++             int w = sps->width >> ctx_hshift(s, c_idx);
++             int h = sps->height >> ctx_vshift(s, c_idx);
++             // ctb height & width are a min of 8 so this must a multiple of 16
++             // so no point rounding up!
++             hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++             vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++         }
++
++         // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
++         // when we have plaited chroma
++         s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++         s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++
++         // Without this check the derived [1]/[2] pointers below would be
++         // offsets from NULL and be used later as if they were valid buffers
++         if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL)
++         {
++             av_freep(&s->sao_pixel_buffer_h[0]);
++             av_freep(&s->sao_pixel_buffer_v[0]);
++             ret = AVERROR(ENOMEM);
++             goto fail;
++         }
++
++         s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++         s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++         s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++         s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++     }
++
++     s->ps.sps = sps;
++     s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++     return 0;
++
++ fail:
++     pic_arrays_free(s);
++     s->ps.sps = NULL;
++     return ret;
++ }
++
++ /** Return 1 when a chroma QP offset lies in the range [-12, 12], else 0. */
++ static inline int qp_offset_valid(const int qp_offset)
++ {
++     return !(qp_offset < -12 || qp_offset > 12);
++ }
++
++ /**
++  * Parse a slice segment header from the local context's bit reader into s->sh.
++  *
++  * Besides filling the slice header this selects the active PPS, switches the
++  * active SPS when the PPS refers to a different one (re-initialising the
++  * decoder through set_sps() and renegotiating the pixel format), updates the
++  * picture order count and sets s->slice_initialized on success.
++  *
++  * @return 0 on success, a negative AVERROR code on invalid or unsupported data
++  */
++ static int hls_slice_header(HEVCRpiContext * const s)
++ {
++     GetBitContext * const gb = &s->HEVClc->gb;
++     RpiSliceHeader * const sh = &s->sh;
++     int i, ret;
++
++     // Coded parameters
++     sh->first_slice_in_pic_flag = get_bits1(gb);
++     if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++         s->seq_decode = (s->seq_decode + 1) & 0xff;
++         s->max_ra = INT_MAX;
++         if (IS_IDR(s))
++             ff_hevc_rpi_clear_refs(s);
++     }
++     sh->no_output_of_prior_pics_flag = 0;
++     if (IS_IRAP(s))
++         sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++     sh->pps_id = get_ue_golomb_long(gb);
++     if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++         av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++         return AVERROR_INVALIDDATA;
++     }
++     if (!sh->first_slice_in_pic_flag &&
++         s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++         av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++         return AVERROR_INVALIDDATA;
++     }
++     s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
++     if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
++         sh->no_output_of_prior_pics_flag = 1;
++
++     // The PPS refers to a different SPS than the active one: switch over
++     if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
++         const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
++         const HEVCRpiSPS *last_sps = s->ps.sps;
++         enum AVPixelFormat pix_fmt;
++
++         if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
++             if (sps->width != last_sps->width || sps->height != last_sps->height ||
++                 sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
++                 last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
++                 sh->no_output_of_prior_pics_flag = 0;
++         }
++         ff_hevc_rpi_clear_refs(s);
++
++         ret = set_sps(s, sps, sps->pix_fmt);
++         if (ret < 0)
++             return ret;
++
++         // The format is negotiated after set_sps() and only stored in the
++         // codec context; set_sps() is not run a second time with it
++         pix_fmt = get_format(s, sps);
++         if (pix_fmt < 0)
++             return pix_fmt;
++
++         s->avctx->pix_fmt = pix_fmt;
++
++         s->seq_decode = (s->seq_decode + 1) & 0xff;
++         s->max_ra = INT_MAX;
++     }
++
++     sh->dependent_slice_segment_flag = 0;
++     if (!sh->first_slice_in_pic_flag) {
++         int slice_address_length;
++
++         if (s->ps.pps->dependent_slice_segments_enabled_flag)
++             sh->dependent_slice_segment_flag = get_bits1(gb);
++
++         slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
++         sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
++         if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
++             av_log(s->avctx, AV_LOG_ERROR,
++                    "Invalid slice segment address: %u.\n",
++                    sh->slice_segment_addr);
++             return AVERROR_INVALIDDATA;
++         }
++
++         if (!sh->dependent_slice_segment_flag) {
++             sh->slice_addr = sh->slice_segment_addr;
++             s->slice_idx++;
++         }
++     } else {
++         sh->slice_segment_addr = sh->slice_addr = 0;
++         s->slice_idx = 0;
++         s->slice_initialized = 0;
++     }
++
++     if (!sh->dependent_slice_segment_flag) {
++         s->slice_initialized = 0;
++
++         for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
++             skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
++
++         sh->slice_type = get_ue_golomb_long(gb);
++         if (!(sh->slice_type == HEVC_SLICE_I ||
++               sh->slice_type == HEVC_SLICE_P ||
++               sh->slice_type == HEVC_SLICE_B)) {
++             av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
++                    sh->slice_type);
++             return AVERROR_INVALIDDATA;
++         }
++         if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
++             av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
++             return AVERROR_INVALIDDATA;
++         }
++
++         // when flag is not present, picture is inferred to be output
++         sh->pic_output_flag = 1;
++         if (s->ps.pps->output_flag_present_flag)
++             sh->pic_output_flag = get_bits1(gb);
++
++         if (s->ps.sps->separate_colour_plane_flag)
++             sh->colour_plane_id = get_bits(gb, 2);
++
++         if (!IS_IDR(s)) {
++             int poc, pos;
++
++             sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
++             poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
++             if (!sh->first_slice_in_pic_flag && poc != s->poc) {
++                 av_log(s->avctx, AV_LOG_WARNING,
++                        "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
++                 if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                     return AVERROR_INVALIDDATA;
++                 poc = s->poc;
++             }
++             s->poc = poc;
++
++             sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
++             pos = get_bits_left(gb);
++             if (!sh->short_term_ref_pic_set_sps_flag) {
++                 ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
++                 if (ret < 0)
++                     return ret;
++
++                 sh->short_term_rps = &sh->slice_rps;
++             } else {
++                 int numbits, rps_idx;
++
++                 if (!s->ps.sps->nb_st_rps) {
++                     av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
++                     return AVERROR_INVALIDDATA;
++                 }
++
++                 numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
++                 rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
++                 sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
++             }
++             // Number of bits the short-term RPS occupied in the header
++             sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++             pos = get_bits_left(gb);
++             ret = decode_lt_rps(s, &sh->long_term_rps, gb);
++             if (ret < 0) {
++                 // Only fatal when the caller asked for strict error handling
++                 av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
++                 if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                     return AVERROR_INVALIDDATA;
++             }
++             sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++             if (s->ps.sps->sps_temporal_mvp_enabled_flag)
++                 sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
++             else
++                 sh->slice_temporal_mvp_enabled_flag = 0;
++         } else {
++             s->sh.short_term_rps = NULL;
++             s->poc = 0;
++         }
++
++         /* 8.3.1 */
++         if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
++             s->nal_unit_type != HEVC_NAL_TRAIL_N &&
++             s->nal_unit_type != HEVC_NAL_TSA_N &&
++             s->nal_unit_type != HEVC_NAL_STSA_N &&
++             s->nal_unit_type != HEVC_NAL_RADL_N &&
++             s->nal_unit_type != HEVC_NAL_RADL_R &&
++             s->nal_unit_type != HEVC_NAL_RASL_N &&
++             s->nal_unit_type != HEVC_NAL_RASL_R)
++             s->pocTid0 = s->poc;
++
++         if (s->ps.sps->sao_enabled) {
++             sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
++             if (ctx_cfmt(s) != 0) {
++                 // One coded flag covers both chroma planes
++                 sh->slice_sample_adaptive_offset_flag[1] =
++                 sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
++             }
++         } else {
++             sh->slice_sample_adaptive_offset_flag[0] = 0;
++             sh->slice_sample_adaptive_offset_flag[1] = 0;
++             sh->slice_sample_adaptive_offset_flag[2] = 0;
++         }
++
++         sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
++         if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
++             int nb_refs;
++
++             sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
++             if (sh->slice_type == HEVC_SLICE_B)
++                 sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
++
++             if (get_bits1(gb)) { // num_ref_idx_active_override_flag
++                 sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
++                 if (sh->slice_type == HEVC_SLICE_B)
++                     sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
++             }
++             if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
++                 av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
++                        sh->nb_refs[L0], sh->nb_refs[L1]);
++                 return AVERROR_INVALIDDATA;
++             }
++
++             sh->rpl_modification_flag[0] = 0;
++             sh->rpl_modification_flag[1] = 0;
++             nb_refs = ff_hevc_rpi_frame_nb_refs(s);
++             if (!nb_refs) {
++                 av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
++                 return AVERROR_INVALIDDATA;
++             }
++
++             if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
++                 sh->rpl_modification_flag[0] = get_bits1(gb);
++                 if (sh->rpl_modification_flag[0]) {
++                     for (i = 0; i < sh->nb_refs[L0]; i++)
++                         sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                 }
++
++                 if (sh->slice_type == HEVC_SLICE_B) {
++                     sh->rpl_modification_flag[1] = get_bits1(gb);
++                     if (sh->rpl_modification_flag[1] == 1)
++                         for (i = 0; i < sh->nb_refs[L1]; i++)
++                             sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                 }
++             }
++
++             if (sh->slice_type == HEVC_SLICE_B)
++                 sh->mvd_l1_zero_flag = get_bits1(gb);
++
++             if (s->ps.pps->cabac_init_present_flag)
++                 sh->cabac_init_flag = get_bits1(gb);
++             else
++                 sh->cabac_init_flag = 0;
++
++             sh->collocated_ref_idx = 0;
++             if (sh->slice_temporal_mvp_enabled_flag) {
++                 sh->collocated_list = L0;
++                 if (sh->slice_type == HEVC_SLICE_B)
++                     sh->collocated_list = !get_bits1(gb);
++
++                 if (sh->nb_refs[sh->collocated_list] > 1) {
++                     sh->collocated_ref_idx = get_ue_golomb_long(gb);
++                     if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++                         av_log(s->avctx, AV_LOG_ERROR,
++                                "Invalid collocated_ref_idx: %d.\n",
++                                sh->collocated_ref_idx);
++                         return AVERROR_INVALIDDATA;
++                     }
++                 }
++             }
++
++             if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
++                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
++             {
++                 if ((ret = pred_weight_table(s, gb)) != 0)
++                     return ret;
++             }
++             else
++             {
++                 // Give us unit weights
++                 default_pred_weight_table(s);
++             }
++
++             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++                 av_log(s->avctx, AV_LOG_ERROR,
++                        "Invalid number of merging MVP candidates: %d.\n",
++                        sh->max_num_merge_cand);
++                 return AVERROR_INVALIDDATA;
++             }
++         }
++
++         sh->slice_qp_delta = get_se_golomb(gb);
++
++         if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++             sh->slice_cb_qp_offset = get_se_golomb(gb);
++             sh->slice_cr_qp_offset = get_se_golomb(gb);
++             if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
++                 !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
++                 !qp_offset_valid(sh->slice_cr_qp_offset) ||
++                 !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
++             {
++                 // Report the PPS cb/cr pair first, then the slice cb/cr pair,
++                 // matching the labels in the message
++                 av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n",
++                        s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset,
++                        sh->slice_cb_qp_offset, sh->slice_cr_qp_offset);
++                 return AVERROR_INVALIDDATA;
++             }
++         } else
++         {
++             sh->slice_cb_qp_offset = 0;
++             sh->slice_cr_qp_offset = 0;
++         }
++
++         if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++             sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++         else
++             sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++         if (s->ps.pps->deblocking_filter_control_present_flag) {
++             int deblocking_filter_override_flag = 0;
++
++             if (s->ps.pps->deblocking_filter_override_enabled_flag)
++                 deblocking_filter_override_flag = get_bits1(gb);
++
++             if (deblocking_filter_override_flag) {
++                 sh->disable_deblocking_filter_flag = get_bits1(gb);
++                 if (!sh->disable_deblocking_filter_flag) {
++                     int beta_offset_div2 = get_se_golomb(gb);
++                     int tc_offset_div2   = get_se_golomb(gb) ;
++                     if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++                         tc_offset_div2   < -6 || tc_offset_div2   > 6) {
++                         av_log(s->avctx, AV_LOG_ERROR,
++                                "Invalid deblock filter offsets: %d, %d\n",
++                                beta_offset_div2, tc_offset_div2);
++                         return AVERROR_INVALIDDATA;
++                     }
++                     sh->beta_offset = beta_offset_div2 * 2;
++                     sh->tc_offset   = tc_offset_div2 * 2;
++                 }
++             } else {
++                 sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++                 sh->beta_offset = s->ps.pps->beta_offset;
++                 sh->tc_offset   = s->ps.pps->tc_offset;
++             }
++         } else {
++             sh->disable_deblocking_filter_flag = 0;
++             sh->beta_offset = 0;
++             sh->tc_offset   = 0;
++         }
++
++         if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++             (sh->slice_sample_adaptive_offset_flag[0] ||
++              sh->slice_sample_adaptive_offset_flag[1] ||
++              !sh->disable_deblocking_filter_flag)) {
++             sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++         } else {
++             sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++         }
++         // Boundaries across which in-loop filtering is disabled for this slice
++         sh->no_dblk_boundary_flags =
++             (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
++                  BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
++             (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
++                  BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
++
++
++     } else if (!s->slice_initialized) {
++         av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
++         return AVERROR_INVALIDDATA;
++     }
++
++     sh->num_entry_point_offsets = 0;
++     sh->offload_wpp = 0;
++     sh->offload_tiles = 0;
++
++     if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
++         unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
++         // It would be possible to bound this tighter but this here is simpler
++         if (num_entry_point_offsets > get_bits_left(gb)) {
++             av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
++             return AVERROR_INVALIDDATA;
++         }
++
++         sh->num_entry_point_offsets = num_entry_point_offsets;
++         if (sh->num_entry_point_offsets > 0) {
++             int offset_len = get_ue_golomb_long(gb) + 1;
++
++             if (offset_len < 1 || offset_len > 32) {
++                 sh->num_entry_point_offsets = 0;
++                 av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
++                 return AVERROR_INVALIDDATA;
++             }
++
++             if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
++             {
++                 av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
++                 return ret;
++             }
++
++             for (i = 0; i < sh->num_entry_point_offsets; i++) {
++                 uint32_t val_minus1 = get_bits_long(gb, offset_len);
++                 if (val_minus1 > (1 << 28))
++                 {
++                     // We can declare offsets of > 2^28 bad without loss of generality
++                     // Will check actual bounds wrt NAL later, but this keeps
++                     // the values within bounds we can deal with easily
++                     av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
++                     return AVERROR_INVALIDDATA;
++                 }
++                 sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
++             }
++
++             // Do we want to offload this
++             if (s->threads_type != 0)
++             {
++                 sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
++                     s->ps.pps->num_tile_columns > 1;
++                 // * We only cope with WPP in a single column
++                 //   Probably want to deal with that case as tiles rather than WPP anyway
++                 // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++                 sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
++                     s->ps.pps->num_tile_columns == 1;
++             }
++         }
++     }
++
++     if (s->ps.pps->slice_header_extension_present_flag) {
++         unsigned int length = get_ue_golomb_long(gb);
++         if (length*8LL > get_bits_left(gb)) {
++             av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
++             return AVERROR_INVALIDDATA;
++         }
++         for (i = 0; i < length; i++)
++             skip_bits(gb, 8);  // slice_header_extension_data_byte
++     }
++
++     // Inferred parameters
++     sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
++     if (sh->slice_qp > 51 ||
++         sh->slice_qp < -s->ps.sps->qp_bd_offset) {
++         av_log(s->avctx, AV_LOG_ERROR,
++                "The slice_qp %d is outside the valid range "
++                "[%d, 51].\n",
++                sh->slice_qp,
++                -s->ps.sps->qp_bd_offset);
++         return AVERROR_INVALIDDATA;
++     }
++
++     if (get_bits_left(gb) < 0) {
++         av_log(s->avctx, AV_LOG_ERROR,
++                "Overread slice header by %d bits\n", -get_bits_left(gb));
++         return AVERROR_INVALIDDATA;
++     }
++
++     s->slice_initialized = 1;
++     return 0;
++ }
++
++ /**
++  * Decode the SAO parameters of the CTB at CTB coordinates (rx, ry) into the
++  * matching entry of s->sao.
++  *
++  * When SAO is active for luma or chroma in this slice, the parameters may be
++  * copied from the left or the upper CTB (merge) if that neighbour is marked
++  * available in lc->ctb_avail.  Otherwise each colour component is decoded
++  * in turn.
++  */
++ static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
++ {
++     RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
++     int c_idx;
++     int k;
++
++     if (s->sh.slice_sample_adaptive_offset_flag[0] ||
++         s->sh.slice_sample_adaptive_offset_flag[1])
++     {
++         // Merge left: copy everything from the previous CTB in the row
++         if ((lc->ctb_avail & AVAIL_L) != 0 &&
++             ff_hevc_rpi_sao_merge_flag_decode(lc))
++         {
++             *sao = sao[-1];
++             return;
++         }
++         // Merge up: copy everything from the CTB one row above
++         if ((lc->ctb_avail & AVAIL_U) != 0 &&
++             ff_hevc_rpi_sao_merge_flag_decode(lc))
++         {
++             *sao = sao[-(int)s->ps.sps->ctb_width];
++             return;
++         }
++     }
++
++     for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
++         const unsigned int log2_sao_offset_scale =
++             (c_idx != 0) ? s->ps.pps->log2_sao_offset_scale_chroma :
++                            s->ps.pps->log2_sao_offset_scale_luma;
++         int magnitude[4];
++         char negative[4] = {0};
++
++         if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
++             sao->type_idx[c_idx] = SAO_NOT_APPLIED;
++             continue;
++         }
++
++         // The second chroma plane reuses the type and edge class of the first
++         if (c_idx != 2) {
++             sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
++         } else {
++             sao->type_idx[2] = sao->type_idx[1];
++             sao->eo_class[2] = sao->eo_class[1];
++         }
++
++         // ** Could use BY22 here quite plausibly - this is all bypass stuff
++         //    though only per CTB so not very timing critical
++
++         if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
++             continue;
++
++         for (k = 0; k < 4; k++)
++             magnitude[k] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
++
++         if (sao->type_idx[c_idx] == SAO_BAND) {
++             // Signs are only coded for non-zero band offsets
++             for (k = 0; k < 4; k++) {
++                 if (magnitude[k] != 0)
++                     negative[k] = ff_hevc_rpi_sao_offset_sign_decode(lc);
++             }
++             sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
++         } else if (c_idx != 2) {
++             sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
++         }
++
++         // Inferred parameters
++         sao->offset_val[c_idx][0] = 0;
++         for (k = 0; k < 4; k++) {
++             // Edge offsets: the last two entries are negated; band offsets
++             // use the decoded sign
++             const int negate = (sao->type_idx[c_idx] == SAO_EDGE) ? (k > 1) : (negative[k] != 0);
++
++             sao->offset_val[c_idx][k + 1] = magnitude[k] << log2_sao_offset_scale;
++             if (negate)
++                 sao->offset_val[c_idx][k + 1] = -sao->offset_val[c_idx][k + 1];
++         }
++     }
++ }
++
++ #if 0
++ // Disabled: decodes the cross-component prediction scale into
++ // lc->tu.res_scale_val (0, or +/- a power of two).  Always returns 0.
++ static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
++     const int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);  // 0..4
++
++     if (log2_res_scale_abs_plus1 == 0) {
++         lc->tu.res_scale_val = 0;
++     } else {
++         const int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
++         lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
++                                (1 - 2 * res_scale_sign_flag);
++     }
++
++     return 0;
++ }
++ #endif
++
++ /**
++  * Take the next free slot of the job's intra command array and advance the
++  * command count.  No bounds check is performed here.
++  */
++ static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
++ {
++     HEVCPredCmd * const slot = &jb->intra.cmds[jb->intra.n];
++
++     jb->intra.n++;
++     return slot;
++ }
++
++ // Construction of tb_flags: a 16x16 table of AVAIL_* neighbour flags, one
++ // entry per position, indexed as x + y * 16.
++ //
++ // A0 writes a single entry.  Each of A1..A4 covers a square made of four
++ // squares of the previous level, listed as top-left, top-right, bottom-left
++ // and bottom-right, and derives the (up, left, up-left, up-right, down-left)
++ // arguments of every quadrant from those of the enclosing square.
++ #define A0(x, y, u, l, ul, ur, dl) \
++     [(x) + (y) * 16] = (((u)  ? AVAIL_U  : 0) | \
++                         ((l)  ? AVAIL_L  : 0) | \
++                         ((ul) ? AVAIL_UL : 0) | \
++                         ((ur) ? AVAIL_UR : 0) | \
++                         ((dl) ? AVAIL_DL : 0))
++
++ #define A1(x, y, u, l, ul, ur, dl) \
++     A0((x) + 0, (y) + 0, (u), (l), (ul), (u),  (l) ), \
++     A0((x) + 1, (y) + 0, (u), 1,   (u),  (ur), 0   ), \
++     A0((x) + 0, (y) + 1, 1,   (l), (l),  1,    (dl)), \
++     A0((x) + 1, (y) + 1, 1,   1,   1,    0,    0   )
++
++ #define A2(x, y, u, l, ul, ur, dl) \
++     A1((x) + 0, (y) + 0, (u), (l), (ul), (u),  (l) ), \
++     A1((x) + 2, (y) + 0, (u), 1,   (u),  (ur), 0   ), \
++     A1((x) + 0, (y) + 2, 1,   (l), (l),  1,    (dl)), \
++     A1((x) + 2, (y) + 2, 1,   1,   1,    0,    0   )
++
++ #define A3(x, y, u, l, ul, ur, dl) \
++     A2((x) + 0, (y) + 0, (u), (l), (ul), (u),  (l) ), \
++     A2((x) + 4, (y) + 0, (u), 1,   (u),  (ur), 0   ), \
++     A2((x) + 0, (y) + 4, 1,   (l), (l),  1,    (dl)), \
++     A2((x) + 4, (y) + 4, 1,   1,   1,    0,    0   )
++
++ #define A4(x, y, u, l, ul, ur, dl) \
++     A3((x) + 0, (y) + 0, (u), (l), (ul), (u),  (l) ), \
++     A3((x) + 8, (y) + 0, (u), 1,   (u),  (ur), 0   ), \
++     A3((x) + 0, (y) + 8, 1,   (l), (l),  1,    (dl)), \
++     A3((x) + 8, (y) + 8, 1,   1,   1,    0,    0   )
++
++ // The outermost square is built with nothing available outside it
++ static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
++
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
++{
++ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
++ const unsigned int tb_x = x & ~ctb_mask;
++ const unsigned int tb_y = y & ~ctb_mask;
++ const unsigned int ctb_avail = lc->ctb_avail;
++
++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
++
++ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
++
++ // This deals with both the U & L edges
++ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
++ f |= AVAIL_UL;
++
++ if (x + w < lc->end_of_ctb_x)
++ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
++ else if (tb_y == 0)
++ f |= (ctb_avail & AVAIL_UR);
++#if AVAIL_S_U - AVAIL_S_UR < 0
++#error Shift problem
++#endif
++
++ // Never any D if Y beyond eoctb
++ if (y + h < lc->end_of_ctb_y)
++ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
++#if AVAIL_S_DL - AVAIL_S_L < 0
++#error Shift problem
++#endif
++
++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
++// lc->end_of_ctb_x, lc->end_of_ctb_y);
++
++ return f;
++}
++
++#undef A0
++#undef A1
++#undef A2
++#undef A3
++#undef A4
++
++ /**
++  * Queue an intra prediction command on the local context's job (lc->jb0).
++  *
++  * Only c_idx 0 and 1 produce a command; any other component index is ignored.
++  */
++ static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
++                           unsigned int avail)
++ {
++     HEVCPredCmd * cmd;
++
++     // If rpi_enabled then sand - U & V done on U call
++     if (c_idx > 1)
++         return;
++
++     cmd = rpi_new_intra_cmd(lc->jb0);
++     cmd->type = RPI_PRED_INTRA + c_idx;
++     cmd->size = log2_trafo_size;
++     cmd->avail = avail;
++     cmd->i_pred.x = x0;
++     cmd->i_pred.y = y0;
++     if (c_idx)
++         cmd->i_pred.mode = lc->tu.intra_pred_mode_c;
++     else
++         cmd->i_pred.mode = lc->tu.intra_pred_mode;
++ }
++
++ // Bit positions of the chroma coded-block flags within a cbf_chroma word
++ #define CBF_CB0_S 0
++ #define CBF_CB1_S 1 // CB1 must be CB0 + 1
++ #define CBF_CR0_S 2
++ #define CBF_CR1_S 3
++
++ // The corresponding single-bit masks
++ #define CBF_CB0 (1 << CBF_CB0_S)
++ #define CBF_CB1 (1 << CBF_CB1_S)
++ #define CBF_CR0 (1 << CBF_CR0_S)
++ #define CBF_CR1 (1 << CBF_CR1_S)
++
++ // * Only good for chroma_idx == 1
++ /**
++  * Decode one transform unit.
++  *
++  * For intra CUs the intra prediction commands are queued first (luma always;
++  * chroma once per block when log2_trafo_size > 2, otherwise only with the
++  * fourth 4x4 luma block, covering the enclosing 8x8 area).  Then, if any
++  * coded-block flag is set, the pending cu_qp_delta and chroma QP offset are
++  * decoded and the luma, Cb and Cr residuals are read in that order.
++  *
++  * @return 0 on success, AVERROR_INVALIDDATA if cu_qp_delta is out of range
++  */
++ static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
++                               const unsigned int blk_idx, const int cbf_luma,
++                               const unsigned int cbf_chroma)
++ {
++     const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
++     // Chroma position is aligned down to a multiple of 8 luma samples
++     const unsigned int x0_c = x0 & ~7;
++     const unsigned int y0_c = y0 & ~7;
++     // Chroma is handled with every block larger than 4x4, or with the last
++     // of four 4x4 blocks
++     const int chroma_here = (log2_trafo_size > 2 || blk_idx == 3);
++
++     enum ScanType luma_scan = SCAN_DIAG;
++     enum ScanType chroma_scan = SCAN_DIAG;
++
++     if (lc->cu.pred_mode == MODE_INTRA)
++     {
++         const unsigned int trafo_size = 1 << log2_trafo_size;
++         const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
++
++         do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
++
++         if (log2_trafo_size > 2) {
++             do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
++         }
++         else if (blk_idx == 3) {
++             // Availability is recomputed for the whole 8x8 area
++             const unsigned int avail_c = ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8);
++
++             do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail_c);
++         }
++
++         // Small intra blocks choose the coefficient scan from the prediction mode
++         if (log2_trafo_size < 4) {
++             if (lc->tu.intra_pred_mode >= 6 && lc->tu.intra_pred_mode <= 14)
++                 luma_scan = SCAN_VERT;
++             else if (lc->tu.intra_pred_mode >= 22 && lc->tu.intra_pred_mode <= 30)
++                 luma_scan = SCAN_HORIZ;
++
++             if (lc->tu.intra_pred_mode_c >= 6 && lc->tu.intra_pred_mode_c <= 14)
++                 chroma_scan = SCAN_VERT;
++             else if (lc->tu.intra_pred_mode_c >= 22 && lc->tu.intra_pred_mode_c <= 30)
++                 chroma_scan = SCAN_HORIZ;
++         }
++     }
++
++     // Nothing coded for this unit
++     if (cbf_luma == 0 && cbf_chroma == 0)
++         return 0;
++
++     if (lc->tu.is_cu_qp_delta_wanted)
++     {
++         const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
++         const unsigned int cb_mask = ~0U << log2_cb_size;
++
++         if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
++             qp_delta >  (25 + (s->ps.sps->qp_bd_offset >> 1)))
++         {
++             av_log(s->avctx, AV_LOG_ERROR,
++                    "The cu_qp_delta %d is outside the valid range "
++                    "[%d, %d].\n",
++                    qp_delta,
++                    -(26 + (s->ps.sps->qp_bd_offset >> 1)),
++                     (25 + (s->ps.sps->qp_bd_offset >> 1)));
++             return AVERROR_INVALIDDATA;
++         }
++
++         lc->tu.is_cu_qp_delta_wanted = 0;
++         lc->tu.cu_qp_delta = qp_delta;
++         // QP is set for the coding block that contains this unit
++         ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
++     }
++
++     // * Not main profile & untested due to no conform streams
++     if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
++         !lc->cu.cu_transquant_bypass_flag)
++     {
++         if (ff_hevc_rpi_cu_chroma_qp_offset_flag(lc)) {
++             int list_idx = 0;
++
++             if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0)
++                 list_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
++
++             lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[list_idx];
++             lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[list_idx];
++         }
++         lc->tu.cu_chroma_qp_offset_wanted = 0;
++     }
++
++     if (cbf_luma != 0)
++         ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, luma_scan, 0);
++
++     if (chroma_here)
++     {
++         if ((cbf_chroma & CBF_CB0) != 0)
++             ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++                                             log2_trafo_size_c, chroma_scan, 1);
++         if ((cbf_chroma & CBF_CR0) != 0)
++             ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++                                             log2_trafo_size_c, chroma_scan, 2);
++     }
++
++     return 0;
++ }
++
++// Sets the bits covering one coding block in the s->is_pcm bitmap.
++// The position is converted to 8-pel units and the size to log2_cb_size - 3
++// before being passed to set_bits(); the bitmap stride is sps->pcm_width.
++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
++{
++    const int x8 = x0 >> 3;
++    const int y8 = y0 >> 3;
++    const int log2_size8 = log2_cb_size - 3;
++
++    set_bits(s->is_pcm + y8 * s->ps.sps->pcm_width,
++             x8,
++             s->ps.sps->pcm_width,
++             log2_size8);
++}
++
++
++// Parses one node of a transform tree whose top-left luma sample is (x0, y0).
++//
++// Steps, in bitstream order:
++//  1. Select the luma/chroma intra prediction modes for this node from lc->pu.
++//  2. Decode split_transform_flag, or infer it when it is not coded.
++//  3. Decode the chroma cbf flags that are still live (those set in cbf_c0).
++//  4. Either recurse into the four quadrants, or decode cbf_luma, call
++//     hls_transform_unit() and (unless deblocking is disabled for the slice)
++//     ff_hevc_rpi_deblocking_boundary_strengths().
++//
++// log2_trafo_size - log2 size of this node
++// trafo_depth     - depth of this node within the tree (0 at the root)
++// blk_idx         - index (0..3) of this node within its parent
++// cbf_c0          - chroma cbf bits inherited from the parent node
++//
++// Returns 0 on success or the negative error returned by a child call.
++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size,
++ const unsigned int trafo_depth, const unsigned int blk_idx,
++ const unsigned int cbf_c0)
++{
++ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1
++ unsigned int cbf_c1 = cbf_c0;
++ int split_transform_flag;
++ int ret;
++
++ // With intra_split_flag set the per-partition modes are picked up at
++ // depth 1; otherwise partition 0 supplies the modes for the whole CU.
++ if (lc->cu.intra_split_flag) {
++ if (trafo_depth == 1) {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
++ // Only chroma format 3 uses a per-partition chroma mode here
++ if (ctx_cfmt(s) == 3) {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
++ } else {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++ }
++ } else {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++
++ // split_transform_flag is only present in the bitstream when the size and
++ // depth limits leave a choice; in every other case it is inferred below.
++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
++ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
++ trafo_depth < lc->cu.max_trafo_depth &&
++ !(lc->cu.intra_split_flag && trafo_depth == 0))
++ {
++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
++ } else {
++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
++ lc->cu.pred_mode == MODE_INTER &&
++ lc->cu.part_mode != PART_2Nx2N &&
++ trafo_depth == 0;
++
++ // Inferred split: block too large, intra NxN root, or forced inter split
++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
++ (lc->cu.intra_split_flag && trafo_depth == 0) ||
++ inter_split;
++ }
++
++ // Chroma cbf flags are decoded at this level unless this is a size-2 node
++ // in a format other than 3, in which case the inherited bits are kept.
++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
++ {
++ // Chroma format 2 carries a second cbf per component (CB1/CR1 bits)
++ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
++ cbf_c1 = 0;
++
++ // A component's cbf is only coded if the parent's cbf for it was set
++ if ((cbf_c0 & CBF_CB0) != 0)
++ {
++ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
++ if (wants_c1)
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
++ }
++
++ if ((cbf_c0 & CBF_CR0) != 0)
++ {
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
++ if (wants_c1)
++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
++ }
++ }
++
++ if (split_transform_flag) {
++ const int trafo_size_split = 1 << (log2_trafo_size - 1);
++ const int x1 = x0 + trafo_size_split;
++ const int y1 = y0 + trafo_size_split;
++
++// Recurse into one quadrant, propagating any error straight to our caller
++#define SUBDIVIDE(x, y, idx) \
++do { \
++ ret = hls_transform_tree(s, lc, x, y, \
++ log2_trafo_size - 1, trafo_depth + 1, idx, \
++ cbf_c1); \
++ if (ret < 0) \
++ return ret; \
++} while (0)
++
++ // Quadrants are visited top-left, top-right, bottom-left, bottom-right
++ SUBDIVIDE(x0, y0, 0);
++ SUBDIVIDE(x1, y0, 1);
++ SUBDIVIDE(x0, y1, 2);
++ SUBDIVIDE(x1, y1, 3);
++
++#undef SUBDIVIDE
++ } else {
++ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
++ // trafo_size == 2 with depth == 0 the issue is moot
++ // cbf_luma is taken as 1 without decoding for a non-intra root node with
++ // no chroma cbf bits set; otherwise it is read from the bitstream.
++ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
++ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
++
++ // log2_trafo_size + trafo_depth is passed as the log2 CB size
++ ret = hls_transform_unit(s, lc, x0, y0,
++ log2_trafo_size + trafo_depth, log2_trafo_size,
++ blk_idx, cbf_luma, cbf_c1);
++ if (ret < 0)
++ return ret;
++
++ if (!s->sh.disable_deblocking_filter_flag) {
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
++ }
++ }
++ return 0;
++}
++
++
++// Unpacks a PCM coded block of cb_size x cb_size luma samples (plus the
++// matching chroma block) from `pcm` into the current frame.
++//
++// pcm    - start of the packed PCM sample data
++// length - size of that data in bits (passed to init_get_bits)
++// x0, y0 - luma position of the block in the frame
++//
++// Returns 0 on success or the negative error from init_get_bits().
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++    GetBitContext gb;
++    // Chroma geometry derived from the luma geometry via the plane 1 shifts
++    const int x0_c = x0 >> ctx_hshift(s, 1);
++    const int y0_c = y0 >> ctx_vshift(s, 1);
++    const int w_c = cb_size >> ctx_hshift(s, 1);
++    const int h_c = cb_size >> ctx_vshift(s, 1);
++    const int err = init_get_bits(&gb, pcm, length);
++
++    if (err < 0)
++        return err;
++
++    // Luma first, then chroma continuing from the same bit reader
++    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++                       frame_stride1(s->frame, 0),
++                       cb_size, cb_size,
++                       &gb, s->ps.sps->pcm.bit_depth);
++
++    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0_c, y0_c),
++                         s->frame->linesize[1],
++                         w_c, h_c,
++                         &gb, s->ps.sps->pcm.bit_depth_chroma);
++
++    return 0;
++}
++
++
++// Returns x * 4^y, i.e. x * 2^(2 * y)
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    const unsigned int shift = y + y;
++
++    return x << shift;
++}
++
++// Handles the sample data of a PCM coded block: skips the raw bytes in the
++// CABAC stream, copies them into the job's coefficient buffer and queues an
++// RPI_PRED_I_PCM command that refers to the copy.
++//
++// Returns 0.
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++    // Sample payload in bits: one luma plane followed by two chroma planes
++    const unsigned int bits_y = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size);
++    const unsigned int bits_c1 = xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1));
++    const unsigned int bits_c2 = xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++    const unsigned int length = bits_y + bits_c1 + bits_c2;
++    // Same payload rounded up to whole bytes
++    const unsigned int n_bytes = (length + 7) >> 3;
++    const int blen = n_bytes;
++
++    const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, n_bytes);
++    int16_t * coeffs;
++    uint8_t * eopcm;
++    intptr_t pad;
++    HEVCPredCmd * cmd;
++
++    if (!s->sh.disable_deblocking_filter_flag)
++        ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++
++    // The coeff buffer is allocated in int16_t units, so the byte count is
++    // rounded up to a multiple of 32 and then halved.  As PCM needs 1 byte
++    // per sample and the coeff buffer allows 2 per sample the rounding does
++    // not change the total coeff buffer size we need.
++    coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++    memcpy(coeffs, pcm, blen);
++
++    // The coeff stash assumes that any partially used 64-byte lump is
++    // zeroed, so clear from the end of our data up to the next 64-byte
++    // boundary.
++    eopcm = (uint8_t *)coeffs + blen;
++    pad = -(intptr_t)eopcm & 63;
++    if (pad != 0)
++        memset(eopcm, 0, pad);
++
++    // Queue the command that will unpack the stashed samples
++    cmd = rpi_new_intra_cmd(lc->jb0);
++    cmd->type = RPI_PRED_I_PCM;
++    cmd->size = log2_cb_size;
++    cmd->i_pcm.src = coeffs;
++    cmd->i_pcm.x = x0;
++    cmd->i_pcm.y = y0;
++    cmd->i_pcm.src_len = length;
++
++    return 0;
++}
++
++
++// Records how far down reference frame `ref` must be decoded before the
++// current job may run.  Nothing is recorded when s->threads_type is 0.
++//
++// The required row is the integer part of the vertical MV plus y0 + height,
++// plus a margin of 9, clamped to be non-negative.  Only the largest
++// requirement per reference (indexed by ref->dpb_no) is kept.
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
++                                const MvXY xy, const int y0, const int height)
++{
++    int16_t * pr;
++    int y;
++
++    if (s->threads_type == 0)
++        return;
++
++    y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
++
++    // The request lives in the current job rather than in lc because the
++    // actual wait happens in worker_core, which cannot use lc
++    pr = lc->jb0->progress_req + ref->dpb_no;
++    if (y > *pr)
++        *pr = y;
++}
++
++// Decodes the motion data of a non-merge prediction unit into *mv.
++//
++// For B slices the prediction direction is read first (P slices use the
++// PRED_L0 default).  Then, for each list in use, the reference index (only
++// when the slice has refs in that list), the MV difference and the MVP flag
++// are decoded, the predictor is derived by ff_hevc_rpi_luma_mv_mvp_mode()
++// and the difference is added to it.  mv->pred_flag accumulates PF_L0/PF_L1.
++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int nPbW,
++ const int nPbH,
++ HEVCRpiMvField * const mv)
++{
++ enum InterPredIdc inter_pred_idc = PRED_L0;
++ int mvp_flag;
++ // Availability flags are computed once and shared by both list derivations
++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
++
++ mv->pred_flag = 0;
++ if (s->sh.slice_type == HEVC_SLICE_B)
++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
++
++ // List 0 (PRED_L0 or PRED_BI)
++ if (inter_pred_idc != PRED_L1) {
++ MvXY mvd;
++
++ // ref_idx[0] is left as the caller set it when there are no L0 refs
++ if (s->sh.nb_refs[L0])
++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
++
++ mv->pred_flag = PF_L0;
++ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++ mv, mvp_flag, 0);
++ mv->xy[0] = mvxy_add(mv->xy[0], mvd);
++ }
++
++ // List 1 (PRED_L1 or PRED_BI)
++ if (inter_pred_idc != PRED_L0) {
++ MvXY mvd = 0;
++
++ if (s->sh.nb_refs[L1])
++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++
++ // The L1 MVD is not coded (stays 0) for bi-pred when mvd_l1_zero_flag is 1
++ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
++ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++
++ mv->pred_flag += PF_L1;
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++ mv, mvp_flag, 1);
++ mv->xy[1] = mvxy_add(mv->xy[1], mvd);
++ }
++}
++
++
++// Picks the least loaded queue of the current group (ipe->n_grp queues
++// starting at ipe->curr), adds load_val to its load, marks the group as used
++// and links function `fn` onto the queue's current command.
++//
++// Returns the chosen queue.
++static HEVCRpiInterPredQ *
++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
++{
++    HEVCRpiInterPredQ * const grp = ipe->q + ipe->curr;
++    const unsigned int fill_limit = ipe->max_fill;
++    HEVCRpiInterPredQ * best = NULL;
++    unsigned int best_load = UINT_MAX;
++    unsigned int i;
++
++    for (i = 0; i != ipe->n_grp; ++i) {
++        HEVCRpiInterPredQ * const cand = grp + i;
++        const unsigned int fill = (char *)cand->qpu_mc_curr - (char *)cand->qpu_mc_base;
++        unsigned int cost;
++
++        // There is always enough room between the Qs, but if a Q is running
++        // critically full due to poor scheduling then rank it by fill size
++        // (with a large penalty) instead of by load.  This has obvious dire
++        // performance implications but (a) it is better than crashing and
++        // (b) it should (almost) never happen
++        if (fill > fill_limit)
++            cost = fill + 0x1000000;
++        else
++            cost = cand->load;
++
++        // Strict compare: the earliest Q wins ties
++        if (cost < best_load) {
++            best_load = cost;
++            best = cand;
++        }
++    }
++
++    best->load += load_val;
++    ipe->used_grp = 1;
++    qpu_mc_link_set(best->qpu_mc_curr, fn);
++
++    return best;
++}
++
++
++// Appends a sync command (q->code_sync) to every queue in ipe and re-seeds
++// each queue's load from its current fill.
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++    unsigned int i = 0;
++
++    while (i != ipe->n) {
++        HEVCRpiInterPredQ * const q = &ipe->q[i];
++        // Bytes used in this Q before the sync command is added
++        const unsigned int bytes_used = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++        qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
++        // Step past the sync command just written
++        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
++        // Have a mild preference for emptier Qs to balance memory usage
++        q->load = (bytes_used >> 7);
++
++        ++i;
++    }
++}
++
++// Advances to the next group of queues once the current group has been used.
++// When the advance runs past the last queue it wraps to group 0 and emits a
++// sync on every queue.
++//
++// Always returns 0.
++// We no longer check for Q fullness as we have emergency code in ctu alloc
++// * However it might be an idea to have some means of spotting that we've used it
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++    if (ipe->used_grp)
++    {
++        ipe->curr += ipe->n_grp;
++        if (ipe->curr >= ipe->n)
++        {
++            // Wrapped: restart at the first group and sync all Qs
++            ipe->curr = 0;
++            rpi_inter_pred_sync(ipe);
++        }
++
++        ipe->used = 1;
++        ipe->used_grp = 0;
++    }
++
++    return 0;
++}
++
++// Returns ipe to its empty state: group cursor and usage flags cleared, and
++// every queue rewound to its base with zero load and no pending source slots.
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++    unsigned int qn = 0;
++
++    ipe->used_grp = 0;
++    ipe->used = 0;
++    ipe->curr = 0;
++
++    while (qn != ipe->n) {
++        HEVCRpiInterPredQ * const q = &ipe->q[qn];
++
++        q->qpu_mc_curr = q->qpu_mc_base;
++        q->load = 0;
++        q->last_l0 = NULL;
++        q->last_l1 = NULL;
++        ++qn;
++    }
++}
++
++// Zeroes *ipe and allocates its resources: an array of n_max queue
++// descriptors and a cached GPU buffer of total_size bytes (ipe->gptr).
++// n_grp and min_gap are stored in ipe.
++//
++// Returns 0 on success, AVERROR(ENOMEM) if the descriptor array cannot be
++// allocated, or the non-zero result of gpu_malloc_cached() (in which case
++// the descriptor array is freed again).
++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++                                const unsigned int n_max, const unsigned int n_grp,
++                                const unsigned int total_size, const unsigned int min_gap)
++{
++    int rv;
++
++    memset(ipe, 0, sizeof(*ipe));
++
++    ipe->q = av_mallocz(n_max * sizeof(*ipe->q));
++    if (ipe->q == NULL)
++        return AVERROR(ENOMEM);
++
++    ipe->min_gap = min_gap;
++    ipe->n_grp = n_grp;
++
++    rv = gpu_malloc_cached(total_size, &ipe->gptr);
++    if (rv != 0)
++        av_freep(&ipe->q);  // don't leak the descriptors on GPU alloc failure
++
++    return rv;
++}
++
++
++// Frame base address used for luma motion-compensation commands:
++// the plain data[0] pointer when RPI_QPU_EMU_Y is set, otherwise whatever
++// get_vc_address_y() returns for the frame.
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++// Same selection for chroma commands, keyed on RPI_QPU_EMU_C and using
++// data[1] / get_vc_address_u().
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
++
++// Packs a uni-pred weight/offset pair with PACK2(): the first field is
++// off * 2 + 1, the second is mul.
++static inline uint32_t pack_wo_p(const int off, const int mul)
++{
++    const int off_term = off * 2 + 1;
++
++    return PACK2(off_term, mul);
++}
++
++// Packs a bi-pred weight/offset pair with PACK2(): the first field is
++// off0 + off1 + 1, the second is mul.
++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
++{
++    const int off_term = off0 + off1 + 1;
++
++    return PACK2(off_term, mul);
++}
++
++
++// Queues the luma commands for a uni-directionally predicted block on
++// jb->luma_ip.
++//
++// x0, y0, nPbW, nPbH - destination block position and size in s->frame
++// mv_xy              - motion vector; the low 2 bits of each component are
++//                      the fractional part, the rest the integer part
++// weight_mul/offset  - weighted prediction parameters, packed by pack_wo_p()
++// src_frame          - reference frame to predict from
++//
++// If both fractional parts are zero the block is emitted in strips of up to
++// 16 columns using s->qpu.y_p00.  Otherwise s->qpu.y_pxx is used with the
++// source origin moved 3 samples up and left, and each command carries two
++// source slots (the second is a dummy when the strip is at most 8 wide).
++//
++// Each command fills in the source slot(s) left open by the previous command
++// on the same Q (yp->last_l0 / yp->last_l1) and then publishes its own
++// next_src slots for the following command.
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const MvXY mv_xy,
++ const int weight_mul,
++ const int weight_offset,
++ AVFrame *const src_frame)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ // Fractional (quarter-pel) parts of the MV
++ const unsigned int mx = MV_X(mv_xy) & 3;
++ const unsigned int my = MV_Y(mv_xy) & 3;
++ const unsigned int my_mx = (my << 8) | mx;
++ // Same fraction duplicated into both halves (one per 8-wide source)
++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++ const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++ if (my_mx == 0)
++ {
++ // Integer MV: no filter run-up needed
++ const int x1 = x0 + (MV_X(mv_xy) >> 2);
++ const int y1 = y0 + (MV_Y(mv_xy) >> 2);
++ const int bh = nPbH;
++
++ for (int start_x = 0; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ ++ts->y_pred1_x0y0;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++
++ // Fill the source slot left open by the previous command on this Q
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src_vc_address_y;
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->wo1 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ // Publish our own slot and advance the Q past this command
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Fractional MV: source origin is moved 3 samples up and left
++ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
++ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
++ const unsigned int bh = nPbH;
++ int start_x = 0;
++
++#if 1
++ // As Y-pred operates on two independent 8-wide src blocks we can merge
++ // this pred with the previous one if the previous one is 8 pel wide,
++ // the same height as the current block, immediately to the left of our
++ // current dest block and mono-pred.
++
++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++ {
++ const int bw = FFMIN(nPbW, 8);
++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++ // Replace the previous command's second source with ours
++ last_y8_src2->x = x1_m3;
++ last_y8_src2->y = y1_m3;
++ last_y8_src2->base = src_vc_address_y;
++ last_y8_p->w += bw;
++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++ last_y8_p->wo2 = wo;
++
++ // A command can only be merged into once
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++ // The first bw columns are now covered by the merged command
++ start_x = bw;
++#if RPI_TSTATS
++ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
++#endif
++ }
++#endif
++
++ for (; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ if (mx == 0 && my == 0)
++ ++ts->y_pred1_x0y0;
++ else if (mx == 0)
++ ++ts->y_pred1_x0;
++ else if (my == 0)
++ ++ts->y_pred1_y0;
++ else
++ ++ts->y_pred1_xy;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++ src1->x = x1_m3 + start_x;
++ src1->y = y1_m3;
++ src1->base = src_vc_address_y;
++ if (bw <= 8)
++ {
++ // Only one 8-wide half is needed: point the second source at the dummy
++ src2->x = MC_DUMMY_X;
++ src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++ src2->base = s->qpu_dummy_frame_emu;
++#else
++ src2->base = s->qpu_dummy_frame_qpu;
++#endif
++ }
++ else
++ {
++ // Second source covers the right-hand 8 columns of the strip
++ src2->x = x1_m3 + start_x + 8;
++ src2->y = y1_m3;
++ src2->base = src_vc_address_y;
++ }
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo;
++ cmd_y->wo2 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++ // Remember an exactly-8-wide command so a later block can merge into it
++ if (bw == 8) {
++ jb->last_y8_l1 = src2;
++ jb->last_y8_p = cmd_y;
++ }
++ }
++ }
++}
++
++// Queues the luma commands for a bi-directionally predicted block on
++// jb->luma_ip.
++//
++// x0, y0, nPbW, nPbH - destination block position and size in s->frame
++// mv_field           - supplies both MVs (xy[0], xy[1]) and both ref indices
++// src_frame          - reference frame used with xy[0]
++// src_frame2         - reference frame used with xy[1]
++//
++// Weights and offsets come from the slice header tables, indexed by the two
++// ref indices.  If all four MV fractions are zero, s->qpu.y_b00 is used in
++// strips of up to 16 columns; otherwise s->qpu.y_bxx is used in strips of up
++// to 8 columns with both source origins moved 3 samples up and left.
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const struct HEVCRpiMvField *const mv_field,
++ const AVFrame *const src_frame,
++ const AVFrame *const src_frame2)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const MvXY mv = mv_field->xy[0];
++ const MvXY mv2 = mv_field->xy[1];
++
++ // Fractional (quarter-pel) parts of both MVs, packed as my2:mx2:my:mx
++ const unsigned int mx = MV_X(mv) & 3;
++ const unsigned int my = MV_Y(mv) & 3;
++ const unsigned int my_mx = (my<<8) | mx;
++ const unsigned int mx2 = MV_X(mv2) & 3;
++ const unsigned int my2 = MV_Y(mv2) & 3;
++ const unsigned int my2_mx2 = (my2<<8) | mx2;
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
++ const unsigned int ref_idx0 = mv_field->ref_idx[0];
++ const unsigned int ref_idx1 = mv_field->ref_idx[1];
++ // Both packed words share the summed L0 + L1 offset; they differ in weight
++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
++
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++
++ if (my2_mx2_my_mx == 0)
++ {
++ // Both MVs are integer: no filter run-up needed
++ const int x1 = x0 + (MV_X(mv) >> 2);
++ const int y1 = y0 + (MV_Y(mv) >> 2);
++ const int x2 = x0 + (MV_X(mv2) >> 2);
++ const int y2 = y0 + (MV_Y(mv2) >> 2);
++ const int bh = nPbH;
++
++ // Can do chunks a full 16 wide if we don't want the H filter
++ for (int start_x=0; start_x < nPbW; start_x += 16)
++ {
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ ++ts->y_pred2_x0y0;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ // Fill the source slots left open by the previous command on this Q
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 16);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = 0;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ // Publish our own slots and advance the Q past this command
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Filter requires a run-up of 3
++ const int x1 = x0 + (MV_X(mv) >> 2) - 3;
++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
++ const int bh = nPbH;
++
++ for (int start_x=0; start_x < nPbW; start_x += 8)
++ { // B blocks work 8 at a time
++ // B weights aren't doubled as the QPU code does the same
++ // amount of work as it does for P
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++ const unsigned int mmx = mx | mx2;
++ const unsigned int mmy = my | my2;
++ if (mmx == 0 && mmy == 0)
++ ++ts->y_pred2_x0y0;
++ else if (mmx == 0)
++ ++ts->y_pred2_x0;
++ else if (mmy == 0)
++ ++ts->y_pred2_y0;
++ else
++ ++ts->y_pred2_xy;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 8);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++}
++
++// Queues the chroma commands for a uni-directionally predicted block on
++// jb->chroma_ip.
++//
++// lx                 - reference list (0 or 1); selects the QPU function and
++//                      which per-Q source slot (last_l0 / last_l1) is chained
++// x0_c, y0_c         - destination position in chroma samples
++// nPbW_c, nPbH_c     - block size in chroma samples
++// mv                 - motion vector (luma quarter-pel units)
++// c_weights/offsets  - [0] is U, [1] is V weighted prediction parameters
++// src_frame          - reference frame to predict from
++//
++// The block is emitted in strips of up to RPI_CHROMA_BLOCK_WIDTH columns.
++//
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++           const unsigned int lx, const int x0_c, const int y0_c,
++           const int nPbW_c, const int nPbH_c,
++           const MvXY mv,
++           const int16_t * const c_weights,
++           const int16_t * const c_offsets,
++           AVFrame * const src_frame)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // = s->ps.sps->hshift[1];
++    const int vshift = 1; // = s->ps.sps->vshift[1];
++
++    // Integer part of the MV in chroma samples, less 1 for the filter run-up.
++    // The vertical component must use vshift (it previously used hshift,
++    // which only gave the right answer because both are fixed at 1).
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
++    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
++    // Filter coefficients selected by the fractional part of the MV
++    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
++    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
++    const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
++    const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
++    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    // One more shift than luma when converting an x offset to an address
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++    const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
++
++    for (int start_x = 0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++    {
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
++        qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
++        qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
++        qpu_mc_src_t * const last_lx = *plast_lx;
++        const int bw = FFMIN(nPbW_c - start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        // Fill the source slot left open by the previous command on this Q
++        last_lx->x = x1_c + start_x;
++        last_lx->y = y1_c;
++        last_lx->base = src_base_u;
++        cmd_c->h = bh;
++        cmd_c->w = bw;
++        cmd_c->coeffs_x = x_coeffs;
++        cmd_c->coeffs_y = y_coeffs;
++        cmd_c->wo_u = wo_u;
++        cmd_c->wo_v = wo_v;
++        cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
++        // Publish our own slot and advance the Q past this command
++        *plast_lx = &cmd_c->next_src;
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
++    }
++}
++
++// Queues the chroma commands for a bi-directionally predicted block on
++// jb->chroma_ip.
++//
++// x0_c, y0_c          - destination position in chroma samples
++// nPbW_c, nPbH_c      - block size in chroma samples
++// mv_field            - supplies both MVs (xy[0] for L0, xy[1] for L1)
++// c_weights/offsets   - L0 parameters; [0] is U, [1] is V
++// c_weights2/offsets2 - L1 parameters; [0] is U, [1] is V
++// src_frame           - reference frame used with xy[0]
++// src_frame2          - reference frame used with xy[1]
++//
++// The block is emitted in strips of up to RPI_CHROMA_BLOCK_WIDTH columns
++// using s->qpu.c_bxx.
++//
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++             const int x0_c, const int y0_c,
++             const int nPbW_c, const int nPbH_c,
++             const struct HEVCRpiMvField * const mv_field,
++             const int16_t * const c_weights,
++             const int16_t * const c_offsets,
++             const int16_t * const c_weights2,
++             const int16_t * const c_offsets2,
++             AVFrame * const src_frame,
++             AVFrame * const src_frame2)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // s->ps.sps->hshift[1];
++    const int vshift = 1; // s->ps.sps->vshift[1];
++    const MvXY mv = mv_field->xy[0];
++    const MvXY mv2 = mv_field->xy[1];
++
++    // L0: fractional part selects the filter, integer part (less 1 for the
++    // filter run-up) gives the source origin.  The vertical origin must use
++    // vshift (it previously used hshift, which only gave the right answer
++    // because both are fixed at 1).
++    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
++    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
++    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)];
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1;
++
++    // L1: same derivation from the second MV
++    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
++    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
++    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)];
++    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
++    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1;
++
++    // Summed L0 + L1 offsets packed with the L1 weights
++    const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]);
++    const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
++
++    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
++    const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    // One more shift than luma when converting an x offset to an address
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++
++    for (int start_x = 0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++    {
++        const unsigned int bw = FFMIN(nPbW_c - start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
++        qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
++        qpu_mc_src_t * const src_l0 = cp->last_l0;
++        qpu_mc_src_t * const src_l1 = cp->last_l1;
++
++        // Fill the source slots left open by the previous command on this Q
++        src_l0->x = x1_c + start_x;
++        src_l0->y = y1_c;
++        src_l0->base = src1_base;
++        src_l1->x = x2_c + start_x;
++        src_l1->y = y2_c;
++        src_l1->base = src2_base;
++
++        u[0].h = bh;
++        u[0].w = bw;
++        u[0].coeffs_x1 = coefs0_x;
++        u[0].coeffs_y1 = coefs0_y;
++        u[0].weight_u1 = c_weights[0]; // Weight L0 U
++        u[0].weight_v1 = c_weights[1]; // Weight L0 V
++        u[0].coeffs_x2 = coefs1_x;
++        u[0].coeffs_y2 = coefs1_y;
++        u[0].wo_u2 = wo_u2;
++        u[0].wo_v2 = wo_v2;
++        u[0].dst_addr_c = dst_base_u + (start_x << xshl);
++
++        // Publish our own slots and advance the Q past this command
++        cp->last_l0 = &u[0].next_src1;
++        cp->last_l1 = &u[0].next_src2;
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++    }
++}
++
++
++// Records the motion of a prediction block in the current picture's
++// collocated MV array (s->ref->col_mvf), which holds one entry per 16x16
++// block.  Only 16x16 cells whose top-left corner lies inside the block
++// [x0, x0 + w0) x [y0, y0 + h0) are written; nothing happens if there is no
++// array or the block covers no such corner.
++static inline void
++col_stash(const HEVCRpiContext * const s,
++          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
++          const HEVCRpiMvField * const mvf)
++{
++    ColMvField * const col_mvf = s->ref->col_mvf;
++    // First 16x16 cell with its corner at or after (x0, y0), and the count
++    // of such cells in each direction
++    const unsigned int x = (x0 + 15) >> 4;
++    const unsigned int y = (y0 + 15) >> 4;
++    const unsigned int w = ((x0 + 15 + w0) >> 4) - x;
++    const unsigned int h = ((y0 + 15 + h0) >> 4) - y;
++
++    if (col_mvf != NULL && w != 0 && h != 0)
++    {
++        const RefPicList * const rpl = s->refPicList;
++        const int uses_l0 = (mvf->pred_flag & PF_L0) != 0;
++        const int uses_l1 = (mvf->pred_flag & PF_L1) != 0;
++        // A list that is not used by this block is recorded as intra
++        const ColMvField cmv = {
++            .L = {
++                {
++                    .poc = uses_l0 ?
++                        COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]) :
++                        COL_POC_INTRA,
++                    .xy = mvf->xy[0]
++                },
++                {
++                    .poc = uses_l1 ?
++                        COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]) :
++                        COL_POC_INTRA,
++                    .xy = mvf->xy[1]
++                }
++            }
++        };
++
++        ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
++        // Distance from the end of one written row to the start of the next
++        const unsigned int row_skip = s->col_mvf_stride - w;
++        unsigned int row, col;
++
++        for (row = 0; row != h; ++row)
++        {
++            for (col = 0; col != w; ++col)
++                *p++ = cmv;
++            p += row_skip;
++        }
++    }
++}
++
++// Parses one inter prediction unit and queues its motion compensation.
++//
++// Steps:
++//  1. Decode merge_flag (unless the CU is skipped) and derive the motion,
++//     either through merge mode or through explicit MVP coding.
++//  2. Store the motion in the MV stash (one entry per minimum-size PU) and,
++//     via col_stash(), in the collocated MV array.
++//  3. Look up the reference frame(s); return without predicting if a
++//     required reference is missing, otherwise register the rows needed.
++//  4. Queue luma and (when the chroma format is non-zero) chroma commands
++//     for L0-only, L1-only or bi-prediction.
++//
++// x0, y0, nPbW, nPbH - PU position and size in luma samples
++// log2_cb_size       - log2 size of the enclosing coding block
++// partIdx            - partition index, forwarded to merge mode derivation
++// idx                - not referenced in this function
++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++{
++ HEVCRpiJob * const jb = lc->jb0;
++
++ // Zero-initialised so ref indices that are never decoded read as 0
++ struct HEVCRpiMvField current_mv = {{0}};
++ const RefPicList *const refPicList = s->refPicList;
++ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
++
++ // merge_flag is not coded for skipped CUs, which always use merge mode
++ if (lc->cu.pred_mode != MODE_SKIP)
++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
++
++ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
++ // merge_idx is only coded when there is more than one candidate
++ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 :
++ ff_hevc_rpi_merge_idx_decode(s, lc);
++
++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ partIdx, merge_idx, &current_mv);
++ } else {
++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
++ }
++
++ // Copy the motion into every minimum-size PU cell covered by this PU
++ {
++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++ unsigned int i, j;
++
++ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
++ {
++ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
++ p[i] = current_mv;
++ p += MVF_STASH_WIDTH_PU;
++ }
++ }
++
++ col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
++
++ // Resolve the reference frames; a missing reference silently drops the
++ // prediction for this PU (the motion has already been stashed above)
++ if (current_mv.pred_flag & PF_L0) {
++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
++ if (!ref0)
++ return;
++ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
++ }
++ if (current_mv.pred_flag & PF_L1) {
++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
++ if (!ref1)
++ return;
++ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
++ }
++
++ if (current_mv.pred_flag == PF_L0) {
++ // Uni-prediction from list 0
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++
++ // No chroma commands when the chroma format is 0
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_L1) {
++ // Uni-prediction from list 1
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_BI) {
++ // Bi-prediction from both lists
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
++ &current_mv,
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref0->frame,
++ ref1->frame);
++ return;
++ }
++ }
++}
++
++// Records intra prediction mode `ipm` for the coding block at (x0, y0) in the
++// left/up mode stashes and, for an intra block in a non-IRAP picture, also
++//  - sets the block's bits in the s->is_intra bitmap (if there is one),
++//  - zeroes the block's entries in the MV stash, and
++//  - marks the collocated MV entries it owns as intra.
++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                    const unsigned int x0, const unsigned int y0,
++                    const unsigned int log2_cb_size,
++                    const unsigned int ipm)
++{
++    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
++    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
++    // Mask giving the position within the CTB in minimum-PU units
++    const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
++
++    set_stash2(lc->ipm_left + (y_pu & ctb_mask),
++               lc->ipm_up + (x_pu & ctb_mask),
++               log2_cb_size - LOG2_MIN_PU_SIZE,
++               ipm);
++
++    // If IRAP then everything is Intra & we avoid ever looking at the
++    // remaining stashes so don't bother setting them
++    if (s->is_irap || lc->cu.pred_mode != MODE_INTRA)
++        return;
++
++    if (s->is_intra != NULL)
++    {
++        set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width,
++                 x0 >> LOG2_MIN_CU_SIZE,
++                 s->ps.sps->pcm_width,
++                 log2_cb_size - LOG2_MIN_CU_SIZE);
++    }
++
++    // Clear the MV stash rows covered by this block
++    {
++        HEVCRpiMvField * row = mvf_stash_ptr(s, lc, x0, y0);
++        const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
++        unsigned int rows_left = size_in_pus;
++
++        do
++        {
++            memset(row, 0, size_in_pus * sizeof(*row));
++            row += MVF_STASH_WIDTH_PU;
++        } while (--rows_left != 0);
++    }
++
++    // Only record top left stuff
++    // Blocks should always be aligned on size boundaries
++    // so cannot have overflow from a small block
++    if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
++    {
++        ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
++        // Block size in 16x16 cells; always at least 1
++        const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
++        const unsigned int row_skip = s->col_mvf_stride - size_in_col;
++        unsigned int row, col;
++
++        for (row = 0; row != size_in_col; ++row)
++        {
++            for (col = 0; col != size_in_col; ++col, ++p)
++            {
++                p->L[0].poc = COL_POC_INTRA;
++                p->L[0].xy = 0;
++                p->L[1].poc = COL_POC_INTRA;
++                p->L[1].xy = 0;
++            }
++            p += row_skip;
++        }
++    }
++}
++
++// Store the default intra prediction mode (INTRA_DC) for the block at
++// (x0, y0) with side 1 << log2_cb_size by forwarding to set_ipm().
++static inline void intra_prediction_unit_default_value(
++    const HEVCRpiContext * const s,
++    HEVCRpiLocalContext * const lc,
++    const unsigned int x0,
++    const unsigned int y0,
++    const unsigned int log2_cb_size)
++{
++    set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
++}
++
++
++/**
++ * 8.4.1
++ *
++ * Derive the luma intra prediction mode of one PU.
++ *
++ * Three candidate modes are built from the left and up neighbour stashes
++ * held in lc.  When prev_intra_luma_pred_flag is set, idx selects one of
++ * the candidates directly; otherwise idx is stepped past each candidate
++ * (in ascending order) that it reaches.  The result is stored with
++ * set_ipm() and returned.
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                int x0, int y0, int log2_pu_size,
++                                int prev_intra_luma_pred_flag,
++                                const unsigned int idx)
++{
++    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++
++    // Up does not cross boundaries so as we always scan 1 slice-tile-line in
++    // an lc we can just keep 1 CTB of stashes
++    // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
++    const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
++    const unsigned int cand_left = lc->ipm_left[yb_pu];
++
++    unsigned int mode;
++    unsigned int mpm[3];
++
++    if (cand_left != cand_up) {
++        // Distinct neighbours: both are candidates, the third is the first
++        // of PLANAR / DC / ANGULAR_26 that neither neighbour uses
++        mpm[0] = cand_left;
++        mpm[1] = cand_up;
++        if (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR)
++            mpm[2] = INTRA_PLANAR;
++        else if (cand_left != INTRA_DC && cand_up != INTRA_DC)
++            mpm[2] = INTRA_DC;
++        else
++            mpm[2] = INTRA_ANGULAR_26;
++    } else if (cand_left < 2) {
++        // Identical non-angular neighbours
++        mpm[0] = INTRA_PLANAR;
++        mpm[1] = INTRA_DC;
++        mpm[2] = INTRA_ANGULAR_26;
++    } else {
++        // Identical angular neighbours: the mode and its two adjacent
++        // angular modes (wrapping within the 32 values starting at 2)
++        mpm[0] = cand_left;
++        mpm[1] = 2 + ((cand_left - 2 - 1 + 32) & 31);
++        mpm[2] = 2 + ((cand_left - 2 + 1) & 31);
++    }
++
++    if (prev_intra_luma_pred_flag) {
++        switch (idx) {
++        case 0:
++            mode = mpm[0];
++            break;
++        case 1:
++            mode = mpm[1];
++            break;
++        default:
++            mode = mpm[2];
++            break;
++        }
++    } else {
++        unsigned int k;
++
++        // Sort lowest 1st
++        if (mpm[0] > mpm[1])
++            FFSWAP(int, mpm[0], mpm[1]);
++        if (mpm[0] > mpm[2])
++            FFSWAP(int, mpm[0], mpm[2]);
++        if (mpm[1] > mpm[2])
++            FFSWAP(int, mpm[1], mpm[2]);
++
++        // Step over every candidate that the running value reaches
++        mode = idx;
++        for (k = 0; k != 3; ++k) {
++            if (mode >= mpm[k])
++                mode++;
++        }
++    }
++
++    /* write the intra prediction units into the mv array */
++    set_ipm(s, lc, x0, y0, log2_pu_size, mode);
++    return mode;
++}
++
++// Remapping table (35 entries, indexed by mode 0..34) applied by
++// intra_prediction_unit() to the derived chroma mode when ctx_cfmt(s) == 2.
++static const uint8_t tab_mode_idx[] = {
++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20,
++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
++
++// Decode the intra prediction modes of the CU at (x0, y0).
++// First reads one prev_intra_luma_pred_flag per PU (4 PUs for PART_NxN,
++// otherwise 1), then for each PU reads either mpm_idx or
++// rem_intra_luma_pred_mode and derives the luma mode with
++// luma_intra_pred_mode().
++// Chroma modes are then decoded according to ctx_cfmt(s): one per PU when it
++// is 3, a single one remapped through tab_mode_idx when it is 2, a single one
++// for any other non-zero value, and none when it is 0.
++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_cb_size)
++{
++ // Chroma mode for decoded intra_chroma_pred_mode values 0..3 (4 = same as luma)
++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
++ uint8_t prev_intra_luma_pred_flag[4];
++ int split = lc->cu.part_mode == PART_NxN;
++ const unsigned int split_size = (1 << (log2_cb_size - 1));
++ int chroma_mode;
++ const unsigned int n = split ? 4 : 1;
++ unsigned int i;
++
++ // All flags are read before any of the per-PU mode syntax elements
++ for (i = 0; i != n; i++)
++ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
++
++ for (i = 0; i < n; i++) {
++ // depending on mode idx is mpm or luma_pred_mode
++ const unsigned int idx = prev_intra_luma_pred_flag[i] ?
++ ff_hevc_rpi_mpm_idx_decode(lc) :
++ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
++
++ // Bit 0 of i selects the right half, bit 1 the lower half
++ lc->pu.intra_pred_mode[i] =
++ luma_intra_pred_mode(s, lc,
++ x0 + ((i & 1) == 0 ? 0 : split_size),
++ y0 + ((i & 2) == 0 ? 0 : split_size),
++ log2_cb_size - split,
++ prev_intra_luma_pred_flag[i], idx);
++ }
++
++ if (ctx_cfmt(s) == 3) {
++ for (i = 0; i < n; i++) {
++ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ // If the table mode equals the luma mode then mode 34 is used instead
++ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[i] = 34;
++ else
++ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
++ }
++ }
++ } else if (ctx_cfmt(s) == 2) {
++ int mode_idx;
++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ mode_idx = 34;
++ else
++ mode_idx = intra_chroma_table[chroma_mode];
++ } else {
++ mode_idx = lc->pu.intra_pred_mode[0];
++ }
++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
++ } else if (ctx_cfmt(s) != 0) {
++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[0] = 34;
++ else
++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
++ }
++ }
++}
++
++// Decode one coding unit at (x0, y0) with side 1 << log2_cb_size.
++// Handles the skip, PCM, intra and inter paths, then either the transform
++// tree or (when there is no residual) the deblocking boundary strengths, and
++// finally records the QP and the depth/skip stash entries for the CU.
++// Returns 0 on success or the negative value returned by hls_pcm_sample() /
++// hls_transform_tree().
++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
++{
++ const unsigned int cb_size = 1 << log2_cb_size;
++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
++ const unsigned int x_cb = x0 >> log2_min_cb_size;
++ const unsigned int y_cb = y0 >> log2_min_cb_size;
++ const unsigned int idx = log2_cb_size - 2;
++ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++ int skip_flag = 0;
++
++ lc->cu.x = x0;
++ lc->cu.y = y0;
++ lc->cu.x_split = x0;
++ lc->cu.y_split = y0;
++
++ // Defaults until the bitstream says otherwise: intra, 2Nx2N, no split,
++ // no transquant bypass, all four luma modes set to 1
++ lc->cu.pred_mode = MODE_INTRA;
++ lc->cu.part_mode = PART_2Nx2N;
++ lc->cu.intra_split_flag = 0;
++ lc->cu.cu_transquant_bypass_flag = 0;
++ lc->pu.intra_pred_mode[0] = 1;
++ lc->pu.intra_pred_mode[1] = 1;
++ lc->pu.intra_pred_mode[2] = 1;
++ lc->pu.intra_pred_mode[3] = 1;
++
++ if (s->ps.pps->transquant_bypass_enable_flag) {
++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
++ if (lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ }
++
++ // The skip flag is only present outside I slices
++ if (s->sh.slice_type != HEVC_SLICE_I) {
++ lc->cu.pred_mode = MODE_INTER;
++ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
++ }
++
++ if (skip_flag) {
++ lc->cu.pred_mode = MODE_SKIP;
++
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++ } else {
++ int pcm_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I)
++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
++ // part_mode is only read for non-intra CUs or intra CUs of minimum size
++ if (lc->cu.pred_mode != MODE_INTRA ||
++ log2_cb_size == s->ps.sps->log2_min_cb_size) {
++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
++ lc->cu.pred_mode == MODE_INTRA;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ if (lc->cu.part_mode == PART_2Nx2N &&
++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled
++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
++ ff_hevc_rpi_pcm_flag_decode(lc) != 0)
++ {
++ int ret;
++ pcm_flag = 1;
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
++ return ret;
++
++ if (s->ps.sps->pcm.loop_filter_disable_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ } else {
++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
++ }
++ } else {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ // One hls_prediction_unit() call per partition; x_split / y_split
++ // are moved to the start of the later partition(s)
++ switch (lc->cu.part_mode) {
++ case PART_2Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ break;
++ case PART_2NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
++ break;
++ case PART_Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
++ lc->cu.x_split = x0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
++ break;
++ case PART_2NxnU:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 4;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
++ break;
++ case PART_2NxnD:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
++ lc->cu.y_split = y0 + cb_size / 4 * 3;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_nLx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
++ lc->cu.x_split = x0 + cb_size / 4;
++ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_nRx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
++ lc->cu.x_split = x0 + cb_size / 4 * 3;
++ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++ lc->cu.x_split = x0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++ lc->cu.y_split = y0 + cb_size / 2;
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++ break;
++ }
++ }
++
++ if (!pcm_flag) {
++ int rqt_root_cbf = 1;
++
++ // The flag is only read for non-intra CUs that are not 2Nx2N merge
++ if (lc->cu.pred_mode != MODE_INTRA &&
++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++ }
++ if (rqt_root_cbf) {
++ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
++ int ret;
++
++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++ s->ps.sps->max_transform_hierarchy_depth_inter;
++ // transform_tree does deblock_boundary_strengths
++ ret = hls_transform_tree(s, lc, x0, y0,
++ log2_cb_size, 0, 0, cbf_c);
++ if (ret < 0)
++ return ret;
++ } else {
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++ }
++ }
++ }
++
++ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
++ if (lc->tu.is_cu_qp_delta_wanted)
++ ff_hevc_rpi_set_qPy(s, lc, x0, y0);
++
++ // Update the QP predictor when this CU ends on a QP-delta block boundary
++ // in both directions
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
++ lc->qPy_pred = lc->qp_y;
++ }
++
++ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
++
++ // Stash (ct_depth << 1) | skip_flag for this CU in the up / left CABAC stashes
++ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
++
++ return 0;
++}
++
++// Recursively decode the coding quadtree rooted at (x0, y0) with side
++// 1 << log2_cb_size, decoding a coding unit at each leaf.
++// Returns:
++// < 0 Error
++// 0 No more data: a leaf read end_of_slice_flag as set, or (split case)
++// the block reaches both the right and bottom picture edges
++// 1 More data follows
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int log2_cb_size, const unsigned int cb_depth)
++{
++ const int cb_size = 1 << log2_cb_size;
++ int ret;
++ int split_cu;
++
++ lc->ct_depth = cb_depth;
++ // A block above minimum size that crosses the picture edge is split
++ // without reading a flag; one fully inside the picture reads the flag
++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
++ if (x0 + cb_size <= s->ps.sps->width &&
++ y0 + cb_size <= s->ps.sps->height &&
++ split_cu)
++ {
++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++ }
++
++ // Qp delta (and offset) need to remain wanted if cb_size < min until
++ // a coded block is found so we still initial state at depth 0 (outside
++ // this fn) and only reset here
++ if (s->ps.pps->cu_qp_delta_enabled_flag &&
++ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++ {
++ lc->tu.is_cu_qp_delta_wanted = 1;
++ lc->tu.cu_qp_delta = 0;
++ }
++ if (s->sh.cu_chroma_qp_offset_enabled_flag &&
++ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++ {
++ lc->tu.cu_chroma_qp_offset_wanted = 1;
++ }
++
++ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
++ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
++ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
++
++ if (split_cu) {
++ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++ const int cb_size_split = cb_size >> 1;
++ const int x1 = x0 + cb_size_split;
++ const int y1 = y0 + cb_size_split;
++
++ int more_data = 0;
++
++ // Children are visited top-left, top-right, bottom-left, bottom-right;
++ // those starting outside the picture are not decoded, and decoding
++ // stops once a child reports no more data
++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++
++ if (more_data && x1 < s->ps.sps->width) {
++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && x1 < s->ps.sps->width &&
++ y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
++ lc->qPy_pred = lc->qp_y;
++
++ if (more_data)
++ return ((x1 + cb_size_split) < s->ps.sps->width ||
++ (y1 + cb_size_split) < s->ps.sps->height);
++ else
++ return 0;
++ } else {
++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++ if (ret < 0)
++ return ret;
++ // end_of_slice_flag is only read when this CU finishes its CTB, i.e.
++ // it ends on a CTB boundary or the picture edge in both x and y
++ if ((!((x0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (x0 + cb_size >= s->ps.sps->width)) &&
++ (!((y0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (y0 + cb_size >= s->ps.sps->height))) {
++ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++ return !end_of_slice_flag;
++ } else {
++ return 1;
++ }
++ }
++
++ return 0; // NEVER
++}
++
++// Set up the per-CTB neighbour state in lc for the CTB at (x_ctb, y_ctb):
++// records the owning slice address for this CTB, the CTB end coordinates
++// clipped to the picture size, the tile / slice boundary flags and the
++// availability mask of the neighbouring CTBs.
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{
++    const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++    const unsigned int rs_addr = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    // slice_addr = RS addr of start of slice
++    const unsigned int rs_in_slice = rs_addr - s->sh.slice_addr;
++    const unsigned int ts_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++    const unsigned int ctbs_per_line = s->ps.sps->ctb_width;
++    unsigned int avail = 0;
++
++    s->tab_slice_address[rs_addr] = s->sh.slice_addr;
++
++    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
++    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++    lc->boundary_flags = 0;
++
++    // Left edge: start of a tile line, or a different slice to the left
++    if ((ts_flags & CTB_TS_FLAGS_SOTL) != 0)
++        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
++    if (x_ctb > 0 && s->tab_slice_address[rs_addr] != s->tab_slice_address[rs_addr - 1])
++        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
++
++    // Upper edge: top of a tile, or a different slice above
++    if ((ts_flags & CTB_TS_FLAGS_TOT) != 0)
++        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
++    if (y_ctb > 0 && s->tab_slice_address[rs_addr] != s->tab_slice_address[rs_addr - ctbs_per_line])
++        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
++
++    // Use line width rather than tile width for addr_in_slice test as
++    // addr_in_slice is in raster units
++
++    if ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)
++        avail |= AVAIL_L;
++
++    if ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)
++        avail |= AVAIL_U;
++
++    if ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++        rs_in_slice > ctbs_per_line)
++        avail |= AVAIL_UL;
++
++    if ((ts_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++        rs_in_slice + 1 >= ctbs_per_line)
++        avail |= AVAIL_UR;
++
++    // Down-left never avail at CTB level
++    lc->ctb_avail = avail;
++}
++
++
++// Run the loop-filter stage for a job's bounds, signal reconstruction
++// progress when the filter reports a positive row, then release the job.
++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Non-zero when the last CTU of the job carries the EOT flag
++    const int last_is_eot =
++        (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0;
++    const int y_done = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, last_is_eot);
++
++    // Signal
++    if (y_done > 0) {
++        // Progress lives in s, so const has to be cast away here; this
++        // really shouldn't confuse anything
++        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y_done - 1);
++    }
++
++    // Job done now
++    // ? Move outside this fn
++    job_free(s->jbc, jb);
++}
++
++// I-pred, transform_and_add for all blocks types done here
++// All ARM
++// Walks the jb->intra command list (iap->n entries) in order, dispatching
++// each command by type, then marks the list empty.
++// An unknown command type is logged at PANIC level and aborts the process.
++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ unsigned int i;
++ HEVCRpiIntraPredEnv * const iap = &jb->intra;
++ const HEVCPredCmd *cmd = iap->cmds;
++
++#if !RPI_WORKER_WAIT_PASS_0
++ // When pass 0 does not wait itself, wait here on the job semaphore
++ rpi_sem_wait(&jb->sem);
++ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1
++#endif
++
++ for (i = iap->n; i > 0; i--, cmd++)
++ {
++ switch (cmd->type)
++ {
++ case RPI_PRED_INTRA:
++ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++ break;
++ case RPI_PRED_INTRA_C:
++ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++ break;
++ // The residual function tables are indexed by cmd->size - 2
++ case RPI_PRED_ADD_RESIDUAL:
++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ case RPI_PRED_ADD_DC:
++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_U:
++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_V:
++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_C:
++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ // U and V DC adds share one implementation
++ case RPI_PRED_ADD_DC_U:
++ case RPI_PRED_ADD_DC_V:
++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++
++ case RPI_PRED_I_PCM:
++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++ break;
++
++ default:
++ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++ abort();
++ }
++ }
++
++ // Mark done
++ iap->n = 0;
++}
++
++
++// Set initial uniform job values & zero ctu_count
++// Prepares job jb for a new run starting at tile-scan CTU ctu_ts_first:
++// (re)allocates the per-job coefficient storage when the SPS pointer has
++// changed, resets both inter-prediction environments and writes the initial
++// setup record at the head of every chroma and luma queue, then clears the
++// progress requests and resets the coefficient buffers.
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++ unsigned int i;
++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
++ const HEVCRpiSPS * const sps = s->ps.sps;
++
++ const uint16_t pic_width_y = sps->width;
++ const uint16_t pic_height_y = sps->height;
++
++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
++
++ // We expect the pointer to change if we use another sps
++ if (sps != jb->sps)
++ {
++ worker_pic_free_one(jb);
++
++ // Init info is selected by bit depth (table index = bit_depth - 8)
++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
++
++ {
++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++ // Two chroma planes, each reduced by the chroma h/v shifts
++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++ }
++
++ jb->sps = sps;
++ }
++
++ jb->waited = 0;
++ jb->ctu_ts_first = ctu_ts_first;
++ jb->ctu_ts_last = -1;
++
++ rpi_inter_pred_reset(cipe);
++ for (i = 0; i < cipe->n; i++) {
++ HEVCRpiInterPredQ * const cp = cipe->q + i;
++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
++
++ u->next_src1.x = 0;
++ u->next_src1.y = 0;
++ u->next_src1.base = 0;
++ u->pic_cw = pic_width_c;
++ u->pic_ch = pic_height_c;
++ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ cp->last_l0 = &u->next_src1;
++
++ u->next_fn = 0;
++ u->next_src2.x = 0;
++ u->next_src2.y = 0;
++ u->next_src2.base = 0;
++ cp->last_l1 = &u->next_src2;
++
++ // Commands are appended directly after the setup record
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++ }
++
++ rpi_inter_pred_reset(yipe);
++ for (i = 0; i < yipe->n; i++) {
++ HEVCRpiInterPredQ * const yp = yipe->q + i;
++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
++
++ y->next_src1.x = 0;
++ y->next_src1.y = 0;
++ y->next_src1.base = 0;
++ y->next_src2.x = 0;
++ y->next_src2.y = 0;
++ y->next_src2.base = 0;
++ y->pic_h = pic_height_y;
++ y->pic_w = pic_width_y;
++ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ y->next_fn = 0;
++ yp->last_l0 = &y->next_src1;
++ yp->last_l1 = &y->next_src2;
++
++ // Commands are appended directly after the setup record
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
++ }
++
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++
++ // -1 = no progress requirement recorded for that entry
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ jb->progress_req[i] = -1;
++ }
++
++ worker_pic_reset(&jb->coeffs);
++}
++
++
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++// Terminate every queue of inter-prediction environment ipe and add it to
++// the QPU job vqj.
++// Each queue is linked to its exit code, its last L0/L1 source slots are
++// pointed at the dummy frame, and its base address + setup code are placed
++// in the mailbox array. A writeback of the queues is added to rfe.
++// Returns 0 if ipe was not used (nothing is added), otherwise 1.
++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
++ unsigned int max_block = 0;
++
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++ // Bytes used so far in this queue
++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
++
++ if (block_size > max_block)
++ max_block = block_size;
++
++ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit);
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_qpu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_qpu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++
++ // Add to mailbox list
++ // [0] = queue base translated from the ARM mapping to the VC address
++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
++ mail[i][1] = yp->code_setup;
++ }
++
++ // We don't need invalidate here as the uniforms aren't changed by the QPU
++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing
++ // new values which seems to give us a small performance advantage
++ //
++ // In most cases we will not have a completely packed set of uniforms and as
++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the
++ // fullest
++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
++ ipe->n, ipe->max_fill + ipe->min_gap);
++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
++
++ return 1;
++}
++#endif
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++// Emulation counterpart of mc_terminate_add_qpu(): terminates every queue of
++// ipe by storing the exit code in the word before qpu_mc_curr and pointing
++// the last L0/L1 source slots at the emulation dummy frame.
++// vqj and rfe are not used here; nothing is added to the job.
++// Returns 0 if ipe was not used, otherwise 1.
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++
++ yp->qpu_mc_curr->data[-1] = yp->code_exit;
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_emu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_emu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++ }
++
++ return 1;
++}
++#endif
++
++
++// Select, independently for luma (Y) and chroma (C), whether queue
++// termination uses the emulation or the QPU implementation.
++#if RPI_QPU_EMU_Y
++#define mc_terminate_add_y mc_terminate_add_emu
++#else
++#define mc_terminate_add_y mc_terminate_add_qpu
++#endif
++#if RPI_QPU_EMU_C
++#define mc_terminate_add_c mc_terminate_add_emu
++#else
++#define mc_terminate_add_c mc_terminate_add_qpu
++#endif
++
++
++// Perform a writeback + invalidate cache flush covering the whole of
++// `frame`, using a flush environment that lives only for this call.
++// `s` is not used.
++static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
++{
++    rpi_cache_buf_t flush_buf;
++    rpi_cache_flush_env_t * const env = rpi_cache_flush_init(&flush_buf);
++
++    rpi_cache_flush_add_frame(env, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    rpi_cache_flush_finish(env);
++}
++
++// Fill jb->bounds with the pixel rectangle spanned by the job's first and
++// last CTUs (given in tile-scan order and converted to raster-scan), with
++// width and height clipped so the rectangle stays inside the picture.
++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    const unsigned int first_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
++    const unsigned int last_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
++    const unsigned int ctbs_per_line = s->ps.sps->ctb_width;
++    const unsigned int span_rs = last_rs - first_rs;
++    RpiBlk *const blk = &jb->bounds;
++
++    av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
++
++    // Origin of the first CTB in pixels
++    blk->x = (first_rs % ctbs_per_line) << s->ps.sps->log2_ctb_size;
++    blk->y = (first_rs / ctbs_per_line) << s->ps.sps->log2_ctb_size;
++
++    // Extent in whole CTBs, converted to pixels
++    blk->w = (span_rs % ctbs_per_line + 1) << s->ps.sps->log2_ctb_size;
++    blk->h = (span_rs / ctbs_per_line + 1) << s->ps.sps->log2_ctb_size;
++
++    // Clip to the picture
++    blk->w = FFMIN(blk->w, s->ps.sps->width - blk->x);
++    blk->h = FFMIN(blk->h, s->ps.sps->height - blk->y);
++}
++
++#if RPI_PASSES == 2
++// Second pass when only two passes are configured: runs the prediction
++// command list and then the deblock stage for the same job, in that order.
++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Stage 1: intra prediction and residual reconstruction
++    rpi_execute_pred_cmds(s, jb);
++
++    // Stage 2: deblocking for the CTBs covered by this job
++    rpi_execute_dblk_cmds(s, jb);
++}
++#endif
++
++// Core execution tasks
++// Builds one VPU/QPU job for jb: a VPU call over the job's coefficient
++// buffers (when cf->s[2] or cf->s[3] hold any), then the chroma and luma
++// inter-prediction queues, then a sync on jb->sem. The accumulated cache
++// flush is executed, any recorded reference progress is awaited, and the job
++// is finished with vpu_qpu_job_finish(). If any inter prediction was queued
++// an invalidate of the job's frame block is added to jb->rfe.
++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ int pred_y, pred_c;
++ vpu_qpu_job_env_t qvbuf;
++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
++#if RPI_WORKER_WAIT_PASS_0
++ int do_wait;
++#endif
++
++ {
++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++ if (cf->s[3].n + cf->s[2].n != 0)
++ {
++ const unsigned int csize = sizeof(cf->s[3].buf[0]);
++ // Byte offset (from s[2].buf) of the s[3] data, which starts n
++ // elements below s[3].buf
++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
++ // Coefficient counts scaled down by 256 and 1024 respectively
++ unsigned int n16 = (cf->s[2].n >> 8);
++ unsigned int n32 = (cf->s[3].n >> 10);
++#if RPI_COMPRESS_COEFFS
++ // With compression the upper 16 bits carry a second count: the same
++ // count when .packed is set, otherwise the one derived from .packed_n
++ if (cf->s[2].packed) {
++ n16 = n16 | (n16<<16);
++ } else {
++ const unsigned int npack16 = (cf->s[2].packed_n>>8);
++ n16 = n16 | (npack16<<16);
++ }
++ if (cf->s[3].packed) {
++ n32 = n32 | (n32<<16);
++ } else {
++ const unsigned int npack32 = (cf->s[3].packed_n>>10);
++ n32 = n32 | (npack32<<16);
++ }
++#endif
++ vpu_qpu_job_add_vpu(vqj,
++ vpu_get_fn(s->ps.sps->bit_depth),
++ vpu_get_constants(),
++ cf->gptr.vc,
++ n16,
++ cf->gptr.vc + offset32,
++ n32,
++ 0);
++
++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++ }
++ }
++
++ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
++
++ // Returns 0 if nothing to do, 1 if sync added
++#if RPI_WORKER_WAIT_PASS_0
++ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
++#else
++ // No sync was added so nothing will post the semaphore: post it here
++ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
++ sem_post(&jb->sem);
++#endif
++
++ rpi_cache_flush_execute(jb->rfe);
++
++ // Await progress as required
++ // jb->waited will only be clear if we have already tested the progress values
++ // (in worker_submit_job) and found we don't have to wait
++ if (jb->waited)
++ {
++ unsigned int i;
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ if (jb->progress_req[i] >= 0) {
++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++ }
++ }
++ }
++
++ vpu_qpu_job_finish(vqj);
++
++ // We always work on a rectangular block
++ if (pred_y || pred_c)
++ {
++ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++ ctx_vshift(s, 1), pred_y, pred_c);
++ }
++
++ // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ // c8 shader for sand8 frames, c16 otherwise; a NULL environment is
++ // passed for whichever of luma / chroma is not emulated
++ if (av_rpi_is_sand8_frame(s->frame))
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++ }
++ else
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++ }
++#endif
++
++#if RPI_WORKER_WAIT_PASS_0
++ // Wait in this pass for the sync (if one was added), then run the
++ // flush queued above
++ if (do_wait)
++ rpi_sem_wait(&jb->sem);
++ rpi_cache_flush_execute(jb->rfe);
++#endif
++}
++
++
++// Release the storage owned by an inter-prediction environment: the queue
++// array first (the pointer is cleared by av_freep), then the GPU memory.
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++    av_freep(&ipe->q);
++    gpu_free(&ipe->gptr);
++}
++
++// Allocate and initialise a job: semaphore, cache-flush environment,
++// progress-wait state, the intra prediction command array and the chroma and
++// luma inter-prediction environments.
++// Returns the new job, or NULL on any allocation failure, in which case
++// everything set up so far (including the job structure itself) is released.
++static HEVCRpiJob * job_new(void)
++{
++    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++    if (jb == NULL)
++        return NULL;
++
++    sem_init(&jb->sem, 0, 0);
++    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
++    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++    jb->intra.n = 0;
++    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
++        goto fail1;
++
++    // * Sizeof the union structure might be overkill but at the moment it
++    //   is correct (it certainly isn't going to be too small)
++    //   Set max fill to slack/2 from the end of the Q
++    //   If we exceed this in any Q then we will schedule by size (which should
++    //   mean that we never use that Q again apart from syncs)
++    // * Given how aggressive the overflow response is we could maybe put the
++    //   threshold even nearer the end, but I don't expect us to ever hit
++    //   it on any real stream anyway.
++
++    if (rpi_inter_pred_alloc(&jb->chroma_ip,
++                             QPU_N_MAX, QPU_N_GRP,
++                             QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
++                             QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
++        goto fail2;
++    if (rpi_inter_pred_alloc(&jb->luma_ip,
++                             QPU_N_MAX, QPU_N_GRP,
++                             QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
++                             QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
++        goto fail3;
++
++    return jb;
++
++fail3:
++    // Luma allocation failed: release it as before, and also release the
++    // chroma environment, which was successfully allocated by this point
++    rpi_free_inter_pred(&jb->luma_ip);
++    rpi_free_inter_pred(&jb->chroma_ip);
++fail2:
++    av_freep(&jb->intra.cmds);
++fail1:
++    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++    rpi_cache_flush_finish(jb->rfe);
++    sem_destroy(&jb->sem);
++    // The job structure itself must not be leaked on failure
++    av_free(jb);
++    return NULL;
++}
++
++// Destroy a job: releases its per-job picture/coefficient storage, the
++// progress-wait state, both inter-prediction environments and the intra
++// command array, finishes the cache-flush environment, destroys the
++// semaphore and finally frees the job structure itself.
++// jb must not be NULL (it is dereferenced unconditionally).
++static void job_delete(HEVCRpiJob * const jb)
++{
++ worker_pic_free_one(jb);
++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++ rpi_free_inter_pred(&jb->chroma_ip);
++ rpi_free_inter_pred(&jb->luma_ip);
++ av_freep(&jb->intra.cmds);
++ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing
++ sem_destroy(&jb->sem);
++ av_free(jb);
++}
++
++// Destroy a global job context: deletes every job on its free1 list, then
++// destroys the lock and frees the context.  A NULL argument is a no-op.
++static void jbg_delete(HEVCRpiJobGlobal * const jbg)
++{
++    HEVCRpiJob * cur;
++    HEVCRpiJob * following;
++
++    if (jbg == NULL)
++        return;
++
++    // Fetch the successor before the current job is destroyed
++    for (cur = jbg->free1; cur != NULL; cur = following)
++    {
++        following = cur->next;
++        job_delete(cur);
++    }
++
++    pthread_mutex_destroy(&jbg->lock);
++    av_free(jbg);
++}
++
++// Create a global job context holding job_count freshly created jobs on its
++// free1 list.  Returns NULL if the context or any job cannot be allocated;
++// jobs created before the failure are destroyed along with the context.
++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
++{
++    unsigned int made;
++    HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
++
++    if (jbg == NULL)
++        return NULL;
++
++    pthread_mutex_init(&jbg->lock, NULL);
++
++    for (made = 0; made != job_count; ++made)
++    {
++        HEVCRpiJob * const jb = job_new();
++
++        if (jb == NULL)
++        {
++            jbg_delete(jbg);
++            return NULL;
++        }
++
++        // Push onto the head of the free list
++        jb->next = jbg->free1;
++        jbg->free1 = jb;
++    }
++
++    return jbg;
++}
++
++// Destroy a per-context job controller and drop its reference on the global
++// job pool it points at.
++// Deletes the controller's own job (jb1) if present, destroys its lock and
++// output semaphore and frees the controller.  If this was the last reference
++// on the global pool (ref_count goes from 1 to 0) the pool is deleted too.
++// A NULL controller is accepted and ignored.
++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
++{
++ HEVCRpiJobGlobal * jbg;
++
++ if (jbc == NULL)
++ return;
++
++ // Remember the pool now - jbc is freed before the pool is dereferenced
++ jbg = jbc->jbg;
++
++ if (jbc->jb1 != NULL)
++ job_delete(jbc->jb1);
++
++ pthread_mutex_destroy(&jbc->in_lock);
++ sem_destroy(&jbc->sem_out);
++ av_free(jbc);
++
++ // Deref the global job context
++ // atomic_fetch_add returns the previous value, so == 1 means we were last
++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
++ jbg_delete(jbg);
++}
++
++// Create a job controller attached to the global job pool jbg.
++// Takes a reference on jbg, initialises the output semaphore (count
++// RPI_MAX_JOBS) and the input lock, and creates the controller's own local
++// job (jb1).
++// Returns the controller, or NULL on allocation failure; on a late failure
++// the partially built controller is torn down with rpi_job_ctl_delete(),
++// which also drops the reference taken here.
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++    HEVCRpiJobCtl * const ctl = av_mallocz(sizeof(*ctl));
++
++    if (!ctl)
++        return NULL;
++
++    ctl->jbg = jbg;
++    atomic_fetch_add(&jbg->ref_count, 1);
++
++    sem_init(&ctl->sem_out, 0, RPI_MAX_JOBS);
++    pthread_mutex_init(&ctl->in_lock, NULL);
++
++    ctl->jb1 = job_new();
++    if (ctl->jb1 == NULL)
++    {
++        rpi_job_ctl_delete(ctl);
++        return NULL;
++    }
++
++    // Mark the job as belonging to this controller
++    ctl->jb1->jbc_local = ctl;
++    return ctl;
++}
++
++
++
++// Initialise the worker pass queues for this context and start them.
++// The number of queues depends on RPI_PASSES (2 or 3; anything else is a
++// compile error).  Each queue is handed the sem_in of the following pass,
++// and the final pass is handed the job controller's sem_out
++// (presumably the semaphore each pass posts when done - confirm in
++// pass_queue_init).  Queues are initialised last-to-first.
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
++
++ pass_queues_start_all(s);
++}
++
++// Shut down the worker pass queues and release the job controller.
++// The queues are terminated and then killed before the controller is
++// deleted; s->jbc is cleared afterwards so it cannot be used again.
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++ pass_queues_term_all(s);
++
++ pass_queues_kill_all(s);
++
++ rpi_job_ctl_delete(s->jbc);
++ s->jbc = NULL;
++}
++
++
++// Validate the just-parsed slice (segment) header against the picture
++// state and prime the local context for decoding it.
++//
++// Checks performed:
++//  * a dependent slice segment cannot be the first CTB of the picture
++//  * a dependent slice segment must directly follow a segment of the same
++//    slice (checked via tab_slice_address of the previous CTB in TS order)
++//  * without entropy coding sync, the entry points must not run past the
++//    last tile, and a slice with entry points must start at a tile start
++//
++// Returns 0 on success or AVERROR_INVALIDDATA if any check fails.
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
++
++    if (s->sh.dependent_slice_segment_flag)
++    {
++        // Check for obvious disasters
++        if (ctb_addr_ts == 0) {
++            av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // ctb_addr_ts != 0 here so looking one CTB back is safe
++        {
++            const int rs_before = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
++            if (s->tab_slice_address[rs_before] != s->sh.slice_addr) {
++                av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++                return AVERROR_INVALIDDATA;
++            }
++        }
++    }
++
++    if (!s->ps.pps->entropy_coding_sync_enabled_flag)
++    {
++        if (tile_id + s->sh.num_entry_point_offsets >= tiles)
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // Tiled stuff must start at start of tile if it has multiple entry points
++        if (s->sh.num_entry_point_offsets != 0 &&
++            ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    ff_hevc_rpi_cabac_init_decoder(lc);
++
++    // Decode state for the new segment: cabac only needs a fresh init if
++    // this segment is not a continuation of the previous one
++    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
++    lc->qp_y = s->sh.slice_qp;
++
++    // Position / line bookkeeping
++    lc->bt_line_no = 0;
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++
++// Convert the entry_point_offset[] table from the slice header into
++// absolute byte offsets and sizes (sh->offset[] / sh->size[]) within the
++// NAL payload, and remember the payload start in s->data.
++//
++// s    Decoder context; s->sh holds the parsed slice header and
++//      s->HEVClc->gb the bit reader positioned at the end of the header
++// nal  The NAL being decoded (payload, size and skipped-byte positions)
++//
++// Returns 0 on success (with s->data == NULL if the slice has no entry
++// points) or AVERROR_INVALIDDATA if the table is inconsistent with the NAL.
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++ const GetBitContext * const gb = &s->HEVClc->gb;
++ RpiSliceHeader * const sh = &s->sh;
++ int i, j;
++
++ const unsigned int length = nal->size;
++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte
++ unsigned int cmpt;
++ unsigned int startheader;
++
++ if (sh->num_entry_point_offsets == 0) {
++ s->data = NULL;
++ return 0;
++ }
++
++ // offset in slice header includes emulation prevention bytes.
++ // Unfortunately those have been removed by the time we get here so we
++ // have to compensate. The nal layer keeps a track of where they were.
++ // cmpt counts the skipped bytes that fell inside the first section
++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++
++ // For each further entry point: advance offset by the (compensated)
++ // length of the previous section, then count the skipped bytes inside
++ // this section to get its compensated size
++ for (i = 1; i < sh->num_entry_point_offsets; i++) {
++ offset += (sh->entry_point_offset[i - 1] - cmpt);
++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++ // A section made up entirely of skipped bytes would have size <= 0
++ if (sh->entry_point_offset[i] <= cmpt) {
++ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
++ return AVERROR_INVALIDDATA;
++ }
++ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
++ sh->offset[i - 1] = offset;
++ }
++
++ // The final section runs from the last entry point to the end of the NAL
++ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
++ if (length < offset) {
++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++ return AVERROR_INVALIDDATA;
++ }
++ sh->size[sh->num_entry_point_offsets - 1] = length - offset;
++ sh->offset[sh->num_entry_point_offsets - 1] = offset;
++
++ // Remember data start pointer as we won't have nal later
++ s->data = nal->data;
++ return 0;
++}
++
++
++// Decode CTBs from the bitstream into the current job (lc->jb0), starting
++// at lc->ts, until the job is full, max_blocks CTBs have been decoded, the
++// slice data ends or the end of the picture is reached.
++//
++// max_blocks  Maximum number of CTBs to decode in this call; 0 means no
++//             limit (the countdown never reaches zero)
++//
++// Return
++// < 0 Error
++// 0 OK
++//
++// jb->ctu_ts_last < 0 Job still filling
++// jb->ctu_ts_last >= 0 Job ready
++//
++// On a 0 return lc->ts has been advanced to the next CTB to decode and
++// lc->unit_done is set if the slice data has finished.
++
++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
++{
++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++ const unsigned int ctb_size = (1 << log2_ctb_size);
++ HEVCRpiJob * const jb = lc->jb0;
++ int more_data = 1;
++ unsigned int ctb_addr_ts = lc->ts;
++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
++ // y_ctb is const: the loop always stops at (or before) the end of a tile line
++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
++
++ lc->unit_done = 0;
++
++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
++ {
++ int q_full;
++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++
++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
++
++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
++
++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
++
++ // Record per-CTB deblock parameters from the current slice header
++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
++
++ // Zap stashes if navail
++ if ((lc->ctb_avail & AVAIL_U) == 0)
++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
++ if ((lc->ctb_avail & AVAIL_L) == 0)
++ {
++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
++ }
++#if MVF_STASH_WIDTH > 64
++ // Restore left mvf stash at start of tile if not at start of line
++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
++ {
++ unsigned int i;
++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++ {
++ *dst = *src++;
++ dst += MVF_STASH_WIDTH_PU;
++ }
++ }
++#endif
++
++ // Set initial tu states
++ lc->tu.cu_qp_delta = 0;
++ lc->tu.is_cu_qp_delta_wanted = 0;
++ lc->tu.cu_chroma_qp_offset_wanted = 0;
++
++ // Decode
++ // more_data: > 0 more CTBs follow, 0 end of slice data, < 0 error
++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
++
++ if (ff_hevc_rpi_cabac_overflow(lc))
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
++ more_data = AVERROR_INVALIDDATA;
++ }
++
++ if (more_data < 0) {
++ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken
++ return more_data;
++ }
++
++ // At end of tile (or end of tile line when using entropy coding sync)
++ // a terminate element is read and the cabac decoder skipped on
++ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
++ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
++ {
++ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
++ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
++ // NOTE(review): returns a bare -1 rather than an AVERROR code and
++ // does not mark the slice broken, unlike the error path above
++ return -1;
++ }
++ }
++
++ // --- Post CTB processing
++
++ // Stash rpl top/left for deblock that needs to remember such things cross-slice
++ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
++ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
++
++ if (!s->is_irap)
++ {
++ // Copy MVF up to up-left & stash to up
++ {
++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
++ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
++
++ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
++
++ // Save the old last element of "up" before it is overwritten
++ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
++ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
++ }
++ // Stash sideways if end of tile line but not end of line (no point)
++ // ** Could/should do this @ end of fn
++#if MVF_STASH_WIDTH > 64
++ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
++#endif
++ {
++ unsigned int i;
++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
++ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++ {
++ *dst++ = *src;
++ src += MVF_STASH_WIDTH_PU;
++ }
++ }
++ }
++
++ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
++ ff_hevc_rpi_save_states(s, lc);
++
++ // Report progress so we can use our MVs in other frames
++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++
++ // End of line || End of tile line || End of tile
++ // (EoL covers end of frame for our purposes here)
++ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
++
++ // Allocate QPU chunks on fixed size 64 pel boundaries rather than
++ // whatever ctb_size is today.
++ // * We might quite like to continue to 64 pel vertical too but that
++ // currently confuses WPP
++ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++ {
++ int overflow = 0;
++ // Both queues must be advanced even if the first reports overflow
++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++ overflow = 1;
++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++ overflow = 1;
++ if (overflow)
++ {
++ // * This is very annoying (and slow) to cope with in WPP so
++ // we treat it as an error there (no known stream triggers this
++ // with the current buffer sizes). Non-wpp should cope fine.
++ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
++ q_full = 1;
++ }
++ }
++
++ // Inc TS to next.
++ ctb_addr_ts++;
++ ctb_addr_rs++;
++ x_ctb += ctb_size;
++
++ if (q_full)
++ {
++ // Do job
++ // Prep for submission
++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced
++ job_gen_bounds(s, jb);
++ break;
++ }
++
++ // If max_blocks started as 0 then this will never be true
++ if (--max_blocks == 0)
++ break;
++ }
++
++ // more_data cannot be < 0 here (that returns above) so this is "slice ended"
++ lc->unit_done = (more_data <= 0);
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
++// Put a local context into its initial bit-thread state: bound to decoder
++// context s, numbered n, with no job, no output semaphore, the terminate
++// flag clear and an input semaphore initialised to a count of 0.
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++    // Identity
++    lc->lc_n = n;
++    lc->context = s;
++
++    // Nothing owned or linked yet
++    lc->jb0 = NULL;
++    lc->bt_psem_out = NULL;
++    lc->bt_terminate = 0;
++
++    sem_init(&lc->bt_sem_in, 0, 0);
++}
++
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++// Return the width, taken from the PPS column_width table, of the tile
++// column containing the CTB at tile-scan address ts.
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{
++    const unsigned int addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++    const unsigned int ctb_x = addr_rs % s->ps.sps->ctb_width;
++
++    return s->ps.pps->column_width[s->ps.pps->col_idxX[ctb_x]];
++}
++
++// Hand decode state from the local context that finished a slice (src_lc,
++// normally an aux bit thread) over to dst_lc (the main thread's context),
++// since decoding carries on there.
++//
++// Always transferred: the active job (ownership moves, src is left with
++// none), the tile-scan position and the bit reader.
++// Transferred only if is_dep is set (a dependent slice segment might
++// follow): qPy_pred, ipm_left, cabac_state and stat_coeff.
++//
++// Does nothing if src_lc and dst_lc are the same context.
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++    if (src_lc != dst_lc)
++    {
++        // We may still hold a job if the final line terminated early.
++        // The destination should never have one by this point.
++        av_assert1(dst_lc->jb0 == NULL);
++        dst_lc->jb0 = src_lc->jb0;
++        src_lc->jb0 = NULL;
++
++        // Bitstream position is always needed.
++        // (The cabac init request is rebuilt at the start of the next slice)
++        dst_lc->ts = src_lc->ts;
++        dst_lc->gb = src_lc->gb;
++
++        if (!is_dep)
++            return;
++
++        // Context a dependent segment would inherit
++        dst_lc->qPy_pred = src_lc->qPy_pred;
++        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
++        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++    }
++}
++
++// Block on this context's input semaphore, then report the terminate flag.
++// Returns non-zero if lc->bt_terminate was set by the time we woke up.
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++ rpi_sem_wait(&lc->bt_sem_in);
++ return lc->bt_terminate;
++}
++
++// Do one WPP line
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++//
++// Decodes the line (WPP) or tile (tile mode, lc->bt_is_tile) that lc is
++// currently set up for, in chunks of partial_size CTBs, exchanging
++// semaphore posts with the neighbouring bit threads between chunks.
++//
++// is_first  Non-zero for the first line/tile of a run: the per-chunk waits
++//           on lc->bt_sem_in are skipped
++//
++// Returns 0 on success, AVERROR_EXIT if termination was requested while
++// waiting, or another negative error code if decode failed.  After a decode
++// error the function keeps advancing lc->ts and passing signals on so that
++// the other threads are not left waiting.
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++ const int is_tile = lc->bt_is_tile;
++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++ const unsigned int line = lc->bt_line_no;
++ const unsigned int line_inc = lc->bt_line_inc;
++ const int is_last = (line >= lc->bt_last_line);
++
++ // ts_eol: first ts beyond this line/tile
++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++ // ts_next: where this lc will start its next line/tile (INT_MAX if none)
++ const unsigned int ts_next =
++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++ INT_MAX :
++ is_tile ?
++ s->ps.pps->tile_pos_ts[tile_id + line_inc] :
++ lc->ts + lc->bt_line_width * line_inc;
++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++ unsigned int ts_prev;
++ int loop_n = 0;
++ int err = 0;
++
++ av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
++ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
++#endif
++ // Every line except the first starts at an entry point: point the bit
++ // reader and cabac decoder at that section of the slice data
++ if (line != 0)
++ {
++ const uint8_t * const data = s->data + s->sh.offset[line - 1];
++ const unsigned int len = s->sh.size[line - 1];
++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
++ return err;
++
++ ff_init_cabac_decoder(&lc->cc, data, len);
++ }
++
++ // We should never be processing a dependent slice here so reset is good
++ // ?? These probably shouldn't be needed (as they should be set by later
++ // logic) but do seem to be required
++ lc->qp_y = s->sh.slice_qp;
++
++ do
++ {
++ // From the third chunk on, post the next thread once per chunk
++ // (the posts owed for the first chunks are made after the loop)
++ if (!is_last && loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ // The wait for loop_n == 0 has been done in bit_thread
++ if (!is_first && loop_n != 0)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++#if TRACE_WPP
++ {
++ int n;
++ sem_getvalue(&lc->bt_sem_in, &n);
++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
++ }
++#endif
++
++ ts_prev = lc->ts;
++
++ // If we have had an error - do no further decode but do continue
++ // moving signals around so the other threads continue to operate
++ // correctly (or at least as correctly as they can with this line missing)
++ //
++ // Errors in WPP/Tile are less fatal than normal as we have a good idea
++ // of how to restart on the next line so there is no need to give up totally
++ if (err != 0)
++ {
++ lc->unit_done = 0;
++ lc->ts += partial_size;
++ }
++ else
++ {
++ worker_pass0_ready(s, lc);
++
++ // Besides a decode error, it is also an error if (not on the last
++ // line) the chunk stopped short or the slice ended before EoL
++ if ((err = fill_job(s, lc, partial_size)) < 0 ||
++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
++ {
++ if (err == 0) {
++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++ err = AVERROR_INVALIDDATA;
++ }
++ worker_free(s, lc);
++ lc->ts = ts_prev + partial_size; // Pretend we did all that
++ lc->unit_done = 0;
++ }
++ else if (is_tile)
++ {
++ // Tiles submit a job per chunk; WPP submits once after the loop
++ worker_submit_job(s, lc);
++ }
++ }
++
++ ++loop_n;
++ } while (lc->ts < ts_eol && !lc->unit_done);
++
++ // If we are on the last line & we didn't get a whole line we must wait for
++ // and sink the sem_posts from the line above / tile to the left.
++ while ((ts_prev += partial_size) < ts_eol)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++ lc->bt_line_no += line_inc;
++
++ if (!is_tile && err == 0)
++ worker_submit_job(s, lc);
++
++ if (!is_last) {
++ lc->ts = ts_next;
++
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ // Second post only if the loop ran more than once
++ if (loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ }
++ else
++ {
++ // Last line: hand our state back to the main thread's context
++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT
++#if MVF_STASH_WIDTH > 64
++ // Horrid calculations to work out what we want but luckily this should almost never execute
++ // **** Move to movlc
++ if (!s->is_irap)
++ {
++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
++ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
++ {
++ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
++ unsigned int i;
++ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++
++ // Copy one column of the mvf stash from our lc to lc[0]
++ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
++ {
++ *d_mvf = *s_mvf;
++ d_mvf += MVF_STASH_WIDTH_PU;
++ s_mvf += MVF_STASH_WIDTH_PU;
++ }
++
++ }
++ }
++#endif
++ // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++ sem_post(&s->HEVClcList[0]->bt_sem_in);
++ }
++
++#if TRACE_WPP
++ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++ return err;
++}
++
++// Prepare the local contexts for WPP decode of the current slice.
++// Context i (for as many contexts as there are lines, up to
++// RPI_BIT_THREADS) is pointed at line i of the slice; every context then
++// steps RPI_BIT_THREADS lines at a time, so the lines are interleaved
++// across the bit threads.
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++    const unsigned int first_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int line_width = line_ts_width(s, first_ts);
++    int n;
++
++    for (n = 0; n < RPI_BIT_THREADS && n <= s->sh.num_entry_point_offsets; ++n)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[n];
++
++        lc->bt_is_tile = 0;
++        lc->bt_line_no = n;
++        lc->bt_line_inc = RPI_BIT_THREADS;
++        lc->bt_line_width = line_width;
++        lc->bt_last_line = s->sh.num_entry_point_offsets;
++        // Line n starts n line-widths after the start of the slice
++        lc->ts = first_ts + line_width * (unsigned int)n;
++    }
++}
++
++
++// Prepare the local contexts for decoding one row of tiles of the current
++// slice in parallel.  Only a single tile row can be processed at once.
++//
++// slice_row  Tile row to set up, counted from the row the slice starts in
++//            (0 = first row, which may start part-way along the row)
++//
++// "line" numbers here count tiles from the first tile of the slice, so
++// they index the slice's entry points in the same way WPP lines do.
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++ const HEVCRpiPPS * const pps = s->ps.pps;
++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++ const unsigned int tile0 = pps->tile_id[ts0];
++ const unsigned int col0 = tile0 % pps->num_tile_columns;
++
++ // Only the first row of the slice can start at a non-zero tile column
++ const unsigned int col = (slice_row == 0) ? col0 : 0;
++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
++ // Last tile to do: end of this tile row or end of the slice, whichever is first
++ const unsigned int last_line = FFMIN(
++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++
++ // Number of contexts to run in parallel for this row
++ const unsigned int par =
++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++ for (unsigned int i = 0; i != par; ++i, ++line)
++ {
++ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++ const unsigned int tile = tile0 + line;
++
++ lc->ts = pps->tile_pos_ts[tile];
++ lc->bt_line_no = line;
++ lc->bt_is_tile = 1;
++ lc->bt_line_width = line_ts_width(s, lc->ts);
++ lc->bt_last_line = last_line;
++ lc->bt_line_inc = par;
++ }
++}
++
++
++// Thread body for an auxiliary bit thread.
++// v is the thread's HEVCRpiLocalContext.  The thread sleeps on the
++// context's input semaphore; every time it is woken without the terminate
++// flag set it decodes one line/tile.  Decode failures are logged and the
++// thread carries on; the thread exits once termination is requested.
++// Always returns NULL.
++static void * bit_thread(void * v)
++{
++    HEVCRpiLocalContext * const lc = v;
++    HEVCRpiContext *const s = lc->context;
++
++    for (;;)
++    {
++        int rv;
++
++        if (wait_bt_sem_in(lc) != 0)
++            break;
++
++        rv = rpi_run_one_line(s, lc, 0); // Never first tile/wpp
++        if (rv >= 0)
++            continue;
++
++        if (lc->bt_terminate) {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++            break;
++        }
++        av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, rv);
++    }
++
++    return NULL;
++}
++
++// Create the auxiliary bit threads (and their local contexts) if they are
++// not already running.
++//
++// Local contexts 1..RPI_BIT_THREADS-1 are allocated if needed and
++// initialised, the contexts' semaphores are linked in a ring (each
++// context's bt_psem_out is the next context's bt_sem_in, the last wrapping
++// to context 0) and then RPI_EXTRA_BIT_THREADS threads are started, thread
++// i running on context i + 1.
++//
++// Returns 0 on success (or if already started), -1 on allocation or thread
++// creation failure.  If a thread cannot be created, any threads already
++// started by this call are told to terminate and joined so that none are
++// left running against contexts that a later retry would re-initialise.
++static int bit_threads_start(HEVCRpiContext * const s)
++{
++    if (s->bt_started)
++        return 0;
++
++    for (int i = 1; i < RPI_BIT_THREADS; ++i)
++    {
++        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS-1]
++        if (s->HEVClcList[i] == NULL) {
++            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++                return -1;
++        }
++
++        bt_lc_init(s, s->HEVClcList[i], i);
++        job_lc_init(s->HEVClcList[i]);
++    }
++
++    // Link the sems in a circle
++    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++    // All lc are now initialised so it is safe to start the threads
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {
++        // pthread_create returns 0 on success and a positive error number
++        // on failure - it never returns a negative value
++        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) != 0)
++        {
++            // Stop the threads we did manage to start
++            for (int j = 0; j < i; ++j)
++            {
++                HEVCRpiLocalContext * const lc = s->HEVClcList[j + 1];
++                lc->bt_terminate = 1;
++                sem_post(&lc->bt_sem_in);
++                pthread_join(s->bit_threads[j], NULL);
++            }
++            return -1;
++        }
++    }
++
++    s->bt_started = 1;
++    return 0;
++}
++
++// Stop and join the auxiliary bit threads if they are running.
++// For each thread's local context: set the terminate flag, post its input
++// semaphore so the thread wakes and sees the flag, join the thread, then
++// destroy the semaphore and release the context's job state.  The local
++// context structures themselves are not freed here.
++// Always returns 0.
++static int bit_threads_kill(HEVCRpiContext * const s)
++{
++ if (!s->bt_started)
++ return 0;
++ s->bt_started = 0;
++
++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++ {
++ // Thread i runs on local context i + 1 (context 0 is the main thread's)
++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
++ if (lc == NULL)
++ break;
++
++ // Flag must be set before the post so the woken thread sees it
++ lc->bt_terminate = 1;
++ sem_post(&lc->bt_sem_in);
++ pthread_join(s->bit_threads[i], NULL);
++
++ sem_destroy(&lc->bt_sem_in);
++ job_lc_kill(lc);
++ }
++ return 0;
++}
++#endif
++
++
++// Decide whether the worker must be drained after the job ending at
++// tile-scan address n.
++// If we are at the end of a tile (but not the end of a line) and the row
++// is shorter than the number of jobs we can queue, we have to wait for the
++// job to finish, otherwise we risk cache/QPU disasters.
++// Only applies to non-I slices when the PPS has tile_wpp_inter_disable >= 2.
++// A negative n never needs a wait.
++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
++{
++    const unsigned int eot_eol = CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL;
++
++    if (s->ps.pps->tile_wpp_inter_disable < 2)
++        return 0;
++    if (s->sh.slice_type == HEVC_SLICE_I)
++        return 0;
++    if (n < 0)
++        return 0;
++
++    // End of tile set, end of line clear
++    return (s->ps.pps->ctb_ts_flags[n] & eot_eol) == CTB_TS_FLAGS_EOT;
++}
++
++// Decode the slice data of the current slice (segment).
++//
++// Depending on the slice header this runs in one of three modes:
++//  * offload_tiles: tile rows are decoded one row at a time with the tiles
++//    of a row spread across the bit threads
++//  * offload_wpp:   WPP lines are spread across the bit threads
++//  * otherwise:     everything is decoded on the calling thread
++// (the first two exist only if RPI_EXTRA_BIT_THREADS > 0)
++//
++// avctxt          Codec context; priv_data is the HEVCRpiContext
++// isFilterThread  Unused
++//
++// Returns the tile-scan address reached (lc->ts) on success or a negative
++// error code.  Errors inside individual tiles/WPP lines are handled (and
++// logged) by the line decoder and are not reported here.
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{
++    HEVCRpiContext * const s = avctxt->priv_data;
++    HEVCRpiLocalContext * const lc = s->HEVClc;
++    int err;
++
++    // Start of slice
++    if ((err = slice_start(s, lc)) != 0)
++        return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++    if (s->sh.offload_tiles)
++    {
++        unsigned int slice_row = 0;
++
++#if TRACE_WPP
++        printf("%s: Do Tiles\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        // On failure some of the local contexts may not exist so we must
++        // not go on to use them
++        if ((err = bit_threads_start(s)) < 0) {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Failed to start bit threads\n", __func__);
++            return err;
++        }
++
++        do
++        {
++            // Reset lc lines etc.
++            tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            while (lc->bt_line_no <= lc->bt_last_line) {
++                rpi_sem_wait(&lc->bt_sem_in);
++                rpi_run_one_line(s, lc, 0);
++            }
++#if TRACE_WPP
++            printf("%s: Done body\n", __func__);
++#endif
++
++            // Wait for everything else to finish
++            rpi_sem_wait(&lc->bt_sem_in);
++
++            ++slice_row;
++        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
++
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else if (s->sh.offload_wpp)
++    {
++#if TRACE_WPP
++        printf("%s: Do WPP\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        // On failure some of the local contexts may not exist so we must
++        // not go on to use them
++        if ((err = bit_threads_start(s)) < 0) {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Failed to start bit threads\n", __func__);
++            return err;
++        }
++
++        // Reset lc lines etc.
++        wpp_setup_lcs(s);
++
++        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++        printf("%s: Done 1st\n", __func__);
++#endif
++
++        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++            rpi_sem_wait(&lc->bt_sem_in);
++            rpi_run_one_line(s, lc, 0);
++        }
++#if TRACE_WPP
++        printf("%s: Done body\n", __func__);
++#endif
++
++        // Wait for everything else to finish
++        rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else
++#endif
++    {
++#if TRACE_WPP
++        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++        // Single bit thread
++        do {
++            // Make sure we have space to prepare the next job
++            worker_pass0_ready(s, lc);
++
++            // max_blocks == 0: fill until the job is full or the slice ends
++            if ((err = fill_job(s, lc, 0)) < 0)
++                goto fail;
++
++            worker_submit_job(s, lc);
++
++            // lc->ts - 1 is the last CTB of the job just submitted
++            if (tile_needs_wait(s, lc->ts - 1))
++                worker_wait(s, lc);
++
++        } while (!lc->unit_done);
++
++#if TRACE_WPP
++        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++
++    // If we have reached the end of the frame
++    // then wait for the worker to finish all its jobs
++    if (lc->ts >= s->ps.sps->ctb_size)
++        worker_wait(s, lc);
++
++#if RPI_TSTATS
++    {
++        HEVCRpiStats *const ts = &s->tstats;
++
++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
++        memset(ts, 0, sizeof(*ts));
++    }
++#endif
++
++    return lc->ts;
++
++fail:
++    // Cleanup
++    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++    // Free our job & wait for termination
++    worker_free(s, lc);
++    worker_wait(s, lc);
++    return err;
++}
++
++
++// Compute s->no_backward_pred_flag for the current slice.
++// The flag is left at 0 unless this is a B slice with temporal MVP
++// enabled, in which case it is set to 1 as soon as any entry in either
++// reference picture list has a POC greater than the current picture's.
++static void set_no_backward_pred(HEVCRpiContext * const s)
++{
++    const RefPicList *const rpl = s->refPicList;
++    int list_no;
++
++    s->no_backward_pred_flag = 0;
++
++    if (s->sh.slice_type == HEVC_SLICE_B && s->sh.slice_temporal_mvp_enabled_flag)
++    {
++        for (list_no = 0; list_no < 2; list_no++) {
++            int ref_no;
++
++            for (ref_no = 0; ref_no < rpl[list_no].nb_refs; ref_no++) {
++                if (rpl[list_no].list[ref_no] > s->poc) {
++                    // One is enough - no need to look further
++                    s->no_backward_pred_flag = 1;
++                    return;
++                }
++            }
++        }
++    }
++}
++
++// Decode the slice data carried in nal for the slice whose header has
++// already been parsed into s->sh.
++// Builds the entry point tables, works out the no-backward-prediction
++// flag and then runs the decode.
++// Returns a negative error code if the entry points are invalid,
++// otherwise whatever rpi_decode_entry() returns.
++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    const int ep_err = gen_entry_points(s, nal);
++
++    if (ep_err < 0)
++        return ep_err;
++
++    set_no_backward_pred(s);
++
++    return rpi_decode_entry(s->avctx, NULL);
++}
++
++// Attach side data derived from received SEI messages to the current
++// output frame (s->ref->frame): stereo 3D packing, display orientation,
++// mastering display metadata, content light level and A53 closed captions.
++// Also applies the alternative transfer characteristics SEI to the frame
++// and codec context colour transfer.
++//
++// Returns 0 on success or AVERROR(ENOMEM) if side data could not be
++// allocated (a failure to allocate A53 caption side data is not treated
++// as an error - the captions are simply dropped).
++static int set_side_data(HEVCRpiContext *s)
++{
++ AVFrame *out = s->ref->frame;
++
++ // Frame packing: only arrangement types 3..5 with content
++ // interpretation type 1 or 2 are translated
++ if (s->sei.frame_packing.present &&
++ s->sei.frame_packing.arrangement_type >= 3 &&
++ s->sei.frame_packing.arrangement_type <= 5 &&
++ s->sei.frame_packing.content_interpretation_type > 0 &&
++ s->sei.frame_packing.content_interpretation_type < 3) {
++ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
++ if (!stereo)
++ return AVERROR(ENOMEM);
++
++ switch (s->sei.frame_packing.arrangement_type) {
++ case 3:
++ if (s->sei.frame_packing.quincunx_subsampling)
++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
++ else
++ stereo->type = AV_STEREO3D_SIDEBYSIDE;
++ break;
++ case 4:
++ stereo->type = AV_STEREO3D_TOPBOTTOM;
++ break;
++ case 5:
++ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
++ break;
++ }
++
++ if (s->sei.frame_packing.content_interpretation_type == 2)
++ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++
++ if (s->sei.frame_packing.arrangement_type == 5) {
++ if (s->sei.frame_packing.current_frame_is_frame0_flag)
++ stereo->view = AV_STEREO3D_VIEW_LEFT;
++ else
++ stereo->view = AV_STEREO3D_VIEW_RIGHT;
++ }
++ }
++
++ // Display orientation: only emitted if there is some rotation or flip
++ if (s->sei.display_orientation.present &&
++ (s->sei.display_orientation.anticlockwise_rotation ||
++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
++ // Rotation is scaled so that 1 << 16 corresponds to 360 degrees
++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
++ AVFrameSideData *rotation = av_frame_new_side_data(out,
++ AV_FRAME_DATA_DISPLAYMATRIX,
++ sizeof(int32_t) * 9);
++ if (!rotation)
++ return AVERROR(ENOMEM);
++
++ av_display_rotation_set((int32_t *)rotation->data, angle);
++ av_display_matrix_flip((int32_t *)rotation->data,
++ s->sei.display_orientation.hflip,
++ s->sei.display_orientation.vflip);
++ }
++
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.mastering_display.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.mastering_display.present--;
++ }
++ if (s->sei.mastering_display.present) {
++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
++ const int mapping[3] = {2, 0, 1};
++ const int chroma_den = 50000;
++ const int luma_den = 10000;
++ int i;
++ AVMasteringDisplayMetadata *metadata =
++ av_mastering_display_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < 3; i++) {
++ const int j = mapping[i];
++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
++ metadata->display_primaries[i][0].den = chroma_den;
++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
++ metadata->display_primaries[i][1].den = chroma_den;
++ }
++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
++ metadata->white_point[0].den = chroma_den;
++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
++ metadata->white_point[1].den = chroma_den;
++
++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
++ metadata->max_luminance.den = luma_den;
++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
++ metadata->min_luminance.den = luma_den;
++ metadata->has_luminance = 1;
++ metadata->has_primaries = 1;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
++ av_q2d(metadata->display_primaries[0][0]),
++ av_q2d(metadata->display_primaries[0][1]),
++ av_q2d(metadata->display_primaries[1][0]),
++ av_q2d(metadata->display_primaries[1][1]),
++ av_q2d(metadata->display_primaries[2][0]),
++ av_q2d(metadata->display_primaries[2][1]),
++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "min_luminance=%f, max_luminance=%f\n",
++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
++ }
++ // Decrement the content light flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.content_light.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.content_light.present--;
++ }
++ if (s->sei.content_light.present) {
++ AVContentLightMetadata *metadata =
++ av_content_light_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
++ metadata->MaxCLL, metadata->MaxFALL);
++ }
++
++ // A53 captions: copied into the frame if side data can be allocated;
++ // the SEI buffer is consumed (freed and zeroed) either way
++ if (s->sei.a53_caption.a53_caption) {
++ AVFrameSideData* sd = av_frame_new_side_data(out,
++ AV_FRAME_DATA_A53_CC,
++ s->sei.a53_caption.a53_caption_size);
++ if (sd)
++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
++ av_freep(&s->sei.a53_caption.a53_caption);
++ s->sei.a53_caption.a53_caption_size = 0;
++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
++ }
++
++ // Alternative transfer characteristics override the colour transfer,
++ // but only if the value has a name known to libavutil and is not
++ // "unspecified"
++ if (s->sei.alternative_transfer.present &&
++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
++ }
++
++ return 0;
++}
++
++// Set up decoder state for the first slice of a new picture.
++//
++// Clears the per-picture tables, makes sure rpl_tab has one entry per NAL in
++// the current packet, calls ff_hevc_rpi_set_new_ref() for s->poc, builds the
++// frame RPS, attaches side data and calls ff_hevc_rpi_output_frame() to fill
++// s->output_frame.
++//
++// Returns 0 on success or a negative AVERROR code. On failure s->ref is
++// unreferenced (if set) and reset to NULL.
++static int hevc_frame_start(HEVCRpiContext * const s)
++{
++    int ret;
++
++    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
++    memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
++
++    // Only need to remember intra for CIP
++    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
++        s->is_intra = NULL;
++    else
++    {
++        s->is_intra = s->is_intra_store;
++        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    }
++
++    s->is_decoded = 0;
++    s->first_nal_type = s->nal_unit_type;
++
++    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
++
++    // Resize rpl_tab to max that we might want (one entry per NAL in the packet)
++    if (s->pkt.nb_nals > s->rpl_tab_size)
++    {
++        // In most cases it will be faster to free & realloc as that doesn't
++        // require (an unwanted) copy
++        av_freep(&s->rpl_tab);
++        s->rpl_tab_size = 0;
++        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) {
++            // ret has not been assigned on this path yet; set it explicitly
++            // so the fail path does not return an indeterminate value
++            ret = AVERROR(ENOMEM);
++            goto fail;
++        }
++        s->rpl_tab_size = s->pkt.nb_nals;
++    }
++    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
++
++    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
++    if (ret < 0)
++        goto fail;
++
++    ret = ff_hevc_rpi_frame_rps(s);
++    if (ret < 0) {
++        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
++        goto fail;
++    }
++
++    s->ref->frame->key_frame = IS_IRAP(s);
++
++    ret = set_side_data(s);
++    if (ret < 0)
++        goto fail;
++
++    s->frame->pict_type = 3 - s->sh.slice_type;
++
++    if (!IS_IRAP(s))
++        ff_hevc_rpi_bump_frame(s);
++
++    av_frame_unref(s->output_frame);
++    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
++    if (ret < 0)
++        goto fail;
++
++    ff_thread_finish_setup(s->avctx);
++
++    return 0;
++
++fail:
++    if (s->ref)
++        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++    s->ref = NULL;
++    return ret;
++}
++
++// Returns 1 for the sub-layer non-reference VCL NAL unit types of Table 7-1
++// (the even values 0, 2, 4, ... 14) and 0 for every other value.
++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
++{
++    const int in_low_range = nal_unit_type < 16;
++    const int is_even = (nal_unit_type & 1) == 0;
++
++    return in_low_range && is_even;
++}
++
++// Decode a single NAL unit.
++//
++// Parameter sets and SEI are parsed into s->ps / s->sei. VCL NAL units have
++// their slice header parsed, may start a new frame (first slice in picture)
++// and then have their slice data decoded. EOS/EOB NAL units bump seq_decode
++// and reset max_ra; AUD and filler data are ignored; any other type is
++// logged and skipped.
++//
++// Returns 0 on success or when an error was tolerated, otherwise a negative
++// AVERROR code. Errors that reach the fail label are only reported to the
++// caller when AV_EF_EXPLODE is set in err_recognition; slice header, frame
++// start and mismatched-NAL-type errors are always returned.
++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
++{
++    GetBitContext * const gb = &s->HEVClc->gb;
++    int ctb_addr_ts, ret;
++
++    *gb = nal->gb;
++    s->nal_unit_type = nal->type;
++    s->temporal_id = nal->temporal_id;
++
++    switch (s->nal_unit_type) {
++    case HEVC_NAL_VPS:
++        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SPS:
++        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
++                                         s->apply_defdispwin);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_PPS:
++        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SEI_PREFIX:
++    case HEVC_NAL_SEI_SUFFIX:
++        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_TRAIL_R:
++    case HEVC_NAL_TRAIL_N:
++    case HEVC_NAL_TSA_N:
++    case HEVC_NAL_TSA_R:
++    case HEVC_NAL_STSA_N:
++    case HEVC_NAL_STSA_R:
++    case HEVC_NAL_BLA_W_LP:
++    case HEVC_NAL_BLA_W_RADL:
++    case HEVC_NAL_BLA_N_LP:
++    case HEVC_NAL_IDR_W_RADL:
++    case HEVC_NAL_IDR_N_LP:
++    case HEVC_NAL_CRA_NUT:
++    case HEVC_NAL_RADL_N:
++    case HEVC_NAL_RADL_R:
++    case HEVC_NAL_RASL_N:
++    case HEVC_NAL_RASL_R:
++        ret = hls_slice_header(s);
++        if (ret < 0)
++            return ret;
++
++        // The definition of _N unit types is "non-reference for other frames
++        // with the same temporal_id" so they may/will be ref frames for pics
++        // with a higher temporal_id.
++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++            !is_non_ref_unit_type(s->nal_unit_type);
++        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
++        s->is_irap = IS_IRAP(s);
++
++#if DEBUG_DECODE_N
++        {
++            static int z = 0;
++            if (IS_IDR(s)) {
++                z = 1;
++            }
++            if (z != 0 && z++ > DEBUG_DECODE_N) {
++                s->is_decoded = 0;
++                break;
++            }
++        }
++#endif
++        // Honour the caller's skip_frame setting
++        if (
++            (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
++            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
++        {
++            s->is_decoded = 0;
++            break;
++        }
++
++        if (s->sh.first_slice_in_pic_flag) {
++            if (s->max_ra == INT_MAX) {
++                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
++                    s->max_ra = s->poc;
++                } else {
++                    if (IS_IDR(s))
++                        s->max_ra = INT_MIN;
++                }
++            }
++
++            // RASL pictures with poc <= max_ra are dropped
++            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
++                s->poc <= s->max_ra) {
++                s->is_decoded = 0;
++                break;
++            } else {
++                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
++                    s->max_ra = INT_MIN;
++            }
++
++            ret = hevc_frame_start(s);
++            if (ret < 0)
++                return ret;
++        } else if (!s->ref) {
++            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
++            // ret still holds the non-negative result of hls_slice_header()
++            // here, so give the fail path a real error code to report
++            ret = AVERROR_INVALIDDATA;
++            goto fail;
++        }
++
++        if (s->nal_unit_type != s->first_nal_type) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Non-matching NAL types of the VCL NALUs: %d %d\n",
++                   s->first_nal_type, s->nal_unit_type);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (!s->sh.dependent_slice_segment_flag &&
++            s->sh.slice_type != HEVC_SLICE_I) {
++            ret = ff_hevc_rpi_slice_rpl(s);
++            if (ret < 0) {
++                av_log(s->avctx, AV_LOG_WARNING,
++                       "Error constructing the reference lists for the current slice.\n");
++                goto fail;
++            }
++        }
++
++        ctb_addr_ts = hls_slice_data(s, nal);
++        // The picture is complete once the returned address reaches ctb_size
++        if (ctb_addr_ts >= s->ps.sps->ctb_size) {
++            s->is_decoded = 1;
++        }
++
++        if (ctb_addr_ts < 0) {
++            ret = ctb_addr_ts;
++            goto fail;
++        }
++        break;
++    case HEVC_NAL_EOS_NUT:
++    case HEVC_NAL_EOB_NUT:
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++        break;
++    case HEVC_NAL_AUD:
++    case HEVC_NAL_FD_NUT:
++        break;
++    default:
++        av_log(s->avctx, AV_LOG_INFO,
++               "Skipping NAL unit %d\n", s->nal_unit_type);
++    }
++
++    return 0;
++fail:
++    // Errors are only fatal when the caller asked for AV_EF_EXPLODE
++    if (s->avctx->err_recognition & AV_EF_EXPLODE)
++        return ret;
++    return 0;
++}
++
++// Split one input packet into NAL units and decode each of them in order.
++//
++// Before decoding, the NAL list is scanned for EOS/EOB units: those seen
++// before any other NAL type set s->last_eos, later ones set s->eos.
++// Whether decoding succeeded or not, if a current frame exists on exit it is
++// either signalled as fully done (reference frame with threading active) or
++// flushed to memory.
++//
++// Returns the result of the last step executed (negative AVERROR on failure).
++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
++{
++    int n;
++    int rv;
++    int only_eos_so_far = 1;
++
++    s->ref = NULL;
++    s->last_eos = s->eos;
++    s->eos = 0;
++
++    /* split the input packet into NAL units, so we know the upper bound on the
++     * number of slices in the frame */
++    rv = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
++                               s->nal_length_size, s->avctx->codec_id, 0, 0);
++    if (rv < 0) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Error splitting the input into NAL units.\n");
++        return rv;
++    }
++
++    /* classify any end-of-sequence / end-of-bitstream markers */
++    for (n = 0; n < s->pkt.nb_nals; n++) {
++        const int type = s->pkt.nals[n].type;
++        const int is_end_marker = (type == HEVC_NAL_EOB_NUT ||
++                                   type == HEVC_NAL_EOS_NUT);
++
++        if (!is_end_marker)
++            only_eos_so_far = 0;
++        else if (only_eos_so_far)
++            s->last_eos = 1;
++        else
++            s->eos = 1;
++    }
++
++    /* decode the NAL units, stopping at the first hard error */
++    for (n = 0; n < s->pkt.nb_nals; n++) {
++        rv = decode_nal_unit(s, &s->pkt.nals[n]);
++        if (rv < 0) {
++            av_log(s->avctx, AV_LOG_WARNING,
++                   "Error parsing NAL unit #%d.\n", n);
++            break;
++        }
++    }
++
++    /* common exit for success and failure */
++    if (s->ref == NULL)
++        return rv;
++
++    if (s->used_for_ref && s->threads_type != 0) {
++        ff_hevc_rpi_progress_signal_all_done(s);
++    } else {
++        // Flush frame to real memory as we expect to be able to pass
++        // it straight on to mmal
++        flush_frame(s, s->frame);
++    }
++    return rv;
++}
++
++// Log the 16 bytes of an MD5 digest as lower-case hex (two digits per byte,
++// no separators, no trailing newline) at the given log level.
++static void print_md5(void *log_ctx, int level, uint8_t md5[16])
++{
++    const uint8_t *p = md5;
++    const uint8_t * const end = md5 + 16;
++
++    while (p != end)
++        av_log(log_ctx, level, "%02"PRIx8, *p++);
++}
++
++// Compute an MD5 over each plane of frame and compare it with the per-plane
++// digest stored in s->sei.picture_hash.md5[].
++// Returns 0 if every plane matches, AVERROR_INVALIDDATA on the first
++// mismatch, AVERROR(EINVAL) if the pixel format has no descriptor and (big
++// endian builds only) AVERROR(ENOMEM) if the byte-swap buffer cannot be
++// allocated.
++static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
++{
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++ int pixel_shift;
++ int i, j;
++
++ if (!desc)
++ return AVERROR(EINVAL);
++
++ // 1 when samples are wider than 8 bits (row byte length is w << pixel_shift), else 0
++ pixel_shift = desc->comp[0].depth > 8;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
++ s->poc);
++
++ /* the checksums are LE, so we have to byteswap for >8bpp formats
++ * on BE arches */
++#if HAVE_BIGENDIAN
++ if (pixel_shift && !s->checksum_buf) {
++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
++ FFMAX3(frame->linesize[0], frame->linesize[1],
++ frame->linesize[2]));
++ if (!s->checksum_buf)
++ return AVERROR(ENOMEM);
++ }
++#endif
++
++ // One digest per plane; iteration stops at the first NULL data pointer
++ // NOTE(review): i also indexes s->sei.picture_hash.md5[] - confirm that array covers every plane a frame can carry
++ for (i = 0; frame->data[i]; i++) {
++ int width = s->avctx->coded_width;
++ int height = s->avctx->coded_height;
++ // Planes 1 and 2 use the chroma-subsampled dimensions
++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
++ uint8_t md5[16];
++
++ av_md5_init(s->md5_ctx);
++ for (j = 0; j < h; j++) {
++ // NOTE(review): the row stride comes from frame_stride1(frame, 1) for every plane - confirm this is intended for plane 0
++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
++#if HAVE_BIGENDIAN
++ if (pixel_shift) {
++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
++ (const uint16_t *) src, w);
++ src = s->checksum_buf;
++ }
++#endif
++ av_md5_update(s->md5_ctx, src, w << pixel_shift);
++ }
++ av_md5_final(s->md5_ctx, md5);
++
++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
++ print_md5(s->avctx, AV_LOG_DEBUG, md5);
++ av_log (s->avctx, AV_LOG_DEBUG, "; ");
++ } else {
++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
++ print_md5(s->avctx, AV_LOG_ERROR, md5);
++ av_log (s->avctx, AV_LOG_ERROR, " != ");
++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
++ av_log (s->avctx, AV_LOG_ERROR, "\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ av_log(s->avctx, AV_LOG_DEBUG, "\n");
++
++ return 0;
++}
++
++// Check every populated slot of s->ps.sps_list with is_sps_supported().
++// Returns 0 as soon as one SPS is rejected, 1 if none is (including when the
++// list is empty).
++static int all_sps_supported(const HEVCRpiContext * const s)
++{
++    unsigned int idx = 0;
++
++    while (idx < FF_ARRAY_ELEMS(s->ps.sps_list)) {
++        const AVBufferRef * const buf = s->ps.sps_list[idx++];
++
++        if (buf == NULL)
++            continue;
++        if (!is_sps_supported((const HEVCRpiSPS *)buf->data))
++            return 0;
++    }
++    return 1;
++}
++
++// Parse codec extradata into s->ps / s->sei and pick up the NAL framing
++// (is_nalff, nal_length_size).
++// When first is non-zero the stream parameters of the first SPS found in
++// sps_list are exported to the codec context.
++// Returns 0 on success or the negative error from the extradata parser.
++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
++{
++    int slot;
++    const int rv = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
++                                                &s->nal_length_size, s->avctx->err_recognition,
++                                                s->apply_defdispwin, s->avctx);
++
++    if (rv < 0)
++        return rv;
++
++    // Nothing to export unless this is the initial extradata
++    if (!first)
++        return 0;
++
++    /* export stream parameters from the first SPS */
++    for (slot = 0; slot < FF_ARRAY_ELEMS(s->ps.sps_list); slot++) {
++        if (s->ps.sps_list[slot] == NULL)
++            continue;
++        export_stream_params(s->avctx, &s->ps,
++                             (const HEVCRpiSPS*)s->ps.sps_list[slot]->data);
++        break;
++    }
++
++    return 0;
++}
++
++// Codec decode callback.
++// An empty packet requests a frame via ff_hevc_rpi_output_frame(s, data, 1).
++// Otherwise any new extradata attached to the packet is parsed, the
++// packet's NAL units are decoded, the SEI MD5 is optionally verified and, if
++// s->output_frame holds a picture, it is moved into data.
++// Returns the packet size consumed on success (0 for an empty packet) or a
++// negative AVERROR code; *got_output is set when a frame is returned.
++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
++ AVPacket *avpkt)
++{
++ int ret;
++ int new_extradata_size;
++ uint8_t *new_extradata;
++ HEVCRpiContext *s = avctx->priv_data;
++
++ // Empty packet: no decoding, just ask for a pending frame
++ if (!avpkt->size) {
++ ret = ff_hevc_rpi_output_frame(s, data, 1);
++ if (ret < 0)
++ return ret;
++
++ *got_output = ret;
++ return 0;
++ }
++
++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
++ &new_extradata_size);
++ if (new_extradata && new_extradata_size > 0) {
++ // first = 0: stream params are not re-exported for mid-stream extradata
++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ s->ref = NULL;
++ ret = decode_nal_units(s, avpkt->data, avpkt->size);
++ if (ret < 0)
++ return ret;
++
++ /* verify the SEI checksum */
++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
++ s->sei.picture_hash.is_md5) {
++ ret = verify_md5(s, s->ref->frame);
++ // A mismatch is only fatal when AV_EF_EXPLODE is requested
++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ return ret;
++ }
++ }
++ // The hash flag is cleared after every packet, whether or not it was checked
++ s->sei.picture_hash.is_md5 = 0;
++
++ if (s->is_decoded) {
++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
++ s->is_decoded = 0;
++ }
++
++ // output_frame is filled in hevc_frame_start(); hand it to the caller if set
++ if (s->output_frame->buf[0]) {
++ av_frame_move_ref(data, s->output_frame);
++ *got_output = 1;
++ }
++
++ return avpkt->size;
++}
++
++// Make dst a new reference to the frame held by src: references the thread
++// frame and (if present) the collocated MV buffer, then copies col_mvf, poc,
++// flags and sequence.
++// Returns 0 on success, the error from ff_thread_ref_frame(), or
++// AVERROR(ENOMEM) if the MV buffer cannot be referenced (dst is then
++// unreferenced again).
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
++{
++    const int rv = ff_thread_ref_frame(&dst->tf, &src->tf);
++
++    if (rv < 0)
++        return rv;
++
++    if (src->col_mvf_buf != NULL) {
++        dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
++        if (dst->col_mvf_buf == NULL) {
++            // Undo the thread-frame reference taken above
++            ff_hevc_rpi_unref_frame(s, dst, ~0);
++            return AVERROR(ENOMEM);
++        }
++    }
++
++    dst->col_mvf  = src->col_mvf;
++    dst->sequence = src->sequence;
++    dst->flags    = src->flags;
++    dst->poc      = src->poc;
++    return 0;
++}
++
++
++// Codec close callback: releases what the decoder context owns (picture
++// arrays, MD5 and CABAC save state, worker/bit threads, SAO buffers, DPB
++// frames, parameter set lists, local contexts, packet splitter state and
++// the VPU/QPU if it was initialised).
++// Also used as the cleanup path of hevc_init_context() and
++// hevc_decode_init() when they fail part-way. Always returns 0.
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{
++ HEVCRpiContext * const s = avctx->priv_data;
++ int i;
++
++ pic_arrays_free(s);
++
++ av_freep(&s->md5_ctx);
++
++ av_freep(&s->cabac_save);
++
++#if RPI_EXTRA_BIT_THREADS
++ bit_threads_kill(s);
++#endif
++
++ hevc_exit_worker(s);
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++ }
++ // NOTE(review): s->HEVClc is NULL here if hevc_init_context() failed on
++ // its first allocation - confirm job_lc_kill() tolerates a NULL argument
++ job_lc_kill(s->HEVClc);
++
++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
++ av_freep(&s->sao_pixel_buffer_v[0]);
++ av_frame_free(&s->output_frame);
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ av_frame_free(&s->DPB[i].frame);
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++ av_buffer_unref(&s->ps.vps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++ av_buffer_unref(&s->ps.sps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++ av_buffer_unref(&s->ps.pps_list[i]);
++ // The active-set pointers are cleared along with the lists they were taken from
++ s->ps.sps = NULL;
++ s->ps.pps = NULL;
++ s->ps.vps = NULL;
++
++ // Free separately from sLists as used that way by RPI WPP
++ // The list is walked up to the first NULL entry (or MAX_NB_THREADS)
++ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++ av_freep(s->HEVClcList + i);
++ }
++ s->HEVClc = NULL; // Allocated as part of HEVClcList
++
++ ff_h2645_packet_uninit(&s->pkt);
++
++ // Only terminate the VPU/QPU if hevc_init_context() got as far as initialising it
++ if (s->qpu_init_ok)
++ vpu_qpu_term();
++ s->qpu_init_ok = 0;
++
++ return 0;
++}
++
++
++// Per-context initialisation: allocates the first local context, initialises
++// the VPU/QPU, progress states, CABAC save area, output frame, DPB frames
++// and MD5 context, then resets the SEI state.
++// Returns 0 on success. On any failure (including vpu_qpu_init() failing) it
++// logs, calls hevc_decode_free() and returns AVERROR(ENOMEM).
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int i;
++
++ s->avctx = avctx;
++
++ // Local context 0 doubles as the first HEVClcList entry
++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++ if (!s->HEVClc)
++ goto fail;
++ s->HEVClcList[0] = s->HEVClc;
++
++ if (vpu_qpu_init() != 0)
++ goto fail;
++ // Remembered so that hevc_decode_free() knows to call vpu_qpu_term()
++ s->qpu_init_ok = 1;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ {
++ static const uint32_t dframe[1] = {0x80808080};
++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++ }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ s->qpu_dummy_frame_qpu = qpu_dummy();
++#endif
++
++ bt_lc_init(s, s->HEVClc, 0);
++ job_lc_init(s->HEVClc);
++
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_init_state(s->progress_states + i);
++ }
++
++ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
++ goto fail;
++
++ if ((s->output_frame = av_frame_alloc()) == NULL)
++ goto fail;
++
++ // Each DPB slot gets its own AVFrame, shared with its ThreadFrame, and
++ // records its own index in dpb_no
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ s->DPB[i].frame = av_frame_alloc();
++ if (!s->DPB[i].frame)
++ goto fail;
++ s->DPB[i].tf.f = s->DPB[i].frame;
++ s->DPB[i].dpb_no = i;
++ }
++
++ s->max_ra = INT_MAX;
++
++ if ((s->md5_ctx = av_md5_alloc()) == NULL)
++ goto fail;
++
++ s->context_initialized = 1;
++ s->eos = 0;
++
++ ff_hevc_rpi_reset_sei(&s->sei);
++
++ return 0;
++
++fail:
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
++ hevc_decode_free(avctx);
++ return AVERROR(ENOMEM);
++}
++
++#if HAVE_THREADS
++// Frame-threading callback: copy decoder state from the source context
++// (src) into the destination context (dst).
++//
++// Re-references every DPB frame and parameter set buffer held by src,
++// re-runs set_sps() if the active SPS differs, copies sequence/EOS state,
++// NAL framing, threading type and selected SEI state and, on first use,
++// creates this context's job control from the source's global job pool and
++// starts its worker.
++//
++// Returns 0 on success or a negative AVERROR code.
++static int hevc_update_thread_context(AVCodecContext *dst,
++                                      const AVCodecContext *src)
++{
++    HEVCRpiContext *s = dst->priv_data;
++    HEVCRpiContext *s0 = src->priv_data;
++    int i, ret;
++
++    av_assert0(s->context_initialized);
++
++    // dst == src can happen according to the comments and in that case
++    // there is nothing to do here
++    if (dst == src)
++        return 0;
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        if (s0->DPB[i].frame->buf[0]) {
++            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    // Drop our active SPS pointer before its backing buffer may be released
++    if (s->ps.sps != s0->ps.sps)
++        s->ps.sps = NULL;
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++        av_buffer_unref(&s->ps.vps_list[i]);
++        if (s0->ps.vps_list[i]) {
++            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++            if (!s->ps.vps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++        av_buffer_unref(&s->ps.sps_list[i]);
++        if (s0->ps.sps_list[i]) {
++            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++            if (!s->ps.sps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++        av_buffer_unref(&s->ps.pps_list[i]);
++        if (s0->ps.pps_list[i]) {
++            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++            if (!s->ps.pps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++            return ret;
++
++    s->seq_decode = s0->seq_decode;
++    s->seq_output = s0->seq_output;
++    s->pocTid0    = s0->pocTid0;
++    s->max_ra     = s0->max_ra;
++    s->eos        = s0->eos;
++    s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++    s->is_nalff        = s0->is_nalff;
++    s->nal_length_size = s0->nal_length_size;
++
++    s->threads_type    = s0->threads_type;
++
++    // The source saw an end of sequence: start a new decode sequence here
++    if (s0->eos) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++    }
++
++    s->sei.frame_packing        = s0->sei.frame_packing;
++    s->sei.display_orientation  = s0->sei.display_orientation;
++    s->sei.mastering_display    = s0->sei.mastering_display;
++    s->sei.content_light        = s0->sei.content_light;
++    s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++    // * We do this here as it allows us to easily locate our parents
++    //   global job pool, but there really should be a less nasty way
++    if (s->jbc == NULL)
++    {
++        // Report allocation failure to the caller rather than aborting the
++        // whole process from inside an assert
++        if ((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) == NULL)
++            return AVERROR(ENOMEM);
++        hevc_init_worker(s);
++    }
++
++    return 0;
++}
++#endif
++
++#include <sys/stat.h>
++// Returns non-zero when "/dev/rpivid-intcmem" cannot be stat()ed, zero when
++// it can. The probe runs once; later calls return the cached answer.
++static int qpu_ok(void)
++{
++    static int cached = -1;
++    struct stat st;
++
++    if (cached != -1)
++        return cached;
++
++    cached = (stat("/dev/rpivid-intcmem", &st) != 0);
++    return cached;
++}
++
++// Codec init callback.
++// Refuses to open (AVERROR_DECODER_NOT_FOUND) when qpu_ok() is false or when
++// the extradata contains an SPS that is_sps_supported() rejects.
++// For the primary context it also creates the global job pool and job
++// control, starts the worker, parses any extradata and records whether
++// frame threading is in use.
++// Returns 0 on success or a negative AVERROR code; failures after
++// hevc_init_context() succeeded clean up through hevc_decode_free().
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ if (!qpu_ok())
++ return AVERROR_DECODER_NOT_FOUND;
++
++ // hevc_init_context() cleans up after itself on failure, so just return
++ if ((ret = hevc_init_context(avctx)) < 0)
++ return ret;
++
++ // If we are a child context then stop now
++ // Everything after this point is either 1st decode setup or global alloc
++ // that must not be repeated
++ // Global info will be copied into children in update_thread_context (we
++ // can't do it here as we have no way of finding the parent context)
++ if (avctx->internal->is_copy)
++ return 0;
++
++ // Job allocation requires VCSM alloc to work so ensure that we have it
++ // initialised by this point
++ {
++ // Pool size is three jobs per thread, with a minimum of five
++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++ if (jbg == NULL) {
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++
++ // NOTE(review): if this fails nothing visible here still refers to
++ // jbg when we jump to fail - possible leak, confirm against jbg_new()'s ownership rules
++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) {
++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++ }
++
++ hevc_init_worker(s);
++
++ s->eos = 1;
++
++ if (avctx->extradata_size > 0 && avctx->extradata) {
++ // first = 1: export stream parameters from the first SPS found
++ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0)
++ goto fail;
++
++ if (!all_sps_supported(s)) {
++ ret = AVERROR_DECODER_NOT_FOUND;
++ goto fail;
++ }
++ }
++
++ // threads_type is FF_THREAD_FRAME only when frame threading is active with more than one thread
++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++ s->threads_type = FF_THREAD_FRAME;
++ else
++ s->threads_type = 0;
++
++ return 0;
++
++fail:
++ hevc_decode_free(avctx);
++ return ret;
++}
++
++// Codec flush callback: flushes the DPB, then marks the stream as being at
++// end-of-sequence (eos = 1) and resets max_ra to INT_MAX.
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++    HEVCRpiContext * const ctx = avctx->priv_data;
++
++    ff_hevc_rpi_flush_dpb(ctx);
++    ctx->eos = 1;
++    ctx->max_ra = INT_MAX;
++}
++
++// Private data of the "Pi3 QPU Hwaccel" (reached through
++// avctx->internal->hwaccel_priv_data).
++typedef struct hwaccel_rpi3_qpu_env_s {
++ const AVClass *av_class;
++ // Zero-copy environment; allocated in hwaccel_rpi3_qpu_init() and
++ // released in hwaccel_rpi3_qpu_free()
++ AVZcEnvPtr zc;
++} hwaccel_rpi3_qpu_env_t;
++
++// hwaccel alloc_frame callback.
++// If av_rpi_zc_in_use() reports true the buffer comes from the context's
++// get_buffer2(); otherwise it comes from the hwaccel's own zero-copy
++// environment and is resolved immediately. On success decode data is
++// attached to the frame; if that fails the frame is unreferenced.
++// Returns 0 on success or the first non-zero result encountered.
++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
++{
++    hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
++    int rv;
++
++    if (!av_rpi_zc_in_use(s)) {
++        rv = av_rpi_zc_get_buffer(r3->zc, frame);
++        if (rv == 0)
++            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
++    } else {
++        rv = s->get_buffer2(s, frame, 0);
++    }
++
++    // Buffer allocation failed: nothing to attach or undo
++    if (rv != 0)
++        return rv;
++
++    rv = ff_attach_decode_data(frame);
++    if (rv < 0)
++        av_frame_unref(frame);
++
++    return rv;
++}
++
++// hwaccel uninit callback: releases the zero-copy environment held in the
++// hwaccel private data. Always returns 0.
++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++
++    av_rpi_zc_int_env_freep(&env->zc);
++
++    return 0;
++}
++
++// hwaccel init callback: allocates the zero-copy environment.
++// Returns 0 on success; on failure logs, runs hwaccel_rpi3_qpu_free() and
++// returns AVERROR(ENOMEM).
++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++
++    env->zc = av_rpi_zc_int_env_alloc(avctx);
++    if (env->zc != NULL)
++        return 0;
++
++    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
++    hwaccel_rpi3_qpu_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++
++// Byte offset of a field inside HEVCRpiContext, for the AVOption table
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++// Flags shared by all options below: video decoding parameters
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++// User-settable decoder options.
++// Both entries store into the same field (apply_defdispwin), so
++// "strict-displaywin" acts as a second name for "apply_defdispwin".
++static const AVOption options[] = {
++    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++                        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { "strict-displaywin", "strictly apply default display window size", OFFSET(apply_defdispwin),
++                        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { NULL },
++};
++
++// AVClass for the decoder's private context; exposes the options[] table
++static const AVClass hevc_rpi_decoder_class = {
++ .class_name = "HEVC RPI decoder",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++};
++
++// Pixel formats advertised by the decoder (AV_PIX_FMT_NONE terminated).
++// Matches the two formats that have a hw config entry in this file.
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++ AV_PIX_FMT_SAND128,
++ AV_PIX_FMT_SAND64_10,
++ AV_PIX_FMT_NONE
++};
++
++
++// hwaccel descriptor shared by both hw configs; it supplies frame
++// allocation plus init/uninit of the zero-copy environment only
++static const AVHWAccel hwaccel_rpi3_qpu = {
++ .name = "Pi3 QPU Hwaccel",
++ .alloc_frame = hwaccel_alloc_frame,
++ .init = hwaccel_rpi3_qpu_init,
++ .uninit = hwaccel_rpi3_qpu_free,
++ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
++// HW config for AV_PIX_FMT_SAND128: ad-hoc method, no device type, served by
++// hwaccel_rpi3_qpu
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
++{
++ .public = {
++ .pix_fmt = AV_PIX_FMT_SAND128,
++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++ .device_type = AV_HWDEVICE_TYPE_NONE,
++ },
++ .hwaccel = &hwaccel_rpi3_qpu
++};
++// HW config for AV_PIX_FMT_SAND64_10: identical apart from the pixel format
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
++{
++ .public = {
++ .pix_fmt = AV_PIX_FMT_SAND64_10,
++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++ .device_type = AV_HWDEVICE_TYPE_NONE,
++ },
++ .hwaccel = &hwaccel_rpi3_qpu
++};
++
++
++// NULL-terminated list of hw configs referenced by ff_hevc_rpi_decoder
++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
++ &hevc_rpi_hw_config_sand128,
++ &hevc_rpi_hw_config_sand64_10,
++ NULL
++};
++
++
++// Codec registration entry for the "hevc_rpi" decoder: ties together the
++// init/close/decode/flush callbacks, the option class, the advertised pixel
++// formats and the hw configs defined above.
++AVCodec ff_hevc_rpi_decoder = {
++ .name = "hevc_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .priv_data_size = sizeof(HEVCRpiContext),
++ .priv_class = &hevc_rpi_decoder_class,
++ .init = hevc_decode_init,
++ .close = hevc_decode_free,
++ .decode = hevc_rpi_decode_frame,
++ .flush = hevc_decode_flush,
++ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++ AV_CODEC_CAP_HARDWARE |
++ AV_CODEC_CAP_AVOID_PROBING |
++ // Flip the #if below to 1 to build without frame threading (emits a
++ // compile-time warning as a reminder)
++#if 0
++ // Debugging is often easier without threads getting in the way
++ 0,
++#warning H265 threading turned off
++#else
++ // We only have decent optimisation for frame - so only admit to that
++ AV_CODEC_CAP_FRAME_THREADS,
++#endif
++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE |
++ FF_CODEC_CAP_EXPORTS_CROPPING |
++ FF_CODEC_CAP_ALLOCATE_PROGRESS,
++ .pix_fmts = hevc_rpi_pix_fmts,
++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++ .hw_configs = hevc_rpi_hw_configs,
++// .wrapper_name = "hevc_rpi",
++};
++
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.h
+@@ -0,0 +1,1091 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_mv.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_misc_neon.h"
++#endif
++
++// Upper bound on the number of HEVClcList entries walked when freeing the
++// local contexts
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS 5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS 32
++
++// Presumably indices of the two reference picture lists - TODO confirm at the use sites
++#define L0 0
++#define L1 1
++
++// In each group below EXTRA equals EXTRA_BEFORE + EXTRA_AFTER
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER 2
++#define EPEL_EXTRA 3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER 4
++#define QPEL_EXTRA 7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// ---- Compile-time tuning knobs for the RPI decoder ----
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS 8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++// (the decoder only calls bit_threads_kill() on close when this is non-zero)
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS 2
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES 2
++#define RPI_PASSES 3
++
++// Print out various usage stats
++#define RPI_TSTATS 0
++
++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
++#define RPI_COMPRESS_COEFFS 1
++
++// Wait for VPU/QPU to finish in worker pass 0
++// If 0 then the wait is in pass 1
++//
++// One might expect the better place to wait would be in pass 1 however
++// testing shows that pass 0 produces overall faster decode.
++// Interestingly it is QPU/VPU limited streams that seem to suffer
++// from pass 1 waits, CPU limited ones tend to show a very mild gain.
++// This define exists so it is easy to test this.
++#define RPI_WORKER_WAIT_PASS_0 1
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++// If either is non-zero the context gets a static dummy frame for the
++// emulation; unless both are non-zero it also gets one from qpu_dummy()
++#define RPI_QPU_EMU_Y 0
++#define RPI_QPU_EMU_C 0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH 2048
++#define HEVC_RPI_MAX_HEIGHT 1088
++
++
++// Number of CTBs in a max-size frame at the min CTB size of 16.
++// The whole expression is parenthesised so the macro is safe to use inside
++// larger expressions (e.g. as a divisor).
++#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab.
++ *
++ * NOTE(review): SAMPLE expands to s->sps->width, whereas the decoder code
++ * reaches the SPS through s->ps.sps - confirm this macro is still usable.
++ * SAMPLE_CTB relies on a variable named min_cb_width being in scope at the
++ * point of use.
++ */
++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
++
++// NAL unit type classification of the current NAL held in (s)->nal_unit_type
++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
++// True for NAL unit types in the numeric range 16..23 inclusive
++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
++
++// Reference picture set categories, numbered from 0.
++// NB_RPS_TYPE is not a category: it is the count of the entries above (5).
++enum RPSType {
++ ST_CURR_BEF = 0,
++ ST_CURR_AFT,
++ ST_FOLL,
++ LT_CURR,
++ LT_FOLL,
++ NB_RPS_TYPE,
++};
++
++// Identifiers for bitstream syntax elements, numbered consecutively from
++// SAO_MERGE_FLAG = 0.
++// NOTE(review): presumably used to index per-element CABAC context tables -
++// confirm at the use sites before reordering or inserting entries.
++enum SyntaxElement {
++ SAO_MERGE_FLAG = 0,
++ SAO_TYPE_IDX,
++ SAO_EO_CLASS,
++ SAO_BAND_POSITION,
++ SAO_OFFSET_ABS,
++ SAO_OFFSET_SIGN,
++ END_OF_SLICE_FLAG,
++ SPLIT_CODING_UNIT_FLAG,
++ CU_TRANSQUANT_BYPASS_FLAG,
++ SKIP_FLAG,
++ CU_QP_DELTA,
++ PRED_MODE_FLAG,
++ PART_MODE,
++ PCM_FLAG,
++ PREV_INTRA_LUMA_PRED_FLAG,
++ MPM_IDX,
++ REM_INTRA_LUMA_PRED_MODE,
++ INTRA_CHROMA_PRED_MODE,
++ MERGE_FLAG,
++ MERGE_IDX,
++ INTER_PRED_IDC,
++ REF_IDX_L0,
++ REF_IDX_L1,
++ ABS_MVD_GREATER0_FLAG,
++ ABS_MVD_GREATER1_FLAG,
++ ABS_MVD_MINUS2,
++ MVD_SIGN_FLAG,
++ MVP_LX_FLAG,
++ NO_RESIDUAL_DATA_FLAG,
++ SPLIT_TRANSFORM_FLAG,
++ CBF_LUMA,
++ CBF_CB_CR,
++ TRANSFORM_SKIP_FLAG,
++ EXPLICIT_RDPCM_FLAG,
++ EXPLICIT_RDPCM_DIR_FLAG,
++ LAST_SIGNIFICANT_COEFF_X_PREFIX,
++ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
++ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
++ SIGNIFICANT_COEFF_GROUP_FLAG,
++ SIGNIFICANT_COEFF_FLAG,
++ COEFF_ABS_LEVEL_GREATER1_FLAG,
++ COEFF_ABS_LEVEL_GREATER2_FLAG,
++ COEFF_ABS_LEVEL_REMAINING,
++ COEFF_SIGN_FLAG,
++ LOG2_RES_SCALE_ABS,
++ RES_SCALE_SIGN_FLAG,
++ CU_CHROMA_QP_OFFSET_FLAG,
++ CU_CHROMA_QP_OFFSET_IDX,
++};
++
++// Partition mode stored in RpiCodingUnit.part_mode.
++// Values are fixed explicitly (0..7) rather than left to the compiler.
++enum PartMode {
++ PART_2Nx2N = 0,
++ PART_2NxN = 1,
++ PART_Nx2N = 2,
++ PART_NxN = 3,
++ PART_2NxnU = 4,
++ PART_2NxnD = 5,
++ PART_nLx2N = 6,
++ PART_nRx2N = 7,
++};
++
++// Prediction mode stored in RpiCodingUnit.pred_mode
++enum PredMode {
++ MODE_INTER = 0,
++ MODE_INTRA,
++ MODE_SKIP,
++};
++
++// Inter prediction direction: L0 only, L1 only, or both (values 0, 1, 2)
++enum InterPredIdc {
++ PRED_L0 = 0,
++ PRED_L1,
++ PRED_BI,
++};
++
++// Prediction flags. Numerically PF_L0 = 1 and PF_L1 = 2, so
++// PF_BI (3) == PF_L0 | PF_L1 and PF_INTRA (0) has neither bit set.
++enum PredFlag {
++ PF_INTRA = 0,
++ PF_L0,
++ PF_L1,
++ PF_BI,
++};
++
++// Sample adaptive offset type (values 0..3)
++enum SAOType {
++ SAO_NOT_APPLIED = 0,
++ SAO_BAND,
++ SAO_EDGE,
++ SAO_APPLIED
++};
++
++// Sample adaptive offset edge-offset class: horizontal, vertical and the two
++// diagonals (values 0..3)
++enum SAOEOClass {
++ SAO_EO_HORIZ = 0,
++ SAO_EO_VERT,
++ SAO_EO_135D,
++ SAO_EO_45D,
++};
++
++// Coefficient scan order: diagonal, horizontal or vertical (values 0..2)
++enum ScanType {
++ SCAN_DIAG = 0,
++ SCAN_HORIZ,
++ SCAN_VERT,
++};
++
++// One reference picture list: three parallel arrays with HEVC_MAX_REFS
++// entries each, plus a count.
++typedef struct RefPicList {
++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
++ int list[HEVC_MAX_REFS];
++ uint8_t isLongTerm[HEVC_MAX_REFS];
++ // presumably the number of valid entries in the arrays above - TODO confirm
++ int nb_refs;
++} RefPicList;
++
++// A pair of reference picture lists (presumably indexed by L0 / L1 - TODO
++// confirm). The decoder keeps an array of these in rpl_tab, sized to at
++// least one entry per NAL unit in the packet.
++typedef struct RefPicListTab {
++ RefPicList refPicList[2];
++} RefPicListTab;
++
++// Per coding-unit state: position/split coordinates, prediction and
++// partition modes, and values inferred while parsing.
++typedef struct RpiCodingUnit {
++ unsigned int x; // Passed to deblock
++ unsigned int y;
++ unsigned int x_split;
++ unsigned int y_split;
++
++ enum PredMode pred_mode; ///< PredMode
++ enum PartMode part_mode; ///< PartMode
++
++ // Inferred parameters
++ uint8_t intra_split_flag; ///< IntraSplitFlag
++ uint8_t max_trafo_depth; ///< MaxTrafoDepth
++ uint8_t cu_transquant_bypass_flag;
++} RpiCodingUnit;
++
++// Per prediction-unit state: four-entry arrays of luma/chroma intra
++// prediction modes (presumably one per sub-partition - TODO confirm) and the
++// merge flag.
++typedef struct RpiPredictionUnit {
++ uint8_t intra_pred_mode[4];
++ uint8_t intra_pred_mode_c[4];
++ uint8_t chroma_mode_c[4];
++ uint8_t merge_flag;
++} RpiPredictionUnit;
++
++typedef struct HEVCRpiTransformUnit { // Transform-unit state; embedded as HEVCRpiLocalContext.tu
++ int8_t cu_qp_delta;
++
++ // Inferred parameters
++ uint8_t intra_pred_mode;
++ uint8_t intra_pred_mode_c;
++ uint8_t chroma_mode_c;
++ uint8_t is_cu_qp_delta_wanted;
++ uint8_t cu_chroma_qp_offset_wanted;
++ const int8_t * qp_divmod6[3]; // 3 table pointers - presumably one per colour component; confirm
++} HEVCRpiTransformUnit;
++
++typedef struct DBParams { // Deblocking offsets; HEVCRpiContext.deblock points at these
++ int8_t beta_offset; // -12 to +12
++ int8_t tc_offset; // -12 to +12
++} DBParams;
++
++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) // Bit flags combined into HEVCRpiFrame.flags
++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
++#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
++
++struct HEVCRpiJob;
++
++typedef struct HEVCRpiFrame { // One decoded-picture-buffer entry (HEVCRpiContext.DPB[] is an array of these)
++ AVFrame *frame;
++ ThreadFrame tf; // tf.progress->data is accessed as int[2] by ff_hevc_rpi_progress_set_all_done()
++ ColMvField *col_mvf; // presumably points into col_mvf_buf - confirm
++ int poc; // Picture order count
++ struct HEVCRpiFrame *collocated_ref;
++
++ AVBufferRef *col_mvf_buf;
++
++ /**
++ * A sequence counter, so that old frames are output first
++ * after a POC reset
++ */
++ uint16_t sequence;
++
++ /**
++ * A combination of HEVC_FRAME_FLAG_*
++ */
++ uint8_t flags;
++
++ // Entry no in DPB - can be used as a small unique
++ // frame identifier (within the current thread); indexes HEVCRpiJob.progress_req[]
++ uint8_t dpb_no;
++} HEVCRpiFrame;
++
++typedef struct HEVCRpiLocalContext { // "lc": local decode state; HEVCRpiContext.HEVClcList[] holds pointers to these
++ HEVCRpiTransformUnit tu;
++
++ CABACContext cc;
++
++ // Vars that allow us to locate everything from just an lc
++ struct HEVCRpiContext * context; // ??? make const ???
++ unsigned int lc_n; // lc list el no
++
++ // Job wait links
++ struct HEVCRpiLocalContext * jw_next;
++ struct HEVCRpiLocalContext * jw_prev;
++ struct HEVCRpiLocalContext * ljw_next;
++ struct HEVCRpiLocalContext * ljw_prev;
++ struct HEVCRpiJob * volatile jw_job;
++ sem_t jw_sem;
++
++ // ?? Wrap in structure ??
++ sem_t bt_sem_in;
++ sem_t * bt_psem_out;
++ volatile int bt_terminate;
++ unsigned int ts;
++ unsigned int bt_last_line; // Last line in this bit_thread chunk
++ unsigned int bt_line_no;
++ unsigned int bt_line_width;
++ unsigned int bt_line_inc;
++
++ struct HEVCRpiJob * jb0;
++ char unit_done; // Set once we have dealt with this slice
++ char bt_is_tile;
++ char last_progress_good;
++ char cabac_init_req;
++
++ uint8_t cabac_state[HEVC_CONTEXTS]; // Same shape as HEVCRpiCabacState.state
++ uint8_t stat_coeff[4]; // Same shape as HEVCRpiCabacState.rice - presumably saved/restored together; confirm
++ GetBitContext gb;
++
++ uint8_t ct_depth;
++ int8_t qp_y;
++ int8_t curr_qp_y;
++ int8_t qPy_pred;
++
++// N.B. Used by asm (neon) - do not change
++#define AVAIL_S_UR 0
++#define AVAIL_S_U 1
++#define AVAIL_S_UL 2
++#define AVAIL_S_L 3
++#define AVAIL_S_DL 4
++
++#define AVAIL_U (1 << AVAIL_S_U)
++#define AVAIL_L (1 << AVAIL_S_L)
++#define AVAIL_UL (1 << AVAIL_S_UL)
++#define AVAIL_UR (1 << AVAIL_S_UR)
++#define AVAIL_DL (1 << AVAIL_S_DL)
++
++// Intra filters - same number space as avail
++#define FILTER_LIGHT 0x40
++#define FILTER_STRONG 0x80
++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG)
++
++ uint8_t ctb_avail; // presumably a mask of the AVAIL_* bits above - confirm
++ int end_of_ctb_x;
++ int end_of_ctb_y;
++
++ RpiCodingUnit cu;
++ RpiPredictionUnit pu;
++
++#define BOUNDARY_LEFT_SLICE (1 << 0)
++#define BOUNDARY_LEFT_TILE (1 << 1)
++#define BOUNDARY_UPPER_SLICE (1 << 2)
++#define BOUNDARY_UPPER_TILE (1 << 3)
++ /* properties of the boundary of the current CTB for the purposes
++ * of the deblocking filter (a mask of the BOUNDARY_* bits above) */
++ unsigned int boundary_flags;
++
++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
++ uint8_t ipm_left[IPM_TAB_SIZE];
++ uint8_t ipm_up[IPM_TAB_SIZE];
++
++//#define MVF_STASH_WIDTH 128
++#define MVF_STASH_WIDTH 64
++#define MVF_STASH_HEIGHT 64
++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
++ HEVCRpiMvField mvf_ul[1]; // Returned by mvf_ptr() when x and y are both before the current CTB
++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; // Indexed y_pu * MVF_STASH_WIDTH_PU + x_pu (see mvf_stash_ptr)
++
++ /* +7 is for subpixel interpolation, *2 for high bit depths */
++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ /* The extended size between the new edge emu buffer is abused by SAO */
++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
++} HEVCRpiLocalContext;
++
++// Each block can have an intra prediction and an add_residual command
++// noof-cmds(2) * (max-ctu height(64) / min-transform(4)) * planes * (MAX_WIDTH / min-transform(4))
++
++// Sand only has 2 planes (Y/C), so the planes factor below is 2 rather than 3
++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
++
++// Command for intra prediction and transform_add of predictions to coefficients
++enum rpi_pred_cmd_e
++{
++ RPI_PRED_ADD_RESIDUAL,
++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + 1 (presumably + c_idx with U = 1 - confirm)
++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + 2 (presumably + c_idx with V = 2 - confirm)
++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
++ RPI_PRED_ADD_DC,
++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
++ RPI_PRED_ADD_DC_V,
++ RPI_PRED_INTRA,
++ RPI_PRED_INTRA_C,
++ RPI_PRED_I_PCM,
++ RPI_PRED_CMD_MAX // Number of commands; keep last
++};
++
++typedef struct HEVCPredCmd { // One queued prediction command (HEVCRpiIntraPredEnv.cmds is an array of these)
++ uint8_t type; // presumably one of enum rpi_pred_cmd_e, selecting the union member - confirm
++ uint8_t size; // log2 "size" used by all variants
++ uint8_t avail; // i_pred - but left here as they pack well
++ uint8_t dummy; // presumably padding only - confirm
++ union {
++ struct { // TRANSFORM_ADD
++ uint8_t * dst;
++ const int16_t * buf;
++ uint16_t stride; // Should be good enough for all pic fmts we use
++ int16_t dc;
++ } ta;
++ struct { // presumably the RPI_PRED_ADD_DC* commands - confirm
++ uint8_t * dst;
++ uint32_t stride;
++ int dc;
++ } dc;
++ struct { // INTRA
++ uint16_t x;
++ uint16_t y;
++ enum IntraPredMode mode;
++ } i_pred;
++ struct { // I_PCM
++ uint16_t x;
++ uint16_t y;
++ const void * src;
++ uint32_t src_len;
++ } i_pcm;
++ };
++} HEVCPredCmd;
++
++union qpu_mc_pred_cmd_u; // Forward declarations of QPU command types used only via pointers (tag matches HEVCRpiInterPredQ)
++struct qpu_mc_pred_y_p_s;
++struct qpu_mc_src_s;
++
++typedef struct HEVCRpiInterPredQ // One inter-prediction command queue (HEVCRpiInterPredEnv.q points at an array of these)
++{
++ union qpu_mc_pred_cmd_u *qpu_mc_base; // presumably the start of the queue's command buffer - confirm
++ union qpu_mc_pred_cmd_u *qpu_mc_curr; // presumably the next free command slot - confirm
++ struct qpu_mc_src_s *last_l0;
++ struct qpu_mc_src_s *last_l1;
++ unsigned int load;
++ uint32_t code_setup;
++ uint32_t code_sync;
++ uint32_t code_exit;
++} HEVCRpiInterPredQ;
++
++typedef struct HEVCRpiInterPredEnv // A set of inter-prediction queues (HEVCRpiJob has one for chroma and one for luma)
++{
++ HEVCRpiInterPredQ * q;
++ uint8_t n; // Number of Qs
++ uint8_t n_grp; // Number of Q in a group
++ uint8_t curr; // Current Q number (0..n-1)
++ uint8_t used; // 0 if nothing in any Q, 1 otherwise
++ uint8_t used_grp; // 0 if nothing in any Q in the current group
++ unsigned int max_fill;
++ unsigned int min_gap;
++ GPU_MEM_PTR_T gptr; // presumably the GPU memory backing the queues - confirm
++} HEVCRpiInterPredEnv;
++
++typedef struct HEVCRpiIntraPredEnv { // Prediction command list of a job (HEVCRpiJob.intra)
++ unsigned int n; // Number of commands
++ HEVCPredCmd * cmds; // Command array - presumably sized by RPI_MAX_PRED_CMDS; confirm at the allocation
++} HEVCRpiIntraPredEnv;
++
++typedef struct HEVCRpiCoeffEnv { // One coefficient buffer (HEVCRpiCoeffsEnv.s[] holds four of these)
++ unsigned int n; // presumably the number of int16_t entries used in buf - confirm
++#if RPI_COMPRESS_COEFFS
++ unsigned int packed; // Equal to 1 if coefficients should be being packed
++ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0
++#endif
++ int16_t * buf;
++} HEVCRpiCoeffEnv;
++
++typedef struct HEVCRpiCoeffsEnv { // All coefficient buffers of a job (HEVCRpiJob.coeffs)
++ HEVCRpiCoeffEnv s[4]; // Selected by the buf_no argument of rpi_alloc_coeff_buf() - presumably; confirm
++ GPU_MEM_PTR_T gptr;
++ void * mptr;
++} HEVCRpiCoeffsEnv;
++
++typedef struct HEVCRpiFrameProgressWait { // One waiter in a singly linked progress wait list
++ int req; // presumably the progress value being waited for - confirm
++ struct HEVCRpiFrameProgressWait * next;
++ sem_t sem;
++} HEVCRpiFrameProgressWait;
++
++typedef struct HEVCRpiFrameProgressState { // Head/tail of a waiter list plus its mutex (HEVCRpiContext.progress_states[])
++ struct HEVCRpiFrameProgressWait * first;
++ struct HEVCRpiFrameProgressWait * last;
++ pthread_mutex_t lock; // presumably guards first/last - confirm
++} HEVCRpiFrameProgressState;
++
++typedef struct RpiBlk // Rectangle: origin (x, y) plus width and height; used for HEVCRpiJob.bounds
++{
++ unsigned int x;
++ unsigned int y;
++ unsigned int w;
++ unsigned int h;
++} RpiBlk;
++
++typedef struct HEVCRpiJob { // One unit of queued work: prediction commands, coefficients and progress requirements
++ struct HEVCRpiJob * next; // Free chain
++ struct HEVCRpiJobCtl * jbc_local;
++ const HEVCRpiSPS * sps; // sps used to set up this job
++
++ int waited;
++ int ctu_ts_first;
++ int ctu_ts_last;
++ RpiBlk bounds; // Bounding box of job
++
++ struct qpu_mc_pred_y_p_s * last_y8_p;
++ struct qpu_mc_src_s * last_y8_l1;
++ rpi_cache_flush_env_t * rfe;
++
++ HEVCRpiInterPredEnv chroma_ip;
++ HEVCRpiInterPredEnv luma_ip;
++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
++ HEVCRpiIntraPredEnv intra;
++ HEVCRpiCoeffsEnv coeffs;
++ HEVCRpiFrameProgressWait progress_wait;
++ sem_t sem;
++ rpi_cache_buf_t flush_buf;
++} HEVCRpiJob;
++
++struct HEVCRpiContext;
++
++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); // Signature of HEVCRpiPassQueue.worker
++
++typedef struct HEVCRpiPassQueue // One worker pass with its own thread (HEVCRpiContext.passq[RPI_PASSES])
++{
++// int pending;
++ volatile int terminate;
++ sem_t sem_in;
++ sem_t * psem_out; // presumably the sem_in of the next stage - confirm
++ unsigned int job_n;
++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
++ HEVCRpiWorkerFn * worker;
++ pthread_t thread;
++ uint8_t pass_n; // Pass number - debug
++ uint8_t started;
++} HEVCRpiPassQueue;
++
++
++struct HEVCRpiJobGlobal;
++
++typedef struct HEVCRpiJobCtl // Job control block (HEVCRpiContext.jbc); links to the shared HEVCRpiJobGlobal via jbg
++{
++ sem_t sem_out;
++
++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
++ struct HEVCRpiJobGlobal * jbg;
++
++ HEVCRpiLocalContext * lcw_head; // presumably head/tail of the local lc wait list (ljw_next/ljw_prev links) - confirm
++ HEVCRpiLocalContext * lcw_tail;
++
++ pthread_mutex_t in_lock;
++ int offload_in;
++
++ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
++} HEVCRpiJobCtl;
++
++
++typedef struct HEVCRpiJobGlobal // Reference-counted job pool state pointed to by HEVCRpiJobCtl.jbg
++{
++ intptr_t ref_count;
++ pthread_mutex_t lock;
++ HEVCRpiJob * free1; // Singly linked list of free jobs
++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job
++ HEVCRpiLocalContext * wait_good; // Last good tail
++ HEVCRpiLocalContext * wait_tail;
++
++} HEVCRpiJobGlobal;
++
++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats { // Debug counters, only built when RPI_TSTATS is set (HEVCRpiContext.tstats)
++ int y_pred1_y8_merge;
++ int y_pred1_xy;
++ int y_pred1_x0;
++ int y_pred1_y0;
++ int y_pred1_x0y0;
++ int y_pred1_wle8;
++ int y_pred1_wgt8;
++ int y_pred1_hle16;
++ int y_pred1_hgt16;
++ int y_pred2_xy;
++ int y_pred2_x0;
++ int y_pred2_y0;
++ int y_pred2_x0y0;
++ int y_pred2_hle16;
++ int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++typedef struct HEVCRpiCabacState // Saved CABAC state (HEVCRpiContext.cabac_save points at these)
++{
++ uint8_t rice[4]; // Same shape as HEVCRpiLocalContext.stat_coeff
++ uint8_t state[HEVC_CONTEXTS]; // Same shape as HEVCRpiLocalContext.cabac_state
++} HEVCRpiCabacState;
++
++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) // = 64
++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) // = 63
++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) // = 4, i.e. 16 pels per byte
++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) // = 2
++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) // = 4 bytes cover 64 pels
++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) // = 1
++
++typedef struct HEVCRpiContext { // Top-level decoder context for the Raspberry Pi HEVC decoder
++ const AVClass *c; // needed by private avoptions
++ AVCodecContext *avctx;
++
++ uint8_t threads_type; // Tested against 0 by the ff_hevc_rpi_progress_* inline helpers
++ char qpu_init_ok;
++
++ /** 1 if the independent slice segment header was successfully parsed */
++ uint8_t slice_initialized;
++ char used_for_ref; // rpi; gates ff_hevc_rpi_progress_signal_mv/_recon
++ char is_irap;
++ char offload_recon;
++ uint8_t eos; ///< current packet contains an EOS/EOB NAL
++ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL
++ uint8_t no_backward_pred_flag;
++ uint8_t is_decoded;
++ uint8_t no_rasl_output_flag;
++
++
++ /**
++ * Sequence counters for decoded and output frames, so that old
++ * frames are output first after a POC reset
++ */
++ uint16_t seq_decode;
++ uint16_t seq_output;
++
++ int width;
++ int height;
++
++ HEVCRpiJobCtl * jbc;
++ // cabac stash
++ // b0 skip flag
++ // b1+ ct_depth
++ uint8_t * cabac_stash_left;
++ uint8_t * cabac_stash_up;
++
++ // Function pointers
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ const uint8_t * qpu_dummy_frame_emu;
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
++#endif
++ HEVCRpiQpu qpu;
++
++ HEVCRpiFrameProgressState progress_states[2]; // presumably indexed by the progress field (0 = recon, 1 = MV) - confirm
++
++ HEVCRpiCabacState *cabac_save;
++
++ AVFrame *frame;
++ AVFrame *output_frame;
++ uint8_t *sao_pixel_buffer_h[3];
++ uint8_t *sao_pixel_buffer_v[3];
++
++ unsigned int col_mvf_stride;
++ AVBufferPool *col_mvf_pool;
++
++ RpiSAOParams *sao;
++ DBParams *deblock;
++ enum HEVCNALUnitType nal_unit_type;
++ int temporal_id; ///< temporal_id_plus1 - 1
++ HEVCRpiFrame *ref;
++ int poc;
++ int pocTid0;
++ int slice_idx; ///< number of the slice being currently decoded
++ int max_ra;
++
++ int8_t *qp_y_tab;
++
++ // Deblocking block strength bitmaps
++ unsigned int bs_stride2;
++ unsigned int bs_size;
++ uint8_t *bs_horizontal;
++ uint8_t *bs_vertical;
++ uint8_t *bsf_stash_up;
++ uint8_t *bsf_stash_left;
++
++#if HEVC_RPI_MAX_CTBS >= 0xffff
++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0)
++ uint32_t *tab_slice_address;
++#else // 16-bit entries: cast after the complement so the marker is 0xffff (not int -1) when compared
++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U)
++ uint16_t *tab_slice_address;
++#endif
++
++ // Bitfield 1 bit per 8 pels (min pcm size)
++ uint8_t *is_pcm;
++ // Bitfield 1 bit per 8 pels (min cb size)
++ // Only needed for CIP as CIP processing is async to the main thread
++ uint8_t *is_intra;
++
++ // PU
++ HEVCRpiMvField *mvf_up; // Indexed by x >> LOG2_MIN_PU_SIZE (see mvf_ptr)
++ HEVCRpiMvField *mvf_left; // Indexed by y >> LOG2_MIN_PU_SIZE (see mvf_ptr)
++
++ const RefPicList **rpl_up;
++ const RefPicList **rpl_left;
++ RefPicList * refPicList;
++
++ // CTB-level flags affecting loop filter operation
++ uint8_t *filter_slice_edges;
++
++ /** used on BE to byteswap the lines for checksumming */
++ uint8_t *checksum_buf;
++ int checksum_buf_size;
++
++ const uint8_t *data;
++
++ H2645Packet pkt;
++ // type of the first VCL NAL of the current frame
++ enum HEVCNALUnitType first_nal_type;
++
++ uint8_t context_initialized;
++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
++ ///< as a format defined in 14496-15
++ int apply_defdispwin;
++
++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
++ int nuh_layer_id;
++
++ struct AVMD5 *md5_ctx;
++
++ RefPicListTab * rpl_tab;
++ unsigned int rpl_tab_size;
++
++ uint8_t *is_intra_store;
++
++ RpiSliceHeader sh;
++
++ HEVCRpiParamSets ps;
++
++ HEVCRpiLocalContext *HEVClc;
++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
++
++ HEVCRpiFrame DPB[HEVC_DPB_ELS]; // HEVCRpiFrame.dpb_no is documented as the entry number in here
++
++ /** candidate references for the current frame */
++ RefPicList rps[5];
++
++ HEVCRpiPredContext hpc;
++ HEVCDSPContext hevcdsp;
++
++ HEVCSEIContext sei;
++
++ // Put structures that allocate non-trivial storage at the end
++ // These are mostly used indirectly so position in the structure doesn't matter
++ HEVCRpiPassQueue passq[RPI_PASSES];
++#if RPI_EXTRA_BIT_THREADS > 0
++ int bt_started;
++ // This simply contains thread descriptors - task setup is held elsewhere
++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
++#endif
++#if RPI_TSTATS
++ HEVCRpiStats tstats;
++#endif
++} HEVCRpiContext;
++
++/**
++ * Mark all frames in DPB as unused for reference.
++ */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
++
++/**
++ * Drop all frames currently in DPB.
++ */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture sets for the current frame.
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture list(s) for the current slice.
++ */
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
++
++
++/**
++ * Get the number of candidate references for the current frame.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
++
++/**
++ * Find next frame in output order and put a reference to it in frame.
++ * @return 1 if a frame was output, 0 otherwise
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
++
++unsigned int ff_hevc_rpi_tb_avail_flags(
++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
++
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, HEVCRpiMvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int nPbW, const unsigned int nPbH,
++ const unsigned int avail,
++ HEVCRpiMvField * const mv,
++ const unsigned int mvp_lx_flag, const unsigned int LX);
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++ const unsigned int x0, const unsigned int y0,
++ const unsigned int log2_trafo_size, const int is_coded_block);
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
++
++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra[4];
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCRpiFrame * const ref, const int val, const int field); // field: 0 = recon, 1 = MV (as passed by the inline wrappers below)
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); // field: 0 = recon, 1 = MV
++
++// All of these expect that s->threads_type == FF_THREAD_FRAME
++
++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                 const HEVCRpiFrame * const ref, const int y)
++{ // Wait on the MV progress field (field 1) of ref for value y
++    if (s->threads_type == 0) return; // threads_type of 0: the wait is skipped entirely
++    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
++{ // Signal y on the MV progress field (field 1); skipped if not used for reference or threads_type is 0
++    if (!s->used_for_ref || s->threads_type == 0) return;
++    ff_hevc_rpi_progress_signal_field(s, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCRpiFrame * const ref, const int y)
++{ // Wait on the recon field (0). NOTE(review): unlike ff_hevc_rpi_progress_wait_mv there is no threads_type check - confirm that is intended
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
++}
++
++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
++{ // Signal y on the recon progress field (field 0)
++    // Only signalled when the frame is used for reference and threads_type is non-zero
++    const int wanted = s->used_for_ref && s->threads_type != 0;
++    if (wanted)
++        ff_hevc_rpi_progress_signal_field(s, y, 0);
++}
++
++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
++{ // Signal INT_MAX on both progress fields: field 0 first, then field 1
++    for (int field = 0; field < 2; field++)
++        ff_hevc_rpi_progress_signal_field(s, INT_MAX, field);
++}
++
++
++// Set all done - signal nothing (used in missing refs)
++// Works for both rpi & non-rpi
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
++{
++    int * progress; // the two progress counters held in tf.progress->data
++    if (ref->tf.progress == NULL)
++        return; // no progress buffer attached: nothing to set
++    progress = (int *)ref->tf.progress->data;
++    progress[0] = INT_MAX;
++    progress[1] = INT_MAX;
++}
++
++#define HEVC_RPI_420_ONLY 1
++#define HEVC_RPI_SAND128_ONLY 1
++
++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
++{ // Horizontal shift for component cidx
++#if HEVC_RPI_420_ONLY
++    return cidx != 0; // 4:2:0 only build: 0 for component 0, 1 for every other component
++#else
++    return s->ps.sps->hshift[cidx];
++#endif
++}
++
++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
++{ // Vertical shift for component cidx
++#if HEVC_RPI_420_ONLY
++    return cidx != 0; // 4:2:0 only build: 0 for component 0, 1 for every other component
++#else
++    return s->ps.sps->vshift[cidx];
++#endif
++}
++
++static inline int ctx_cfmt(const HEVCRpiContext * const s)
++{ // Chroma format: the SPS chroma_format_idc, or a constant when built 4:2:0 only
++#if HEVC_RPI_420_ONLY
++ return 1; // presumably the chroma_format_idc value for 4:2:0 - confirm against H.265
++#else
++ return s->ps.sps->chroma_format_idc;
++#endif
++}
++
++static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
++{ // "stride1" of plane c_idx: the frame's linesize, or a constant when built sand128 only
++#if HEVC_RPI_SAND128_ONLY
++ return 128; // fixed value; frame and c_idx are unused in this configuration
++#else
++ return frame->linesize[c_idx];
++#endif
++}
++
++#if HEVC_RPI_SAND128_ONLY
++// Propagate this decision to later zc includes
++#define RPI_ZC_SAND128_ONLY 1
++#endif
++
++#ifndef ff_hevc_rpi_copy_vert
++// Copy one pel from each of `height` rows: 4 bytes per pel when pixel_shift
++// is 2, 2 bytes when it is 1, and a single byte for any other value.
++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
++                                         int pixel_shift, int height,
++                                         ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{
++    int rows = height;
++
++    if (pixel_shift == 2) {
++        for (; rows > 0; rows--) {
++            *(uint32_t *)dst = *(uint32_t *)src;
++            dst += stride_dst;
++            src += stride_src;
++        }
++        return;
++    }
++    if (pixel_shift == 1) {
++        for (; rows > 0; rows--) {
++            *(uint16_t *)dst = *(uint16_t *)src;
++            dst += stride_dst;
++            src += stride_src;
++        }
++        return;
++    }
++    for (; rows > 0; rows--) {
++        *dst = *src;
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++#endif
++
++
++#if MVF_STASH_WIDTH == 64
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{ // Entry of lc->mvf_stash for pel (x, y); both coordinates are reduced to offsets within the CTB
++    const unsigned int in_ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)&lc->mvf_stash[((y & in_ctb_mask) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & in_ctb_mask) >> LOG2_MIN_PU_SIZE)];
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{ // MV field for pel (x, y) as seen from the CTB containing (x0, y0)
++    const unsigned int ctb_base_mask = (~0U << s->ps.sps->log2_ctb_size);
++    const int is_left = x < (x0 & ctb_base_mask);  // x is before the start of the current CTB
++    const int is_above = y < (y0 & ctb_base_mask); // y is before the start of the current CTB
++    if (is_above) // up-left entry, or the s->mvf_up entry for this x
++        return (HEVCRpiMvField *)(is_left ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++    if (is_left)  // the s->mvf_left entry for this y
++        return (HEVCRpiMvField *)(s->mvf_left + (y >> LOG2_MIN_PU_SIZE));
++    return (HEVCRpiMvField *)(lc->mvf_stash + // otherwise the stash, indexed by the offset within the CTB
++        ((y & ~ctb_base_mask) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
++        ((x & ~ctb_base_mask) >> LOG2_MIN_PU_SIZE));
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                               const unsigned int x0,
++                               const unsigned int x)
++{ // Element stride between vertically adjacent entries at x: 1 before the current CTB, else the stash width
++    const unsigned int x0_ctb = x0 & (~0U << s->ps.sps->log2_ctb_size);
++    if (x < x0_ctb) return 1;
++    return MVF_STASH_WIDTH_PU;
++}
++
++#else
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{ // Entry of lc->mvf_stash for pel (x, y): y is reduced to its offset within the CTB, x wraps at the stash width
++    const unsigned int y_in_ctb = y & ~(~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)&lc->mvf_stash[(y_in_ctb >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))];
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{
++    const unsigned int ctb_base_mask = (~0U << s->ps.sps->log2_ctb_size);
++    const int is_left = x < (x0 & ctb_base_mask);
++    const int is_above = y < (y0 & ctb_base_mask);
++
++    // y within (or after) the current CTB: always the stash, whatever x is
++    if (!is_above)
++        return mvf_stash_ptr(s, lc, x, y);
++    // y before the current CTB: up-left if x is before it too, otherwise up
++    if (is_left)
++        return (HEVCRpiMvField *)lc->mvf_ul;
++    return (HEVCRpiMvField *)(s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++ const unsigned int x0,
++ const unsigned int x)
++{ // Variant used when MVF_STASH_WIDTH != 64: s, x0 and x are ignored
++ return MVF_STASH_WIDTH_PU; // always the stash row width; this variant's mvf_ptr() never returns s->mvf_left
++}
++#endif
++
++#endif /* AVCODEC_RPI_HEVCDEC_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.c
+@@ -0,0 +1,450 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdsp.h"
++#include "rpi_hevc_mv.h"
++
++static const int8_t transform[32][32] = {
++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { // 7 filters of 4 taps; the taps of each row sum to 64
++ { -2, 58, 10, -2},
++ { -4, 54, 16, -2},
++ { -6, 46, 28, -4},
++ { -4, 36, 36, -4},
++ { -4, 28, 46, -6},
++ { -2, 16, 54, -4},
++ { -2, 10, 58, -2},
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { // 3 filters; each row holds the same 8 taps twice, taps sum to 64
++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
++};
++
++#define BIT_DEPTH 8
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++/* Pack one 2-bit boundary strength (0 or 1) per PU pair: each of the `pus` results is repeated `dup` times, first result in the lowest bits; pus * dup must not exceed 16. */
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++                                                   const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                                   int in_inc0, int in_inc1)
++{
++    int shift = 32;   /* bits of bs not yet used; every emitted field consumes 2 */
++    uint32_t bs = 0;
++    for (; pus > 0; pus--) {
++        int strength, out;
++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++        int nr_idx0 = neigh->ref_idx[0];
++        int nr_idx1 = neigh->ref_idx[1];
++        int neigh_refL0, neigh_refL1;
++        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);   /* range-check before the table lookups, not after */
++        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
++        neigh_refL0 = neigh_rpl0[nr_idx0];
++        neigh_refL1 = neigh_rpl1[nr_idx1];
++
++#if 1 // This more directly matches the original implementation
++        if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
++            // same L0 and L1
++            if (curr_refL0 == neigh_refL0 &&
++                curr_refL0 == curr_refL1 &&
++                neigh_refL0 == neigh_refL1) {
++                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
++                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL0 == curr_refL0 &&
++                       neigh_refL1 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL1 == curr_refL0 &&
++                       neigh_refL0 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else {
++                strength = 1;
++            }
++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++            MvXY curr_mv0, neigh_mv0;
++
++            if (curr->pred_flag & 1) {
++                curr_mv0 = curr->xy[0];
++            } else {
++                curr_mv0 = curr->xy[1];
++                curr_refL0 = curr_refL1;
++            }
++
++            if (neigh->pred_flag & 1) {
++                neigh_mv0 = neigh->xy[0];
++            } else {
++                neigh_mv0 = neigh->xy[1];
++                neigh_refL0 = neigh_refL1;
++            }
++
++            if (curr_refL0 == neigh_refL0) {
++                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else
++                strength = 1;
++        } else
++            strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++        MvXY curr_mv[2];
++        MvXY neigh_mv[2];
++        memcpy(curr_mv, curr->xy, sizeof curr_mv);
++        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
++
++        if (!(curr->pred_flag & 2)) {
++            curr_mv[1] = curr_mv[0];
++            curr_refL1 = curr_refL0;
++        }
++        if (!(neigh->pred_flag & 2)) {
++            neigh_mv[1] = neigh_mv[0];
++            neigh_refL1 = neigh_refL0;
++        }
++        if (!(curr->pred_flag & 1)) {
++            curr_mv[0] = curr_mv[1];
++            curr_refL0 = curr_refL1;
++        }
++        if (!(neigh->pred_flag & 1)) {
++            neigh_mv[0] = neigh_mv[1];
++            neigh_refL0 = neigh_refL1;
++        }
++
++        strength = 1;
++
++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++        curr += in_inc0 / sizeof (HEVCRpiMvField);   /* in_inc0 / in_inc1 are divided by the entry size to step the pointers */
++        neigh += in_inc1 / sizeof (HEVCRpiMvField);
++
++        for (out = dup; out > 0; out--)
++        {
++            bs = (bs >> 2) | (strength << 30);       /* newest field enters at the top, older ones move down */
++            shift -= 2;
++        }
++    }
++    return shift < 32 ? bs >> shift : 0;   /* nothing emitted (pus or dup <= 0): a 32-bit shift would be undefined */
++}
++
++/* Copy a width x height byte block row by row in whole 8- or 16-byte units; a row may be written past `width` up to the next multiple of the unit used. */
++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
++{
++    unsigned int i, j;
++    /* The 16-byte path is taken only when pointers, strides and width are all multiples of 16, so a row is never rounded up from 8 to 16 bytes */
++    if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src | width) & 15) {
++        for (i = 0; i < height; i++) {
++            for (j = 0; j < width; j+=8)
++                AV_COPY64U(dst+j, src+j);
++            dst += stride_dst;
++            src += stride_src;
++        }
++    } else {
++        for (i = 0; i < height; i++) {
++            for (j = 0; j < width; j+=16)
++                AV_COPY128(dst+j, src+j);
++            dst += stride_dst;
++            src += stride_src;
++        }
++    }
++}
++
++
++/* Fill *hevcdsp with the depth-suffixed functions (name_<depth>) for bit_depth 9, 10 or 12 (anything else selects 8), then call the ARCH_*-guarded per-architecture init functions. */
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++
++#undef PEL_FUNC
++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
++ for(i = 0 ; i < 10 ; i++) \
++{ \
++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
++}
++
++#undef EPEL_FUNCS
++#define EPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
++
++#undef EPEL_UNI_FUNCS
++#define EPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
++
++#undef EPEL_BI_FUNCS
++#define EPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
++
++#undef QPEL_FUNCS
++#define QPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
++
++#undef QPEL_UNI_FUNCS
++#define QPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
++
++#undef QPEL_BI_FUNCS
++#define QPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++
++#define SLICED_ADD_RESIDUAL(depth)\
++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
++ } \
++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth) \
++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
++ SLICED_ADD_RESIDUAL(depth); \
++ hevcdsp->dequant = FUNC(dequant, depth); \
++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
++ \
++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
++ \
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
++ } \
++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
++ SLICED_SAO(depth); \
++ \
++ QPEL_FUNCS(depth); \
++ QPEL_UNI_FUNCS(depth); \
++ QPEL_BI_FUNCS(depth); \
++ EPEL_FUNCS(depth); \
++ EPEL_UNI_FUNCS(depth); \
++ EPEL_BI_FUNCS(depth); \
++ \
++ SLICED_LOOP_FILTERS(depth); \
++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); /* the *_c members get the same functions as the plain members above */ \
++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++int i = 0; /* loop index referenced by name inside PEL_FUNC, SLICED_SAO and HEVC_DSP */
++
++ switch (bit_depth) {
++ case 9:
++ HEVC_DSP(9);
++ break;
++ case 10:
++ HEVC_DSP(10);
++ break;
++ case 12:
++ HEVC_DSP(12);
++ break;
++ default: /* every other bit_depth value gets the 8-bit function set */
++ HEVC_DSP(8);
++ break;
++ }
++
++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; /* these two are not depth-specific */
++ hevcdsp->cpy_blk = cpy_blk;
++
++ if (ARCH_PPC) /* arch init functions run after the C table is complete */
++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++ if (ARCH_X86)
++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++ if (ARCH_ARM)
++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++ if (ARCH_MIPS)
++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,177 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++struct HEVCRpiMvField;
++
++#define MAX_PB_SIZE 64
++
++#define RPI_HEVC_SAO_BUF_STRIDE 160
++
++/* Per-block SAO parameters; every array has one entry per colour component (index 0 = Y, 1 = U, 2 = V). */
++typedef struct RpiSAOParams {
++ uint8_t band_position[3]; ///< sao_band_position (Y,U,V)
++ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V): U and V carry the same value
++ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V): U and V carry the same value
++
++ int16_t offset_val[3][5]; ///< SaoOffsetVal (Y,U,V), five entries per component
++
++} RpiSAOParams;
++
++
++// This controls how many SAO DSP functions each sao_*_filter array holds.
++// N=5 covers widths 8, 16, 32, 48 and 64.
++// N=6 adds a function for width=24; it is stored at array index 5, so
++// existing code that only uses indices 0-4 should still work.
++#define SAO_FILTER_N 6
++
++/* Table of HEVC DSP entry points; ff_hevc_rpi_dsp_init() fills it for one bit depth. */
++typedef struct HEVCDSPContext {
++    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                    struct GetBitContext *gb, int pcm_bit_depth);
++    /* The [4] arrays below are indexed by block size: 0 = 4x4, 1 = 8x8, 2 = 16x16, 3 = 32x32 */
++    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
++    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
++    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
++    /* _c variants work on interleaved chroma (U and V in alternating pixels) */
++    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                      struct GetBitContext *gb, int pcm_bit_depth);
++
++    void (*dequant)(int16_t *coeffs, int16_t log2_size);
++
++    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
++
++    void (*transform_4x4_luma)(int16_t *coeffs);
++
++    void (*idct[4])(int16_t *coeffs, int col_limit);
++
++    void (*idct_dc[4])(int16_t *coeffs);
++
++    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height);
++    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                            const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                            const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                            int width, int height);
++
++    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                          int16_t *sao_offset_val, int sao_eo_class, int width, int height);
++    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                            const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
++
++    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                  struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                  uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    /* Prediction tables: ff_hevc_rpi_dsp_init() stores the same function in all 10 first-index slots; the other two indices select the pixels / h / v / hv variant */
++    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int wx1,
++                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int ox0, int wx1, /* NOTE(review): names are ordered wx0, ox0, wx1, ox1 here but wx0, wx1, ox0, ox1 in put_hevc_qpel_bi_w - confirm against the implementations */
++                                         int ox1, intptr_t mx, intptr_t my, int width);
++
++    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++                                     unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                     unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                                     uint8_t * _pix_l);
++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
++                                  unsigned int no_f);
++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                   uint8_t * src_l,
++                                   unsigned int no_f);
++    /* Returns 2-bit strengths packed from bit 0 upward; parameter names match the C implementation */
++    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++                                                   const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                                   int in_inc0, int in_inc1);
++
++    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
++} HEVCDSPContext;
++/* Fills *hpc with the function set for bit_depth (9, 10 or 12; any other value selects the 8-bit set). */
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
++/* Filter coefficient tables; defined outside this header (presumably the epel/qpel interpolation taps - confirm). */
++extern const int8_t ff_hevc_rpi_epel_filters[7][4];
++extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
++/* Per-architecture hooks called at the end of ff_hevc_rpi_dsp_init() under ARCH_PPC / ARCH_X86 / ARCH_ARM / ARCH_MIPS. */
++void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
++void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
++#endif /* AVCODEC_RPI_HEVCDSP_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp_template.c
+@@ -0,0 +1,2279 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "get_bits.h"
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++#include "rpi_hevcdsp.h"
++
++#include "rpi_hevc_shader_template.h"
++/* Read a width x height block of pcm_bit_depth-bit samples from gb and store them shifted up to BIT_DEPTH. */
++static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                          GetBitContext *gb, int pcm_bit_depth)
++{
++    const int up_shift = BIT_DEPTH - pcm_bit_depth;   /* left shift applied to every sample */
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);                          /* from here on stride counts pixels */
++    for (line = 0; line < height; line++) {
++        for (col = 0; col < width; col++)
++            row[col] = get_bits(gb, pcm_bit_depth) << up_shift;
++        row += stride;
++    }
++}
++/* As put_pcm, but for interleaved chroma: two width x height sample blocks are read and written to alternating pixels. */
++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                            GetBitContext *gb, int pcm_bit_depth)
++{
++    const int up_shift = BIT_DEPTH - pcm_bit_depth;
++    int plane, col, line;
++
++    stride /= sizeof(pixel);
++
++    /* plane 0 fills the even pixel slots, plane 1 the odd ones; all of plane 0 */
++    /* is read from the bitstream before the first sample of plane 1 */
++    for (plane = 0; plane < 2; plane++) {
++        pixel *row = (pixel *)_dst + plane;
++
++        for (line = 0; line < height; line++) {
++            for (col = 0; col < width; col++) {
++                row[col * 2] = get_bits(gb, pcm_bit_depth) << up_shift;
++            }
++            row += stride;
++        }
++    }
++}
++/* Add a size x size residual block to the prediction in _dst, clipping every result to the pixel range. */
++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
++                                                ptrdiff_t stride, int size)
++{
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    /* res is consumed as consecutive rows of `size` values */
++    for (line = 0; line < size; line++) {
++        for (col = 0; col < size; col++)
++            row[col] = av_clip_pixel(row[col] + res[col]);
++        res += size;
++        row += stride;
++    }
++}
++/* Add the constant dc to every pixel of a size x size block, clipping to the pixel range. */
++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
++{
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++) {
++        for (col = 0; col < size; col++) {
++            row[col] = av_clip_pixel(row[col] + dc);
++        }
++        row += stride;
++    }
++}
++
++/* Interleaved chroma: add the residual block to the even (U) pixels and the constant dc_v to the odd (V) pixels. */
++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, const int dc_v, int size)
++{
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++) {
++        for (i = 0; i < size; i++) {
++            row[2 * i]     = av_clip_pixel(row[2 * i] + res[i]);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + dc_v);
++        }
++        res += size;
++        row += stride;
++    }
++}
++/* Interleaved chroma: add the constant dc_u to the even (U) pixels and the residual block to the odd (V) pixels. */
++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, const int dc_u, int size)
++{
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++) {
++        for (i = 0; i < size; i++) {
++            row[2 * i]     = av_clip_pixel(row[2 * i] + dc_u);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + res[i]);
++        }
++        res += size;
++        row += stride;
++    }
++}
++/* Interleaved chroma: res holds size*size U residuals followed by size*size V residuals; add both, clipping to the pixel range. */
++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
++                                                  ptrdiff_t stride, unsigned int size)
++{
++    const int16_t *ru = res;                 /* U residual block */
++    const int16_t *rv = res + size * size;   /* V residual block follows directly */
++    pixel *row = (pixel *)_dst;
++    unsigned int i, line;
++
++//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
++//    rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
++//    rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
++
++    stride /= sizeof(pixel);
++    for (line = 0; line < size; line++, row += stride) {
++        for (i = 0; i < size; i++) {
++            row[2 * i + 0] = av_clip_pixel(row[2 * i + 0] + ru[i]);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + rv[i]);
++        }
++        ru += size;
++        rv += size;
++    }
++
++//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
++}
++
++/* Interleaved chroma DC add: dc packs the V offset in its upper 16 bits and the U offset in its lower 16 bits. */
++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++    const int dc_v = dc >> 16;      /* upper half, sign carried by the arithmetic shift */
++    const int dc_u = (int16_t)dc;   /* sign-extended lower half; avoids left-shifting a signed value into the sign bit */
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x] = av_clip_pixel(dst[x] + dc_u);           /* even pixels: U */
++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);   /* odd pixels: V */
++        }
++        dst += stride;
++    }
++}
++
++/* Fixed-size entry points: each forwards to the generic add_residual with its block size. */
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 32);
++}
++/* Fixed-size entry points: each forwards to the generic add_residual_dc with its block size. */
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited)
++/* U receives the residual block; the extra argument is the constant added to V, hence dc_v (as in add_residual_u). */
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- V -- (plaited)
++/* V receives the residual block; the extra argument is the constant added to U, hence dc_u (as in add_residual_v). */
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- C -- (plaited - both U & V)
++/* Fixed-size entry points for add_residual_c; the 32x32 entry asserts because it must never be reached. */
++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++/* Fixed-size entry points for add_residual_dc_c; the 32x32 entry asserts because it must never be reached. */
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++/* In-place running sum over a (1 << log2_size)-square block: along each row when mode == 0, down each column otherwise. */
++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++{
++    const int size = 1 << log2_size;
++    int16_t *row = _coeffs;
++    int col, line;
++
++    if (!mode) {
++        /* every element accumulates its left neighbour */
++        for (line = 0; line < size; line++, row += size)
++            for (col = 1; col < size; col++)
++                row[col] += row[col - 1];
++        return;
++    }
++
++    /* every element from the second row on accumulates the one directly above */
++    for (line = 1; line < size; line++) {
++        row += size;
++        for (col = 0; col < size; col++)
++            row[col] += row[col - size];
++    }
++}
++/* Rescale a (1 << log2_size)-square coefficient block by 2^-(15 - BIT_DEPTH - log2_size), rounding when shifting right. */
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
++{
++    int shift = 15 - BIT_DEPTH - log2_size;
++    int x, y;
++    int size = 1 << log2_size;
++
++    if (shift > 0) {
++        int offset = 1 << (shift - 1);   /* rounding term for the right shift */
++        for (y = 0; y < size; y++) {
++            for (x = 0; x < size; x++) {
++                *coeffs = (*coeffs + offset) >> shift;
++                coeffs++;
++            }
++        }
++    } else if (shift < 0) {              /* shift == 0 leaves the block untouched */
++        for (y = 0; y < size; y++) {
++            for (x = 0; x < size; x++) {
++                *(uint16_t *)coeffs <<= -shift;   /* shift the unsigned bit pattern: left-shifting a negative int16 is undefined */
++                coeffs++;
++            }
++        }
++    }
++}
++/* SET stores x unchanged; SCALE adds `add`, shifts right by `shift` (both taken by name from the expanding scope) and clips to int16. */
++#define SET(dst, x) (dst) = (x)
++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
++/* 4-point transform of src[0..3 * step] into dst[0..3 * step] with the constants 29, 55 and 74; `assign` is SET or SCALE. Arguments are not parenthesised, so pass plain identifiers. */
++#define TR_4x4_LUMA(dst, src, step, assign) \
++ do { \
++ int c0 = src[0 * step] + src[2 * step]; \
++ int c1 = src[2 * step] + src[3 * step]; \
++ int c2 = src[0 * step] - src[3 * step]; \
++ int c3 = 74 * src[1 * step]; \
++ \
++ assign(dst[2 * step], 74 * (src[0 * step] - \
++ src[2 * step] + \
++ src[3 * step])); \
++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
++ } while (0)
++/* In-place two-pass 4x4 transform built on TR_4x4_LUMA: first with step 4 and shift 7, then with step 1 and shift 20 - BIT_DEPTH. */
++static void FUNC(transform_4x4_luma)(int16_t *coeffs)
++{
++ int i;
++ int shift = 7; /* read by name inside SCALE */
++ int add = 1 << (shift - 1); /* rounding offset, read by name inside SCALE */
++ int16_t *src = coeffs;
++
++ for (i = 0; i < 4; i++) { /* first pass: step 4 walks down column i */
++ TR_4x4_LUMA(src, src, 4, SCALE);
++ src++;
++ }
++
++ shift = 20 - BIT_DEPTH; /* second-pass scaling depends on the bit depth */
++ add = 1 << (shift - 1);
++ for (i = 0; i < 4; i++) { /* second pass: step 1 walks along one row of four */
++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
++ coeffs += 4;
++ }
++}
++
++#undef TR_4x4_LUMA
++/* 4-point butterfly: even part from src[0] and src[2] (weight 64), odd part from src[1] and src[3] (weights 83 and 36). `end` is accepted but never used. */
++#define TR_4(dst, src, dstep, sstep, assign, end) \
++ do { \
++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
++ \
++ assign(dst[0 * dstep], e0 + o0); \
++ assign(dst[1 * dstep], e1 + o1); \
++ assign(dst[2 * dstep], e1 - o1); \
++ assign(dst[3 * dstep], e0 - o0); \
++ } while (0)
++/* 8-point stage: odd part sums transform[4 * j][i] * src[j * sstep] over odd j < end; even part is TR_4 on every second input. A `transform` table must be in scope. */
++#define TR_8(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_8[4]; \
++ int o_8[4] = { 0 }; \
++ for (i = 0; i < 4; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
++ \
++ for (i = 0; i < 4; i++) { \
++ assign(dst[i * dstep], e_8[i] + o_8[i]); \
++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
++ } \
++ } while (0)
++/* 16-point stage: odd part sums transform[2 * j][i] * src[j * sstep] over odd j < end; even part is TR_8 on every second input with its own end fixed at 8. */
++#define TR_16(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_16[8]; \
++ int o_16[8] = { 0 }; \
++ for (i = 0; i < 8; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
++ \
++ for (i = 0; i < 8; i++) { \
++ assign(dst[i * dstep], e_16[i] + o_16[i]); \
++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
++ } \
++ } while (0)
++
++/*
++ * 32-point stage. Odd-indexed inputs (j < end) are accumulated into o_32[]
++ * using row j of the `transform` table; even-indexed inputs go through TR_16
++ * (source step doubled, end halved) into e_32[]. Output i is e + o and
++ * output 31-i is e - o, stored through `assign`.
++ * The expansion declares its own locals `i` and `j`.
++ */
++#define TR_32(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_32[16]; \
++ int o_32[16] = { 0 }; \
++ for (i = 0; i < 16; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_32[i] += transform[j][i] * src[j * sstep]; \
++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
++ \
++ for (i = 0; i < 16; i++) { \
++ assign(dst[i * dstep], e_32[i] + o_32[i]); \
++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
++ } \
++ } while (0)
++
++/*
++ * Per-size local declarations spliced into IDCT(). Both limits are derived
++ * from the enclosing function's `col_limit` and clamped to H.
++ * The 4x4 variant declares only `limit2`; the 8/16/32 variants declare both
++ * `limit` and `limit2`.
++ */
++#define IDCT_VAR4(H) \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR8(H) \
++ int limit = FFMIN(col_limit, H); \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR16(H) IDCT_VAR8(H)
++#define IDCT_VAR32(H) IDCT_VAR8(H)
++
++/*
++ * Generates idct_HxH(coeffs, col_limit): an in-place two-pass transform
++ * built from TR_H.
++ *   Pass 1: element step H, shift 7, odd-input bound `limit2`; `limit2` is
++ *           reduced by 4 each time i is a non-zero multiple of 4 while it is
++ *           still below H.
++ *   Pass 2: element step 1, shift 20 - BIT_DEPTH, odd-input bound `limit`.
++ * For H == 4 no `limit` variable is declared (see IDCT_VAR4); that compiles
++ * only because TR_4 never expands its `end` argument.
++ * NOTE(review): `shift` and `add` are presumably consumed by the SCALE macro
++ * defined outside this view - confirm.
++ */
++#define IDCT(H) \
++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
++ int col_limit) \
++{ \
++ int i; \
++ int shift = 7; \
++ int add = 1 << (shift - 1); \
++ int16_t *src = coeffs; \
++ IDCT_VAR ## H(H); \
++ \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(src, src, H, H, SCALE, limit2); \
++ if (limit2 < H && i%4 == 0 && !!i) \
++ limit2 -= 4; \
++ src++; \
++ } \
++ \
++ shift = 20 - BIT_DEPTH; \
++ add = 1 << (shift - 1); \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
++ coeffs += H; \
++ } \
++}
++
++/*
++ * Generates idct_HxH_dc(coeffs): derives one value from coeffs[0] alone,
++ * ((coeffs[0] + 1) >> 1) followed by a rounded right shift of
++ * 14 - BIT_DEPTH, and writes that value to all H * H entries of coeffs.
++ */
++#define IDCT_DC(H) \
++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
++{ \
++ int i, j; \
++ int shift = 14 - BIT_DEPTH; \
++ int add = 1 << (shift - 1); \
++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
++ \
++ for (j = 0; j < H; j++) { \
++ for (i = 0; i < H; i++) { \
++ coeffs[i + j * H] = coeff; \
++ } \
++ } \
++}
++
++IDCT( 4)
++IDCT( 8)
++IDCT(16)
++IDCT(32)
++
++IDCT_DC( 4)
++IDCT_DC( 8)
++IDCT_DC(16)
++IDCT_DC(32)
++
++#undef TR_4
++#undef TR_8
++#undef TR_16
++#undef TR_32
++
++#undef SET
++#undef SCALE
++
++/*
++ * SAO band offset filter.
++ *
++ * Builds a 32-entry offset table in which the four consecutive bands starting
++ * at sao_left_class (wrapping modulo 32) take sao_offset_val[1..4], then adds
++ * the table entry selected by the top five bits of each source sample and
++ * clips the result to the pixel range.
++ *
++ * _dst/_src               destination / source sample buffers
++ * stride_dst/stride_src   row pitches in bytes
++ * width/height            size of the area to filter, in samples
++ *
++ * The band index is masked with 31 so that a sample whose value exceeds the
++ * nominal bit depth cannot index past the end of offset_table; this matches
++ * what sao_band_filter_c already does.
++ */
++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  int16_t *sao_offset_val, int sao_left_class,
++                                  int width, int height)
++{
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int offset_table[32] = { 0 };
++    int k, y, x;
++    int shift  = BIT_DEPTH - 5;
++
++    // Strides arrive in bytes; convert to units of pixel
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++
++    for (k = 0; k < 4; k++)
++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(src[x] + offset_table[(src[x] >> shift) & 31]);
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++
++// Three-way compare: evaluates to 1 if a > b, -1 if a < b, 0 if equal
++#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
++
++/*
++ * SAO edge offset filter.
++ *
++ * Each sample is compared (CMP) with its two neighbours along the direction
++ * chosen by `eo`; the two comparison results select, via edge_idx, which
++ * entry of sao_offset_val is added before clipping. The source row pitch is
++ * fixed at RPI_HEVC_SAO_BUF_STRIDE bytes; stride_dst is in bytes.
++ */
++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
++                                  int eo, int width, int height)
++{
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++    pixel *out = (pixel *)_dst;
++    pixel *in  = (pixel *)_src;
++    int nb_a, nb_b;   // sample offsets of the two neighbours
++    int row, col;
++
++    stride_dst /= sizeof(pixel);
++
++    nb_a = pos[eo][0][0] + pos[eo][0][1] * stride_src;
++    nb_b = pos[eo][1][0] + pos[eo][1][1] * stride_src;
++
++    for (row = 0; row < height; row++, in += stride_src, out += stride_dst) {
++        for (col = 0; col < width; col++) {
++            const int cmp_a = CMP(in[col], in[col + nb_a]);
++            const int cmp_b = CMP(in[col], in[col + nb_b]);
++            const int sel   = edge_idx[2 + cmp_a + cmp_b];
++
++            out[col] = av_clip_pixel(in[col] + sao_offset_val[sel]);
++        }
++    }
++}
++
++
++#if BIT_DEPTH == 10
++// We need a 32 bit variation for the _c restores so hijack bit depth 10
++#undef pixel
++#undef BIT_DEPTH
++#define pixel uint32_t
++#define BIT_DEPTH 32
++// All 16 bit variations are the same
++#define sao_edge_restore_0_10 sao_edge_restore_0_9
++#define sao_edge_restore_1_10 sao_edge_restore_1_9
++#define sao_edge_restore_0_11 sao_edge_restore_0_9
++#define sao_edge_restore_1_11 sao_edge_restore_1_9
++#define sao_edge_restore_0_12 sao_edge_restore_0_9
++#define sao_edge_restore_1_12 sao_edge_restore_1_9
++#define sao_edge_restore_0_13 sao_edge_restore_0_9
++#define sao_edge_restore_1_13 sao_edge_restore_1_9
++#define sao_edge_restore_0_14 sao_edge_restore_0_9
++#define sao_edge_restore_1_14 sao_edge_restore_1_9
++#define sao_edge_restore_0_15 sao_edge_restore_0_9
++#define sao_edge_restore_1_15 sao_edge_restore_1_9
++#define sao_edge_restore_0_16 sao_edge_restore_0_9
++#define sao_edge_restore_1_16 sao_edge_restore_1_9
++#endif
++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
++/*
++ * Copies unfiltered source samples back over the destination along the
++ * block borders flagged in borders[] (0 = left column, 1 = top row,
++ * 2 = right column, 3 = bottom row as indexed below).
++ * Left/right columns are skipped for SAO_EO_VERT, top/bottom rows for
++ * SAO_EO_HORIZ. Strides are given in bytes.
++ * vert_edge, horiz_edge and diag_edge are not used by this variant.
++ */
++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ // Column 0 is done; row copies below start at column 1
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ // x is used as the row index in this loop
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ // Last column is done; row copies below stop one sample earlier
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ // NOTE(review): height is not read again after this point
++ height--;
++ }
++ }
++}
++
++/*
++ * Copies unfiltered source samples back over the destination in two stages.
++ * Stage 1 restores the borders flagged in borders[] (0 = left column,
++ * 1 = top row, 2 = right column, 3 = bottom row as indexed below), exactly
++ * as sao_edge_restore_0 does, and shrinks init_x/init_y/width/height to
++ * exclude what was restored.
++ * Stage 2 restores the remaining edge columns/rows and corner samples that
++ * are flagged in vert_edge[], horiz_edge[] and diag_edge[], leaving out a
++ * corner sample when the matching save_* condition holds.
++ * Strides are given in bytes.
++ */
++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, init_y = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ // x is used as the row index in this loop
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ init_y = 1;
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++
++ {
++ // Each save_* flag is 1 when that corner must be left out of the
++ // edge restores below: its diag_edge entry is clear, the class is
++ // the diagonal through that corner, and neither adjacent border is set
++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
++
++ // Restore pixels that can't be modified
++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
++ dst[y*stride_dst] = src[y*stride_src];
++ }
++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
++ }
++
++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
++ dst[x] = src[x];
++ }
++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
++ }
++ // Corner samples, restored only for the diagonal class through them
++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
++ dst[0] = src[0];
++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
++ dst[width-1] = src[width-1];
++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
++
++ }
++}
++#endif
++#if BIT_DEPTH == 32
++#undef BIT_DEPTH
++#undef pixel
++#define BIT_DEPTH 10
++#define pixel uint16_t
++#endif
++
++// --- Plaited chroma versions
++
++/*
++ * SAO band offset filter for interleaved chroma.
++ *
++ * Samples alternate between two planes: even positions use the U offset
++ * table, odd positions use the V offset table. Each table has 32 entries in
++ * which the four consecutive bands starting at the respective
++ * sao_left_class_* (wrapping modulo 32) take sao_offset_val_*[1..4].
++ *
++ * stride_dst/stride_src   row pitches in bytes
++ * width                   number of sample pairs per row (doubled internally)
++ * height                  number of rows
++ *
++ * The source is only read, so it is accessed through a const pointer.
++ */
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                    const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                    const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                    int width, int height)
++{
++    pixel *dst = (pixel *)_dst;
++    const pixel *src = (const pixel *)_src;
++    int offset_table_u[32] = { 0 };
++    int offset_table_v[32] = { 0 };
++    int k, y, x;
++    int shift  = BIT_DEPTH - 5;
++
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++    width *= 2;
++
++    for (k = 0; k < 4; k++)
++    {
++        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++    }
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2)
++        {
++            // *** & 31 shouldn't be wanted but just now we generate broken input that
++            // crashes us in 10-bit world
++            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
++            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
++        }
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++
++/*
++ * SAO edge offset filter for interleaved chroma.
++ *
++ * Samples alternate between two planes: even positions are offset with
++ * sao_offset_val_u, odd positions with sao_offset_val_v. Each sample is
++ * compared (CMP) with its two same-plane neighbours along the direction
++ * chosen by `eo`; horizontal neighbour steps are doubled so they stay within
++ * the same plane. The source row pitch is fixed at RPI_HEVC_SAO_BUF_STRIDE
++ * bytes; stride_dst is in bytes.
++ *
++ * width is the number of sample pairs per row; it is doubled internally and
++ * the doubled value is asserted to be at most 64.
++ *
++ * The source is only read, so it is accessed through a const pointer.
++ */
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                    const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++                                    int eo, int width, int height) {
++
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    pixel *dst = (pixel *)_dst;
++    const pixel *src = (const pixel *)_src;
++    int a_stride, b_stride;
++    int x, y;
++    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++
++    stride_dst /= sizeof(pixel);
++    width *= 2;
++
++    av_assert0(width <= 64);
++
++    // Horizontal component is scaled by 2 because U and V samples interleave
++    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2) {
++            int diff0u = CMP(src[x], src[x + a_stride]);
++            int diff1u = CMP(src[x], src[x + b_stride]);
++            int offset_valu = edge_idx[2 + diff0u + diff1u];
++            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++            int offset_valv = edge_idx[2 + diff0v + diff1v];
++            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
++            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
++        }
++        src += stride_src;
++        dst += stride_dst;
++    }
++}
++
++// Do once
++#if BIT_DEPTH == 8
++// Any old 2 byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
++// We need 32 bit for 9 bit+
++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
++#endif
++
++#undef CMP
++
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++/*
++ * Copies a width x height block of source samples into the 16-bit
++ * intermediate buffer, scaling each sample up by 14 - BIT_DEPTH bits.
++ * _srcstride is in bytes; dst rows are MAX_PB_SIZE elements apart.
++ * mx and my are not used.
++ */
++static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
++    const pixel *line     = (const pixel *)_src;
++    int col, rows_left;
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (col = 0; col < width; col++)
++            dst[col] = line[col] << (14 - BIT_DEPTH);
++        line += pitch;
++        dst  += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Straight copy of a width x height block of samples from src to dst, one
++ * memcpy per row. Both strides are in bytes. mx and my are not used.
++ */
++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int rows_left;
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        memcpy(out, in, width * sizeof(pixel));
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++/*
++ * Bi-prediction from unfiltered samples: each source sample is scaled up by
++ * 14 - BIT_DEPTH bits, added to the matching src2 intermediate value plus a
++ * rounding term, shifted right by 15 - BIT_DEPTH and clipped.
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ * mx and my are not used.
++ */
++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int col, rows_left;
++
++    int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel(((in[col] << (14 - BIT_DEPTH)) + src2[col] + rnd) >> shift);
++        in   += in_pitch;
++        out  += out_pitch;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Weighted uni-prediction from unfiltered samples: each source sample is
++ * scaled up by 14 - BIT_DEPTH bits, multiplied by wx, rounded and shifted
++ * right by denom + 14 - BIT_DEPTH, then ox (scaled by 1 << (BIT_DEPTH - 8))
++ * is added and the result clipped. Strides are in bytes.
++ * mx and my are not used.
++ */
++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int col, rows_left;
++    int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    ox *= 1 << (BIT_DEPTH - 8);
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (col = 0; col < width; col++)
++            out[col] = av_clip_pixel((((in[col] << (14 - BIT_DEPTH)) * wx + rnd) >> shift) + ox);
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++/*
++ * Weighted bi-prediction from unfiltered samples: combines the scaled source
++ * sample weighted by wx1 with the src2 intermediate value weighted by wx0,
++ * adds (ox0 + ox1 + 1) * 2^log2Wd and shifts right by log2Wd + 1 before
++ * clipping. Both offsets are first scaled by 1 << (BIT_DEPTH - 8).
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ * mx and my are not used.
++ */
++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                           int16_t *src2,
++                                           int height, int denom, int wx0, int wx1,
++                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int col, rows_left;
++
++    int shift  = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    ox0 *= 1 << (BIT_DEPTH - 8);
++    ox1 *= 1 << (BIT_DEPTH - 8);
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (col = 0; col < width; col++) {
++            out[col] = av_clip_pixel(((in[col] << (14 - BIT_DEPTH)) * wx1 + src2[col] * wx0 +
++                                      (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
++        }
++        in   += in_pitch;
++        out  += out_pitch;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++/*
++ * 8-tap filter sum centred on src[x]: taps cover src[x - 3*stride] through
++ * src[x + 4*stride]. Relies on two names from the expansion site that are
++ * not macro parameters: `x` (current index) and `filter` (8 coefficients).
++ */
++#define QPEL_FILTER(src, stride) \
++ (filter[0] * src[x - 3 * stride] + \
++ filter[1] * src[x - 2 * stride] + \
++ filter[2] * src[x - stride] + \
++ filter[3] * src[x ] + \
++ filter[4] * src[x + stride] + \
++ filter[5] * src[x + 2 * stride] + \
++ filter[6] * src[x + 3 * stride] + \
++ filter[7] * src[x + 4 * stride])
++
++/*
++ * Horizontal 8-tap interpolation into the 16-bit intermediate buffer.
++ * The filter is ff_hevc_rpi_qpel_filters[mx - 1]; each sum is shifted right
++ * by BIT_DEPTH - 8. _srcstride is in bytes; dst rows are MAX_PB_SIZE
++ * elements apart. my is not used.
++ */
++static void FUNC(put_hevc_qpel_h)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter  = ff_hevc_rpi_qpel_filters[mx - 1]; // read by QPEL_FILTER
++    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
++    pixel *line           = (pixel *)_src;
++    int x, rows_left;                                         // QPEL_FILTER indexes with x
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            dst[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
++        line += pitch;
++        dst  += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Vertical 8-tap interpolation into the 16-bit intermediate buffer.
++ * The filter is ff_hevc_rpi_qpel_filters[my - 1]; taps step by one source
++ * row and each sum is shifted right by BIT_DEPTH - 8. _srcstride is in
++ * bytes; dst rows are MAX_PB_SIZE elements apart. mx is not used.
++ */
++static void FUNC(put_hevc_qpel_v)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter  = ff_hevc_rpi_qpel_filters[my - 1]; // read by QPEL_FILTER
++    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
++    pixel *line           = (pixel *)_src;
++    int x, rows_left;                                         // QPEL_FILTER indexes with x
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            dst[x] = QPEL_FILTER(line, pitch) >> (BIT_DEPTH - 8);
++        line += pitch;
++        dst  += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Two-pass (horizontal then vertical) 8-tap interpolation into the 16-bit
++ * intermediate buffer.
++ * Pass 1 filters height + QPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_qpel_filters[mx - 1], starting QPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_qpel_filters[my - 1]
++ * and shifts right by 6. _srcstride is in bytes.
++ */
++static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
++ uint8_t *_src,
++ ptrdiff_t _srcstride,
++ int height, intptr_t mx,
++ intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++
++ // Back up so the vertical taps have rows above the block to read
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Row of tmp_array that corresponds to the first output row
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++/*
++ * Horizontal 8-tap interpolation written straight to pixels: the filter sum
++ * (filter ff_hevc_rpi_qpel_filters[mx - 1]) is shifted right by
++ * BIT_DEPTH - 8, rounded, shifted right by 14 - BIT_DEPTH and clipped.
++ * Strides are in bytes. my is not used.
++ */
++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[mx - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++    int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + rnd) >> shift);
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++/*
++ * Horizontal 8-tap interpolation combined with a second prediction: the
++ * filter sum (filter ff_hevc_rpi_qpel_filters[mx - 1]) shifted right by
++ * BIT_DEPTH - 8 is added to src2[x] plus a rounding term, shifted right by
++ * 15 - BIT_DEPTH and clipped. Byte strides for dst/src; src2 rows are
++ * MAX_PB_SIZE elements apart. my is not used.
++ */
++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[mx - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++
++    int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + src2[x] + rnd) >> shift);
++        in   += in_pitch;
++        out  += out_pitch;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Vertical 8-tap interpolation written straight to pixels: the filter sum
++ * (filter ff_hevc_rpi_qpel_filters[my - 1], taps one source row apart) is
++ * shifted right by BIT_DEPTH - 8, rounded, shifted right by 14 - BIT_DEPTH
++ * and clipped. Strides are in bytes. mx is not used.
++ */
++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[my - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++    int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, in_pitch) >> (BIT_DEPTH - 8)) + rnd) >> shift);
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++
++/*
++ * Vertical 8-tap interpolation combined with a second prediction: the
++ * filter sum (filter ff_hevc_rpi_qpel_filters[my - 1], taps one source row
++ * apart) shifted right by BIT_DEPTH - 8 is added to src2[x] plus a rounding
++ * term, shifted right by 15 - BIT_DEPTH and clipped. Byte strides for
++ * dst/src; src2 rows are MAX_PB_SIZE elements apart. mx is not used.
++ */
++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[my - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++
++    int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((QPEL_FILTER(in, in_pitch) >> (BIT_DEPTH - 8)) + src2[x] + rnd) >> shift);
++        in   += in_pitch;
++        out  += out_pitch;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Two-pass (horizontal then vertical) 8-tap interpolation written straight
++ * to pixels.
++ * Pass 1 filters height + QPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_qpel_filters[mx - 1], starting QPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_qpel_filters[my - 1],
++ * shifts right by 6, rounds, shifts right by 14 - BIT_DEPTH and clips.
++ * Strides are in bytes.
++ */
++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Back up so the vertical taps have rows above the block to read
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Row of tmp_array that corresponds to the first output row
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++/*
++ * Two-pass (horizontal then vertical) 8-tap interpolation combined with a
++ * second prediction.
++ * Pass 1 filters height + QPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_qpel_filters[mx - 1], starting QPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_qpel_filters[my - 1],
++ * shifts right by 6, adds src2[x] and a rounding term, shifts right by
++ * 15 - BIT_DEPTH and clips.
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ */
++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Back up so the vertical taps have rows above the block to read
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Row of tmp_array that corresponds to the first output row
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
++
++/*
++ * Weighted horizontal 8-tap interpolation: the filter sum (filter
++ * ff_hevc_rpi_qpel_filters[mx - 1]) shifted right by BIT_DEPTH - 8 is
++ * multiplied by wx, rounded and shifted right by denom + 14 - BIT_DEPTH,
++ * then ox (scaled by 1 << (BIT_DEPTH - 8)) is added and the result clipped.
++ * Strides are in bytes. my is not used.
++ */
++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[mx - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++    int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    ox *= 1 << (BIT_DEPTH - 8);
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) * wx + rnd) >> shift) + ox);
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++/*
++ * Weighted bi-prediction with horizontal 8-tap interpolation.
++ *
++ * The filter sum (filter ff_hevc_rpi_qpel_filters[mx - 1]) shifted right by
++ * BIT_DEPTH - 8 is weighted by wx1, the src2 intermediate value by wx0;
++ * (ox0 + ox1 + 1) * 2^log2Wd is added and the total shifted right by
++ * log2Wd + 1 before clipping. Both offsets are first scaled by
++ * 1 << (BIT_DEPTH - 8).
++ *
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ * my is not used.
++ *
++ * The offset term is scaled with a multiply rather than a left shift:
++ * ox0 + ox1 + 1 can be negative and left-shifting a negative int is
++ * undefined behaviour. put_hevc_pel_bi_w_pixels uses the same form.
++ */
++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
++
++    int shift = 14  + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Weighted vertical 8-tap interpolation: the filter sum (filter
++ * ff_hevc_rpi_qpel_filters[my - 1], taps one source row apart) shifted right
++ * by BIT_DEPTH - 8 is multiplied by wx, rounded and shifted right by
++ * denom + 14 - BIT_DEPTH, then ox (scaled by 1 << (BIT_DEPTH - 8)) is added
++ * and the result clipped. Strides are in bytes. mx is not used.
++ */
++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter      = ff_hevc_rpi_qpel_filters[my - 1]; // read by QPEL_FILTER
++    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, rows_left;                                             // QPEL_FILTER indexes with x
++    int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int rnd = 1 << (shift - 1);
++#else
++    int rnd = 0;
++#endif
++
++    ox *= 1 << (BIT_DEPTH - 8);
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, in_pitch) >> (BIT_DEPTH - 8)) * wx + rnd) >> shift) + ox);
++        in  += in_pitch;
++        out += out_pitch;
++    }
++}
++
++/*
++ * Weighted bi-prediction with vertical 8-tap interpolation.
++ *
++ * The filter sum (filter ff_hevc_rpi_qpel_filters[my - 1], taps one source
++ * row apart) shifted right by BIT_DEPTH - 8 is weighted by wx1, the src2
++ * intermediate value by wx0; (ox0 + ox1 + 1) * 2^log2Wd is added and the
++ * total shifted right by log2Wd + 1 before clipping. Both offsets are first
++ * scaled by 1 << (BIT_DEPTH - 8).
++ *
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ * mx is not used.
++ *
++ * The offset term is scaled with a multiply rather than a left shift:
++ * ox0 + ox1 + 1 can be negative and left-shifting a negative int is
++ * undefined behaviour. put_hevc_pel_bi_w_pixels uses the same form.
++ */
++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
++
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Weighted two-pass (horizontal then vertical) 8-tap interpolation.
++ * Pass 1 filters height + QPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_qpel_filters[mx - 1], starting QPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_qpel_filters[my - 1],
++ * shifts right by 6, multiplies by wx, rounds and shifts right by
++ * denom + 14 - BIT_DEPTH, then adds ox (scaled by 1 << (BIT_DEPTH - 8)) and
++ * clips. Strides are in bytes.
++ */
++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
++
++ // Back up so the vertical taps have rows above the block to read
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Row of tmp_array that corresponds to the first output row
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++/*
++ * Weighted bi-prediction with two-pass (horizontal then vertical) 8-tap
++ * interpolation.
++ *
++ * Pass 1 filters height + QPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_qpel_filters[mx - 1], starting QPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_qpel_filters[my - 1]
++ * and shifts right by 6; that value is weighted by wx1, the src2
++ * intermediate value by wx0, (ox0 + ox1 + 1) * 2^log2Wd is added and the
++ * total shifted right by log2Wd + 1 before clipping. Both offsets are first
++ * scaled by 1 << (BIT_DEPTH - 8).
++ *
++ * Byte strides for dst/src; src2 rows are MAX_PB_SIZE elements apart.
++ *
++ * The offset term is scaled with a multiply rather than a left shift:
++ * ox0 + ox1 + 1 can be negative and left-shifting a negative int is
++ * undefined behaviour. put_hevc_pel_bi_w_pixels uses the same form.
++ */
++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int16_t *src2,
++                                        int height, int denom, int wx0, int wx1,
++                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    const int8_t *filter;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *tmp = tmp_array;
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // Back up so the vertical taps have rows above the block to read
++    src   -= QPEL_EXTRA_BEFORE * srcstride;
++    filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height + QPEL_EXTRA; y++) {
++        for (x = 0; x < width; x++)
++            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        tmp += MAX_PB_SIZE;
++    }
++
++    // Row of tmp_array that corresponds to the first output row
++    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        tmp  += MAX_PB_SIZE;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++/*
++ * 4-tap filter sum around src[x]: taps cover src[x - stride] through
++ * src[x + 2*stride]. Relies on two names from the expansion site that are
++ * not macro parameters: `x` (current index) and `filter` (4 coefficients).
++ */
++#define EPEL_FILTER(src, stride) \
++ (filter[0] * src[x - stride] + \
++ filter[1] * src[x] + \
++ filter[2] * src[x + stride] + \
++ filter[3] * src[x + 2 * stride])
++
++/*
++ * Horizontal 4-tap interpolation into the 16-bit intermediate buffer.
++ * The filter is ff_hevc_rpi_epel_filters[mx - 1]; each sum is shifted right
++ * by BIT_DEPTH - 8. _srcstride is in bytes; dst rows are MAX_PB_SIZE
++ * elements apart. my is not used.
++ */
++static void FUNC(put_hevc_epel_h)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter  = ff_hevc_rpi_epel_filters[mx - 1]; // read by EPEL_FILTER
++    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
++    pixel *line           = (pixel *)_src;
++    int x, rows_left;                                         // EPEL_FILTER indexes with x
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            dst[x] = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
++        line += pitch;
++        dst  += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Vertical 4-tap interpolation into the 16-bit intermediate buffer.
++ * The filter is ff_hevc_rpi_epel_filters[my - 1]; taps step by one source
++ * row and each sum is shifted right by BIT_DEPTH - 8. _srcstride is in
++ * bytes; dst rows are MAX_PB_SIZE elements apart. mx is not used.
++ */
++static void FUNC(put_hevc_epel_v)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const int8_t *filter  = ff_hevc_rpi_epel_filters[my - 1]; // read by EPEL_FILTER
++    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
++    pixel *line           = (pixel *)_src;
++    int x, rows_left;                                         // EPEL_FILTER indexes with x
++
++    for (rows_left = height; rows_left > 0; rows_left--) {
++        for (x = 0; x < width; x++)
++            dst[x] = EPEL_FILTER(line, pitch) >> (BIT_DEPTH - 8);
++        line += pitch;
++        dst  += MAX_PB_SIZE;
++    }
++}
++
++/*
++ * Two-pass (horizontal then vertical) 4-tap interpolation into the 16-bit
++ * intermediate buffer.
++ * Pass 1 filters height + EPEL_EXTRA source rows horizontally with
++ * ff_hevc_rpi_epel_filters[mx - 1], starting EPEL_EXTRA_BEFORE rows above
++ * the block, into tmp_array (row pitch MAX_PB_SIZE).
++ * Pass 2 filters tmp_array vertically with ff_hevc_rpi_epel_filters[my - 1]
++ * and shifts right by 6. _srcstride is in bytes.
++ */
++static void FUNC(put_hevc_epel_hv)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++
++ // Back up so the vertical taps have rows above the block to read
++ src -= EPEL_EXTRA_BEFORE * srcstride;
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ // Row of tmp_array that corresponds to the first output row
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Uni-prediction, horizontal 4-tap EPEL: filters src, rounds, clips and stores pixels straight into dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // row picked by mx; my is not read
++ int shift = 14 - BIT_DEPTH; // final down-shift back to pixel range
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: makes the final shift round to nearest
++#else
++ int offset = 0; // shift is 0 here, so shift - 1 must not be used
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction, horizontal 4-tap EPEL: adds the filtered src to src2, rounds, clips and stores pixels in dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // row picked by mx; my is not read
++ int shift = 14 + 1 - BIT_DEPTH; // one more bit than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ }
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Uni-prediction, vertical 4-tap EPEL: filters src, rounds, clips and stores pixels straight into dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // row picked by my; mx is not read
++ int shift = 14 - BIT_DEPTH; // final down-shift back to pixel range
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction, vertical 4-tap EPEL: adds the filtered src to src2, rounds, clips and stores pixels in dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // row picked by my; mx is not read
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH; // one more bit than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Uni-prediction, separable 4-tap EPEL: horizontal pass into tmp_array, vertical pass rounded and clipped into dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal coefficients first
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE elements each
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH; // final down-shift back to pixel range
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical taps have context
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) { // extra rows feed the vertical taps of the second pass
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // first row that corresponds to an output row
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to the vertical coefficients
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{ // Bi-prediction, separable 4-tap EPEL: two-pass filter of src, added to src2, rounded and clipped into dst
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal coefficients first
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE elements each
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH; // one more bit than the uni case because two terms are summed
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical taps have context
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) { // extra rows feed the vertical taps of the second pass
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // first row that corresponds to an output row
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to the vertical coefficients
++
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted uni-prediction, horizontal 4-tap EPEL: dst = clip(((filtered * wx + offset) >> shift) + ox)
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // row picked by mx; my is not read
++ int shift = denom + 14 - BIT_DEPTH; // weight denominator plus the normal uni down-shift
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the additive offset up to the working bit depth
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction, horizontal 4-tap EPEL: dst = clip((filtered * wx1 + src2 * wx0 + offsets) >> (log2Wd + 1))
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // row picked by mx; my is not read
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both additive offsets up to the working bit depth
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiply, not <<: the offset sum is signed and may be negative
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted uni-prediction, vertical 4-tap EPEL: dst = clip(((filtered * wx + offset) >> shift) + ox)
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // row picked by my; mx is not read
++ int shift = denom + 14 - BIT_DEPTH; // weight denominator plus the normal uni down-shift
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the additive offset up to the working bit depth
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction, vertical 4-tap EPEL: dst = clip((filtered * wx1 + src2 * wx0 + offsets) >> (log2Wd + 1))
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; // row picked by my; mx is not read
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both additive offsets up to the working bit depth
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiply, not <<: the offset sum is signed and may be negative
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{ // Weighted uni-prediction, separable 4-tap EPEL: two-pass filter, then clip(((v * wx + offset) >> shift) + ox)
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal coefficients first
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE elements each
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH; // weight denominator plus the normal uni down-shift
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1); // half of 1 << shift: round to nearest
++#else
++ int offset = 0;
++#endif
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical taps have context
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) { // extra rows feed the vertical taps of the second pass
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // first row that corresponds to an output row
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to the vertical coefficients
++
++ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the additive offset up to the working bit depth
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
++
++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{ // Weighted bi-prediction, separable 4-tap EPEL: two-pass filter, then clip((v * wx1 + src2 * wx0 + offsets) >> (log2Wd + 1))
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // strides converted to pixel units
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal coefficients first
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, MAX_PB_SIZE elements each
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical taps have context
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) { // extra rows feed the vertical taps of the second pass
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
++
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // first row that corresponds to an output row
++ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to the vertical coefficients
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale both additive offsets up to the working bit depth
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // offset sum is scaled by multiplication, not by a shift
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE int16_t elements apart
++ }
++}
++
++// line zero
++#define P3 pix[-4 * xstride]
++#define P2 pix[-3 * xstride]
++#define P1 pix[-2 * xstride]
++#define P0 pix[-1 * xstride]
++#define Q0 pix[0 * xstride]
++#define Q1 pix[1 * xstride]
++#define Q2 pix[2 * xstride]
++#define Q3 pix[3 * xstride]
++
++// line three. used only for deblocking decision
++#define TP3 pix[-4 * xstride + 3 * ystride]
++#define TP2 pix[-3 * xstride + 3 * ystride]
++#define TP1 pix[-2 * xstride + 3 * ystride]
++#define TP0 pix[-1 * xstride + 3 * ystride]
++#define TQ0 pix[0 * xstride + 3 * ystride]
++#define TQ1 pix[1 * xstride + 3 * ystride]
++#define TQ2 pix[2 * xstride + 3 * ystride]
++#define TQ3 pix[3 * xstride + 3 * ystride]
++
++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
++ ptrdiff_t _xstride, ptrdiff_t _ystride,
++ int beta, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{ // Luma deblocking of one edge: two segments (j) of four lines (d); P samples sit at negative xstride offsets from pix, Q at 0 and above
++ int d, j;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // step across the edge, in pixel units
++ ptrdiff_t ystride = _ystride / sizeof(pixel); // step from one filtered line to the next, in pixel units
++
++ beta <<= BIT_DEPTH - 8; // scale the threshold to the working bit depth
++
++ for (j = 0; j < 2; j++) { // each segment has its own tc / no_p / no_q entry
++ const int dp0 = abs(P2 - 2 * P1 + P0); // second differences on line 0 of the segment
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); // and on line 3 of the segment
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ const int no_p = _no_p[j]; // non-zero: leave the P side untouched
++ const int no_q = _no_q[j]; // non-zero: leave the Q side untouched
++
++ if (d0 + d3 >= beta) { // too much activity: the whole segment is left unfiltered
++ pix += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1); // 2.5 * tc, rounded
++
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1; // every sample may change by at most 2 * tc
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3; // snapshot all inputs before any sample is overwritten
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1; // number of P samples that may be modified (1 or 2)
++ int nd_q = 1; // number of Q samples that may be modified (1 or 2)
++ const int tc_2 = tc >> 1; // clip range for the P1 / Q1 corrections
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) { // larger steps are left alone on this line
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix += ystride;
++ }
++ }
++ }
++ }
++}
++
++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{ // Chroma deblocking of one edge: two segments (j) of four lines (d); only P0 and Q0 are modified
++ int d, j, no_p, no_q;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // step across the edge, in pixel units
++ ptrdiff_t ystride = _ystride / sizeof(pixel); // step from one filtered line to the next, in pixel units
++
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) { // no filtering for this segment: skip its four lines
++ pix += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j]; // non-zero: leave the P side untouched
++ no_q = _no_q[j]; // non-zero: leave the Q side untouched
++
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); // correction limited to +/- tc
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix += ystride;
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // xstride = stride, ystride = one pixel: P/Q samples are a row apart, successive lines step along the row
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
++}
++
++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // xstride = one pixel, ystride = stride: P/Q samples are neighbours in a row, successive lines step down rows
++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
++}
++
++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // xstride = stride, ystride = one pixel: P/Q samples are a row apart, successive lines step along the row
++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
++ beta, tc, no_p, no_q);
++}
++
++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{ // xstride = one pixel, ystride = stride: P/Q samples are neighbours in a row, successive lines step down rows
++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
++ beta, tc, no_p, no_q);
++}
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++// line zero
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
++
++// line three. used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
++
++// Same filtering as hevc_loop_filter_luma, except that the P/Q components are on
++// separate pointers and tc / no_p / no_q are passed packed (tc2, no_f)
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++ uint8_t * _pix_l)
++{ // Luma deblocking with P read from pix_l[0..3] (P3..P0) and Q from pix_r[0..3] (Q0..Q3); two segments of four lines
++ int d, j;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ const ptrdiff_t xstride = 1; // samples of one line are adjacent
++ const ptrdiff_t ystride = _stride / sizeof(pixel); // step from one filtered line to the next, in pixel units
++
++ beta <<= BIT_DEPTH - 8; // scale the threshold to the working bit depth
++
++ for (j = 0; j < 2; j++) {
++ const int dp0 = abs(P2 - 2 * P1 + P0); // second differences on line 0 of the segment
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); // and on line 3 of the segment
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); // parameter tc2 packs one 16-bit tc per segment
++ const int no_p = no_f & 1; // bit 0 of no_f: leave the P side untouched
++ const int no_q = no_f & 2; // bit 1 of no_f: leave the Q side untouched
++
++ if (d0 + d3 >= beta) { // too much activity: the whole segment is left unfiltered
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1); // 2.5 * tc, rounded
++
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1; // local clip limit 2 * tc; shadows the packed tc2 parameter inside this branch
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3; // snapshot all inputs before any sample is overwritten
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1; // number of P samples that may be modified (1 or 2)
++ int nd_q = 1; // number of Q samples that may be modified (1 or 2)
++ const int tc_2 = tc >> 1; // clip range for the P1 / Q1 corrections
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) { // larger steps are left alone on this line
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
++{ // Packed-argument front end for hevc_h_loop_filter_luma
++ // Just call the non-2 function having massaged the parameters
++ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; // low / high 16 bits: tc for segment 0 / 1
++ uint8_t no_p[2] = {no_f & 1, no_f & 1}; // bit 0 applies to both segments
++ uint8_t no_q[2] = {no_f & 2, no_f & 2}; // bit 1 applies to both segments (stored as 0 or 2; callee only tests non-zero)
++ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
++}
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#define P1 pix_l[0 * xstride]
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, const int32_t *_tc,
++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{ // Chroma deblocking with P read from pix_l (P1 at 0, P0 at xstride) and Q from pix_r (Q0 at 0, Q1 at xstride)
++ int d, j, no_p, no_q;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ ptrdiff_t xstride = _xstride / sizeof(pixel); // distance between successive samples of one line, in pixel units
++ ptrdiff_t ystride = _ystride / sizeof(pixel); // step from one filtered line to the next, in pixel units
++
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) { // no filtering for this segment: skip its four lines on both sides
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j]; // non-zero: leave the P side untouched
++ no_q = _no_q[j]; // non-zero: leave the Q side untouched
++
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); // correction limited to +/- tc
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++}
++
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++ unsigned int no_f)
++{ // Runs the chroma filter twice with a two-pixel line step - presumably interleaved U/V samples, confirm against callers
++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1: P-side skip flag for segment 0 / 1
++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3: Q-side skip flag for segment 0 / 1
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, lowest byte first
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first component: tc[0], tc[1]
++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // second component, one pixel on: tc[2], tc[3]
++}
++
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f)
++{ // Runs hevc_loop_filter_uv2 twice with a two-pixel sample step - presumably interleaved U/V samples, confirm against callers
++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1: P-side skip flag for segment 0 / 1
++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3: Q-side skip flag for segment 0 / 1
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, lowest byte first
++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first component: tc[0], tc[1]
++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // second component, one pixel on: tc[2], tc[3]
++}
++
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.c
+@@ -0,0 +1,161 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdec.h"
++
++#include "rpi_hevcpred.h"
++#if (ARCH_ARM)
++#include "arm/rpi_hevcpred_arm.h"
++#endif
++
++#define PRED_C 0
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++
++#define PRED_C 1
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
++{ // Fills every hpc function pointer with the template functions for bit_depth, then runs the arch-specific init
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth // luma function names: name_depth
++
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c // chroma function names: name_depth_c
++
++#define HEVC_PRED_Y(depth) \
++ hpc->intra_pred = FUNC(intra_pred, depth); \
++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \
++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \
++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \
++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth);
++
++#define HEVC_PRED_C(depth) \
++ hpc->intra_pred_c = FUNCC(intra_pred, depth); \
++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \
++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \
++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \
++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth);
++
++#define HEVC_PRED(depth) \
++ HEVC_PRED_Y(depth); \
++ HEVC_PRED_C(depth);
++
++ switch (bit_depth) { // pred_vertical / pred_horizontal start out as the generic pred_angular functions in every case
++ case 9:
++ HEVC_PRED(9);
++ break;
++ case 10:
++ HEVC_PRED(10);
++ break;
++ case 12:
++ HEVC_PRED(12);
++ break;
++ default: // any other value, 8 included, selects the 8-bit functions
++ HEVC_PRED(8);
++ break;
++ }
++
++#if (ARCH_ARM)
++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); // runs after the C defaults are in place
++#elif (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); // NOTE(review): no declaration for this is visible in this file - confirm it exists
++#endif
++}
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.h
+@@ -0,0 +1,123 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCPRED_H
++#define AVCODEC_RPI_HEVCPRED_H
++
++#include <stddef.h>
++#include <stdint.h>
++#include "config.h"
++
++struct HEVCRpiContext;
++struct HEVCRpiLocalContext;
++
++enum IntraPredMode { // Intra prediction mode numbers 0..34
++ INTRA_PLANAR = 0,
++ INTRA_DC, // 1
++ INTRA_ANGULAR_2, // 2: from here on INTRA_ANGULAR_n has the value n
++ INTRA_ANGULAR_3,
++ INTRA_ANGULAR_4,
++ INTRA_ANGULAR_5,
++ INTRA_ANGULAR_6,
++ INTRA_ANGULAR_7,
++ INTRA_ANGULAR_8,
++ INTRA_ANGULAR_9,
++ INTRA_ANGULAR_10, // aliased below as INTRA_ANGULAR_HORIZONTAL
++ INTRA_ANGULAR_11,
++ INTRA_ANGULAR_12,
++ INTRA_ANGULAR_13,
++ INTRA_ANGULAR_14,
++ INTRA_ANGULAR_15,
++ INTRA_ANGULAR_16,
++ INTRA_ANGULAR_17,
++ INTRA_ANGULAR_18,
++ INTRA_ANGULAR_19,
++ INTRA_ANGULAR_20,
++ INTRA_ANGULAR_21,
++ INTRA_ANGULAR_22,
++ INTRA_ANGULAR_23,
++ INTRA_ANGULAR_24,
++ INTRA_ANGULAR_25,
++ INTRA_ANGULAR_26, // aliased below as INTRA_ANGULAR_VERTICAL
++ INTRA_ANGULAR_27,
++ INTRA_ANGULAR_28,
++ INTRA_ANGULAR_29,
++ INTRA_ANGULAR_30,
++ INTRA_ANGULAR_31,
++ INTRA_ANGULAR_32,
++ INTRA_ANGULAR_33,
++ INTRA_ANGULAR_34,
++};
++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26
++
++typedef void intra_filter_fn_t( // function type held in HEVCRpiPredContext.intra_filter[] and intra_filter_c[]
++ uint8_t * const left, uint8_t * const top, // writable buffers - presumably the prepared left / top reference samples, confirm
++ const unsigned int req, const unsigned int avail, // NOTE(review): look like requested / available neighbour masks - confirm
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, // read-only sources
++ const unsigned int stride,
++ const unsigned int top_right_size, const unsigned int down_left_size);
++
++typedef struct HEVCRpiPredContext {
++ void (*intra_pred)(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++ const unsigned int avail, const unsigned int log2_size);
++
++ intra_filter_fn_t *intra_filter[4];
++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
++
++ void (*intra_pred_c)(const struct HEVCRpiContext * const s,
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++ const unsigned int avail, const unsigned int log2_size);
++ intra_filter_fn_t *intra_filter_c[4];
++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride);
++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int mode);
++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
++} HEVCRpiPredContext;
++
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
++
++#endif /* AVCODEC_RPI_HEVCPRED_H */
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred_template.c
+@@ -0,0 +1,1407 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "bit_depth_template.c"
++
++#include "rpi_hevcdec.h"
++#include "rpi_hevcpred.h"
++
++#define DUMP_PRED 0
++
++#define POS(x, y) src[(x) + stride * (y)]
++
++// INCLUDED_ONCE defined at EOF
++#ifndef INCLUDED_ONCE
++typedef uint8_t (* c8_dst_ptr_t)[2];
++typedef const uint8_t (* c8_src_ptr_t)[2];
++typedef uint16_t (* c16_dst_ptr_t)[2];
++typedef const uint16_t (* c16_src_ptr_t)[2];
++
++// *** On ARM make these NEON registers
++typedef struct pixel4_16 { // Four 16-bit values handled as one unit
++ uint16_t x[4];
++} pixel4_16;
++typedef struct pixel4_32 { // Four 32-bit values handled as one unit
++ uint32_t x[4];
++} pixel4_32;
++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) // Returns a pixel4_16 with x in all four lanes
++{
++ pixel4_16 t = {{x, x, x, x}};
++ return t;
++}
++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) // Returns a pixel4_32 with x in all four lanes
++{
++ pixel4_32 t = {{x, x, x, x}};
++ return t;
++}
++#endif
++
++#if PRED_C
++// For chroma we double pixel size so we copy pairs
++#undef pixel
++#undef pixel2
++#undef pixel4
++#undef dctcoef
++#undef INIT_CLIP
++#undef no_rnd_avg_pixel4
++#undef rnd_avg_pixel4
++#undef AV_RN2P
++#undef AV_RN4P
++#undef AV_RN4PA
++#undef AV_WN2P
++#undef AV_WN4P
++#undef AV_WN4PA
++#undef CLIP
++#undef FUNC
++#undef FUNCC
++#undef av_clip_pixel
++#undef PIXEL_SPLAT_X4
++
++#if BIT_DEPTH == 8 // 8-bit samples: a chroma pair is carried in one uint16_t "pixel"
++#define pixel uint16_t
++#define pixel4 pixel4_16
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
++#define cpel uint8_t
++#define c_src_ptr_t c8_src_ptr_t
++#define c_dst_ptr_t c8_dst_ptr_t
++#else // deeper samples: a chroma pair is carried in one uint32_t "pixel"
++#define pixel uint32_t
++#define pixel4 pixel4_32
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
++#define cpel uint16_t
++#define c_src_ptr_t c16_src_ptr_t // const-qualified source view, mirroring c8_src_ptr_t in the 8-bit branch
++#define c_dst_ptr_t c16_dst_ptr_t
++#endif
++#define AV_RN4P(p) (*(pixel4*)(p))
++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
++#endif
++
++
++// Get PW prior to horrid PRED_C trickery
++#if BIT_DEPTH == 8
++#define PW 1
++#else
++#define PW 2
++#endif
++
++
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{
++ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
++ for (unsigned int x = 0; x != size; x++) {
++ printf("%4d", data[x * 2]);
++ }
++ printf("\n");
++ }
++ printf("\n");
++}
++#endif
++
++#ifndef INCLUDED_ONCE
++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) // Fill n 8-bit elements at ptr with v
++{
++ if ((n >>= 2) != 0) { // n becomes the number of 32-bit stores: n < 4 writes nothing, a remainder is dropped
++ uint32_t v4 = v | (v << 8); // v in the low two bytes (byte replication is exact only if v fits in 8 bits)
++ uint32_t * p = (uint32_t *)ptr; // 32-bit stores - presumably callers pass suitably aligned buffers
++ v4 = v4 | (v4 << 16); // ... and now in all four bytes
++ do {
++ *p++ = v4;
++ } while (--n != 0);
++ }
++}
++
++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) // Fill n 16-bit elements at ptr with v
++{
++ if ((n >>= 2) != 0) { // n becomes the number of 4-element groups: n < 4 writes nothing, a remainder is dropped
++ uint32_t v2 = v | (v << 16); // v in both halves (exact only if v fits in 16 bits)
++ uint32_t * p = (uint32_t *)ptr; // 32-bit stores - presumably callers pass suitably aligned buffers
++ do {
++ *p++ = v2; // two 32-bit stores = four 16-bit elements per pass
++ *p++ = v2;
++ } while (--n != 0);
++ }
++}
++
++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) // Fill n 32-bit elements at ptr with v
++{
++ if ((n >>= 2) != 0) { // n becomes the number of 4-element groups: n < 4 writes nothing, a remainder is dropped
++ uint32_t * p = (uint32_t *)ptr;
++ do {
++ *p++ = v; // four stores per pass
++ *p++ = v;
++ *p++ = v;
++ *p++ = v;
++ } while (--n != 0);
++ }
++}
++
++// Beware that this inverts the avail ordering
++// For CIP it seems easier this way round
++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, // Left-edge availability mask for constrained intra pred
++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, // size & s0 arrive in pels
++ unsigned int s0, unsigned int odd_s) // s0: usable down-left length; odd_s: test the first unit on its own
++{
++ const unsigned int n = 1 << log2_intra_bits; // number of result bits governed by each is_intra test
++ unsigned int fa = 0; // result: 1 bit per 4 pels - DL in the low bits, then L, then UL at bit 2*size
++ unsigned int i;
++
++ size >>= 2; // Now in 4-pel units
++ s0 >>= 2;
++
++ if ((avail & AVAIL_DL) != 0)
++ fa |= ((1 << s0) - 1) << (size - s0); // the s0 down-left units adjacent to the left section
++ if ((avail & AVAIL_L) != 0)
++ fa |= ((1 << size) - 1) << size;
++ if ((avail & AVAIL_UL) != 0)
++ fa |= 1 << (size << 1);
++
++ if (odd_s) {
++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) // lone first unit: drop it if its flag is clear
++ fa &= ~1;
++ is_intra += i_stride;
++ }
++
++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { // stops once no candidate bits remain at or above i
++ const unsigned int m = ((1 << n) - 1) << i;
++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) // flag clear: these n units cannot be used
++ fa &= ~m;
++ }
++
++ return fa;
++}
++
++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, // Top-edge availability mask for constrained intra pred
++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, // size & s1 arrive in pels
++ unsigned int s1, unsigned int odd_s) // s1: usable up-right length; odd_s: test the first unit on its own
++{
++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
++ {
++ return 0; // nothing above is available, so is_intra is never read
++ }
++ else
++ {
++ const unsigned int n = 1 << log2_intra_bits; // number of result bits governed by each flag bit
++ unsigned int fa = 0; // result: 1 bit per 4 pels - U in the low bits, UR above them
++ unsigned int i;
++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; // flag bits from two adjacent bytes; bit 0 is the first one tested
++
++ size >>= 2; // Now in 4-pel units
++ s1 >>= 2;
++
++ if ((avail & AVAIL_U) != 0)
++ fa |= ((1 << size) - 1);
++ if ((avail & AVAIL_UR) != 0)
++ fa |= ((1 << s1) - 1) << size;
++
++ if (odd_s) {
++ fa &= im | ~1; // lone first unit: keep bit 0 only if its flag bit is set
++ im >>= 1;
++ }
++
++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { // stops once no candidate bits remain at or above i
++ const unsigned int m = ((1 << n) - 1) << i;
++ if ((im & 1) == 0) // flag clear: these n units cannot be used
++ fa &= ~m;
++ }
++ return fa;
++ }
++}
++
++
++
++static inline unsigned int rmbd(unsigned int x) // Returns the index of the lowest set bit of x
++{
++#if 1
++ return __builtin_ctz(x); // result is undefined for x == 0 - callers must pass a non-zero value
++#else
++ unsigned int n = 0; // plain C alternative, currently compiled out: binary search on the low bits
++ if ((x & 0xffff) == 0) {
++ x >>= 16;
++ n += 16;
++ }
++ if ((x & 0xff) == 0) {
++ x >>= 8;
++ n += 8;
++ }
++ if ((x & 0xf) == 0) {
++ x >>= 4;
++ n += 4;
++ }
++ if ((x & 0x3) == 0) {
++ x >>= 2;
++ n += 2;
++ }
++
++ return (x & 1) == 0 ? n + 1 : n;
++#endif
++}
++#endif
++
++
++static void FUNC(cip_fill)(pixel * const left, pixel * const top, // Build left[-1..2*size-1] and top[0..2*size-1] from the available neighbours
++ const unsigned int avail_l, const unsigned int avail_u, // 1 bit per 4 pels, as produced by cip_avail_l / cip_avail_u
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride, // step between rows of src_l, in units of pixel
++ const unsigned int size)
++{
++ pixel a; // last available pel seen; substituted wherever a group is unavailable
++ unsigned int i;
++
++ // 1st find DL value
++ if ((avail_l & 1) == 0) { // if bit 0 is set instead, the first L iteration assigns a before any read
++ if (avail_l != 0)
++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; // bottom pel of the lowest available left group
++ else
++ {
++ // (avail_l | avail_u) != 0 so this must be good
++ const unsigned int n = rmbd(avail_u)*4;
++ a = (n >= size) ? src_ur[n - size] : src_u[n]; // first pel of the first available top group
++ }
++ }
++
++ // L
++ {
++ pixel * d = left + size * 2 - 1; // written from the bottom of DL upwards
++ const pixel * s = src_l + (size * 2 - 1) * stride;
++ unsigned int x = avail_l;
++ for (i = 0; i < size * 2; i += 4, x >>= 1) // one avail_l bit per group of 4 pels
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = *s;
++ s -= stride;
++ *d-- = a = *s; // remember the topmost pel of this group for later substitution
++ s -= stride;
++ }
++ else
++ {
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ *d-- = a;
++ s -= stride * 4;
++ }
++ }
++ // UL
++ *d = a = (x & 1) != 0 ? *s : a; // d is now left - 1, s is the row above src_l
++ }
++
++ // U
++ {
++ pixel * d = top;
++ const pixel * s = src_u;
++ unsigned int x = avail_u;
++
++ for (i = 0; i < size; i += 4, x >>= 1) // one avail_u bit per group of 4 pels, left to right
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++
++ // UR
++ s = src_ur; // d and x carry on from the U loop; only the source pointer changes
++ for (i = 0; i < size; i += 4, x >>= 1)
++ {
++ if ((x & 1) != 0) {
++ // Avail
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = *s++;
++ *d++ = a = *s++;
++ }
++ else
++ {
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ *d++ = a;
++ s += 4;
++ }
++ }
++ }
++}
++
++
++#if !PRED_C && PW == 1 // non-chroma build at BIT_DEPTH == 8
++#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) // non-chroma at other depths, or chroma pairs at 8 bits (pixel is uint16_t)
++#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
++#else // chroma pairs at other depths (pixel is uint32_t)
++#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
++#endif
++
++// Reqs:
++//
++// Planar: DL[0], L, ul, U, UR[0]
++// DC: dl, L, ul, U, ur
++// A2-9: DL, L, ul, u, ur
++// A10: dl, L, ul, u, ur
++// A11-17 dl, L, UL, U, ur
++// A18-25 dl, L, Ul, U, ur
++// A26 dl, l, ul, U, ur
++// A27-34 dl, l, ul, U, UR
++
++#ifndef INCLUDED_ONCE
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++
++static const uint8_t req_avail_c[35] =
++{
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++};
++
++static const uint8_t req_avail[4][35] = {
++{
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L, // 2
++ AVAIL_DL | AVAIL_L, // 3
++ AVAIL_DL | AVAIL_L, // 4
++ AVAIL_DL | AVAIL_L, // 5
++ AVAIL_DL | AVAIL_L, // 6
++ AVAIL_DL | AVAIL_L, // 7
++ AVAIL_DL | AVAIL_L, // 8
++ AVAIL_DL | AVAIL_L, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V)
++ AVAIL_U | AVAIL_UR, // 27
++ AVAIL_U | AVAIL_UR, // 28
++ AVAIL_U | AVAIL_UR, // 29
++ AVAIL_U | AVAIL_UR, // 30
++ AVAIL_U | AVAIL_UR, // 31
++ AVAIL_U | AVAIL_UR, // 32
++ AVAIL_U | AVAIL_UR, // 33
++ AVAIL_U | AVAIL_UR // 34
++},
++{ // 3
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | 0, // 3
++ AVAIL_DL | AVAIL_L | 0, // 4
++ AVAIL_DL | AVAIL_L | 0, // 5
++ AVAIL_DL | AVAIL_L | 0, // 6
++ AVAIL_DL | AVAIL_L | 0, // 7
++ AVAIL_DL | AVAIL_L | 0, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | 0, // 28
++ AVAIL_U | AVAIL_UR | 0, // 29
++ AVAIL_U | AVAIL_UR | 0, // 30
++ AVAIL_U | AVAIL_UR | 0, // 31
++ AVAIL_U | AVAIL_UR | 0, // 32
++ AVAIL_U | AVAIL_UR | 0, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 4
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7
++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8
++ AVAIL_DL | AVAIL_L | 0, // 9
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | 0, // 27
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
++},
++{ // 5
++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
++ AVAIL_L | 0 | AVAIL_U, // DC
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
++ AVAIL_L | 0, // 10 (H)
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
++ AVAIL_U | 0, // 26 (V)
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
++}
++};
++
++
++#endif
++
++#define filter_light1 FUNC(filter_light1)
++static inline pixel filter_light1(pixel a, pixel b, pixel c) // One tap of the [1 2 1] / 4 smoothing filter, centred on b
++{
++ return (a + b*2 + c + 2) >> 2; // + 2 rounds to nearest before the divide by 4
++}
++
++#define filter_light FUNC(filter_light)
++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) // Smooth n pels read from src (step sstride) into dst; p1 / pn are the pels before the first / after the last
++{
++ pixel p0;
++ pixel p2 = *src;
++ // Allow for final pel - it is just clearer to have the call take the actual number of output pels
++ unsigned int n_minus_1 = n - 1; // NOTE: n must be >= 2 - with n == 1 the pre-decrement below wraps and the loop overruns
++
++ do
++ {
++ src += sstride;
++ p0 = p1; // slide the 3-pel window along by one
++ p1 = p2;
++ p2 = *src; // source is read one pel ahead of the write, so dst may alias src (as the CIP path does)
++ *dst++ = filter_light1(p0, p1, p2);
++ } while (--n_minus_1 != 0);
++ *dst = filter_light1(p1, p2, pn); // last output uses the caller-supplied trailing neighbour
++}
++
++#define filter_strong FUNC(filter_strong)
++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) // Write n pels stepping linearly from p0 towards p1
++{
++ unsigned int a = 64 * p0 + 32; // accumulator with 6 fractional bits; + 32 rounds to nearest
++ const int v = p1 - p0; // per-pel step in 1/64ths of the total difference (may be negative)
++
++ do
++ {
++ *dst++ = (a += v) >> 6; // output i (from 0) is (64 * p0 + (i + 1) * (p1 - p0) + 32) >> 6
++ } while (--n != 0); // n must be non-zero
++}
++
++#define intra_filter FUNC(intra_filter)
++static av_always_inline void intra_filter(
++ pixel * const left, pixel * const top,
++ const unsigned int req, const unsigned int avail,
++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++ const unsigned int stride,
++ const unsigned int top_right_size, const unsigned int down_left_size,
++ const unsigned int log2_size)
++{
++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
++ const unsigned int size = 1 << log2_size;
++
++ // a_ is the first pel in a section working round dl -> ur
++ // b_ is the last
++ // Beware that top & left work out from UL so usage of a_ & b_ may
++ // swap between them. It is a bad naming scheme but I have found no
++ // better
++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
++ const pixel * b_dl = src_l + size * stride;
++ const pixel * a_l = src_l + (size - 1) * stride;
++ const pixel * b_l = src_l;
++ const pixel * ab_ul = src_l - stride;
++ const pixel * a_u = src_u;
++ const pixel * b_u = src_u + size - 1;
++ const pixel * a_ur = src_ur;
++ const pixel * b_ur = src_ur + top_right_size - 1;
++
++ const unsigned int want = req & ~avail;
++ const unsigned int have = req & avail;
++ unsigned int i;
++
++ if ((avail & AVAIL_DL) == 0)
++ {
++ a_dl = a_ur;
++ if ((avail & AVAIL_U) != 0)
++ a_dl = a_u;
++ if ((avail & AVAIL_UL) != 0)
++ a_dl = ab_ul;
++ if ((avail & AVAIL_L) != 0)
++ a_dl = a_l;
++ b_dl = a_dl;
++ }
++
++ if ((avail & AVAIL_L) == 0)
++ {
++ a_l = b_dl;
++ b_l = b_dl;
++ }
++ if ((avail & AVAIL_UL) == 0)
++ {
++ ab_ul = b_l;
++ }
++ if ((avail & AVAIL_U) == 0)
++ {
++ a_u = ab_ul;
++ b_u = ab_ul;
++ }
++ if ((avail & AVAIL_UR) == 0)
++ {
++ a_ur = b_u;
++ b_ur = b_u;
++ }
++
++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints
++ {
++ if ((req & AVAIL_UL) != 0)
++ left[-1] = *ab_ul;
++
++ if ((want & AVAIL_L) != 0)
++ EXTEND(left, *a_l, size);
++ if ((want & AVAIL_DL) != 0)
++ EXTEND(left + size, *a_dl, size);
++ if ((want & AVAIL_U) != 0)
++ EXTEND(top, *a_u, size);
++ if ((want & AVAIL_UR) != 0)
++ EXTEND(top + size, *a_ur, size);
++
++ if ((have & AVAIL_U) != 0)
++ // Always good - even with sand
++ memcpy(top, a_u, size * sizeof(pixel));
++ if ((have & AVAIL_UR) != 0)
++ {
++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
++ EXTEND(top + size + top_right_size, *b_ur,
++ size - top_right_size);
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ for (i = 0; i < size; i++)
++ left[i] = b_l[stride * i];
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ for (i = 0; i < down_left_size; i++)
++ left[i + size] = b_dl[stride * i];
++ EXTEND(left + size + down_left_size, *a_dl,
++ size - down_left_size);
++ }
++ }
++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
++ {
++ if ((req & (AVAIL_U | AVAIL_UR)) != 0)
++ filter_strong(top, *ab_ul, *b_ur, size * 2);
++ left[-1] = *ab_ul;
++ if ((req & (AVAIL_L | AVAIL_DL)) != 0)
++ filter_strong(left, *ab_ul, *a_dl, size*2);
++ }
++ else
++ {
++ // Same code for both have & want for UL
++ if ((req & AVAIL_UL) != 0)
++ {
++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
++ }
++
++ if ((want & AVAIL_L) != 0)
++ {
++ EXTEND(left, *a_l, size);
++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
++ }
++ if ((want & AVAIL_DL) != 0)
++ {
++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
++ EXTEND(left + size, *a_l, size);
++ }
++ if ((want & AVAIL_U) != 0)
++ {
++ EXTEND(top, *a_u, size);
++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
++ }
++ if ((want & AVAIL_UR) != 0)
++ {
++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
++ EXTEND(top + size, *a_ur, size);
++ }
++
++ if ((have & AVAIL_U) != 0)
++ {
++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
++ }
++ if ((have & AVAIL_UR) != 0) {
++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
++ top[size*2 - 1] = *b_ur;
++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
++ }
++ if ((have & AVAIL_L) != 0)
++ {
++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
++ }
++ if ((have & AVAIL_DL) != 0)
++ {
++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
++ left[size*2 - 1] = *a_dl;
++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
++ }
++ }
++}
++
++#define INTRA_FILTER(log2_size) \
++static void FUNC(intra_filter_ ## log2_size)( \
++ uint8_t * const left, uint8_t * const top, \
++ const unsigned int req, const unsigned int avail, \
++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
++ const unsigned int stride, \
++ const unsigned int top_right_size, const unsigned int down_left_size) \
++{ \
++ intra_filter((pixel *)left, (pixel *)top, req, avail, \
++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
++}
++
++INTRA_FILTER(2)
++INTRA_FILTER(3)
++INTRA_FILTER(4)
++INTRA_FILTER(5)
++
++#undef intra_filter
++#undef INTRA_FILTER
++
++static void FUNC(intra_pred)(const HEVCRpiContext * const s, // Predict one intra block of s->frame in place
++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, // x0, y0 are shifted down by hshift / vshift below
++ const unsigned int log2_size) // block is (1 << log2_size) pels square; log2_size - 2 indexes the function tables
++{
++ // c_idx will always be 1 for _c versions and 0 for y
++ const unsigned int c_idx = PRED_C;
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ const unsigned int size = (1 << log2_size);
++ const unsigned int x = x0 >> hshift; // block position in this plane's own units
++ const unsigned int y = y0 >> vshift;
++
++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
++ pixel *const src = c_idx == 0 ?
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
++
++ // Align so we can do multiple loads in the asm
++ // Padded to 16 byte boundary so as not to confuse anything
++ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++
++ pixel * const left = left_array + 16 / sizeof(pixel); // 16 bytes of headroom so left[-1] (UL) is addressable
++ const pixel * top_pred = top; // may be redirected to the frame itself below
++
++ const pixel * src_l = src - 1;
++ const pixel * src_u = src - stride;
++ const pixel * src_ur = src_u + size;
++#if !PRED_C
++ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
++#else
++ const unsigned int req = req_avail_c[mode];
++#endif
++
++ // If we have nothing to pred from then fill with grey
++ // This isn't a common case but dealing with it here means we don't have to
++ // test for it later
++ if (avail == 0)
++ {
++dc_only: // also reached by goto when constrained intra pred leaves nothing usable
++#if !PRED_C
++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
++#else
++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
++#endif
++ return;
++ }
++
++ {
++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
++ const AVFrame * const frame = s->frame;
++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
++ if ((x & mask) == 0) // block begins at a multiple of stride: presumably its left neighbours sit in the previous stripe
++ src_l -= stripe_adj;
++ if (((x + size) & mask) == 0) // block ends at a multiple of stride: presumably up-right sits in the next stripe
++ src_ur += stripe_adj;
++ }
++
++ // Can deal with I-slices in 'normal' code even if CIP
++ // This also means that we don't need to generate (elsewhere) is_intra
++ // for IRAP frames
++ if (s->ps.pps->constrained_intra_pred_flag == 1 &&
++ s->sh.slice_type != HEVC_SLICE_I)
++ {
++ // * If we ever actually care about CIP performance then we should
++ // special case out size 4 stuff (can be done by 'normal') and
++ // have 8-pel avail masks
++ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
++ -(int)(s->ps.sps->pcm_width), // negative step: is_intra is walked upwards from the bottom of DL
++ 1 << (((x - 1) >> (3 - hshift)) & 7),
++ 1 - hshift,
++ avail,
++ size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
++ vshift != 0 ? 0 : (y >> 2) & 1);
++
++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
++ (x >> (3 - hshift)) & 7,
++ 1 - hshift,
++ avail,
++ size,
++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
++ hshift != 0 ? 0 : (x >> 2) & 1);
++
++ // Anything left?
++ if ((avail_l | avail_u) == 0)
++ goto dc_only;
++
++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
++
++#if !PRED_C
++ if ((req & FILTER_LIGHT) != 0)
++ {
++ const unsigned threshold = 1 << (BIT_DEPTH - 5);
++ if ((req & FILTER_STRONG) != 0 && // fixed indices 31 / 63 below assume size == 32 - presumably FILTER_STRONG is only requested for that size; confirm
++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold &&
++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
++ {
++ filter_strong(top, left[-1], top[63], 64);
++ filter_strong(left, left[-1], left[63], 64);
++ } else
++ {
++ // LHS writes UL too so copy for top
++ const pixel p_ul = left[-1];
++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); // in place: left[-1] .. left[2*size - 2]
++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); // in place: top[0] .. top[2*size - 2]
++ }
++ }
++#endif
++ }
++ else
++ {
++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && // every requested bit is an available U / UR bit ...
++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) // ... and any UR needed is full size and contiguous with U
++ {
++ top_pred = src_u; // predict straight from the frame - no copy or filter needed
++ }
++ else
++ {
++#if !PRED_C
++ s->hpc.intra_filter[log2_size - 2]
++#else
++ s->hpc.intra_filter_c[log2_size - 2]
++#endif
++ ((uint8_t *)left, (uint8_t *)top, req, avail,
++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), // stride handed over in bytes here
++ ur_size,
++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
++ }
++ }
++
++
++#if !PRED_C
++ switch (mode) { // planar, DC, pure horizontal and pure vertical have dedicated functions
++ case INTRA_PLANAR:
++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ default:
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ }
++#else
++ switch (mode) { // same dispatch as above, using the _c function tables
++ case INTRA_PLANAR:
++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_ANGULAR_HORIZONTAL:
++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ case INTRA_ANGULAR_VERTICAL:
++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ default:
++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++ (uint8_t *)left, stride,
++ mode);
++ break;
++ }
++
++#if DUMP_PRED
++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
++}
++
++#if !PRED_C
++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, // Planar prediction for a single-component block
++ const uint8_t *_left, ptrdiff_t stride, // stride is in units of pixel (see POS)
++ int trafo_size) // log2 of the block size
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int size = 1 << trafo_size;
++ for (y = 0; y < size; y++)
++ for (x = 0; x < size; x++)
++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + // horizontal blend: left[y] towards top[size]
++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); // vertical blend: top[x] towards left[size]; + size rounds
++}
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, // Planar prediction for a block of interleaved chroma pairs
++ const uint8_t * _left, ptrdiff_t stride, // stride is in pairs (src advances by it per row)
++ int trafo_size) // log2 of the block size
++{
++ int x, y;
++ int size = 1 << trafo_size;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src; // [n][0] and [n][1] are the two components of pair n
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
++
++ for (y = 0; y < size; y++, src += stride)
++ {
++ for (x = 0; x < size; x++)
++ {
++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + // same formula applied to each component independently
++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
++ }
++ }
++}
++#endif
++
++// Fixed-size planar entry points: pred_planar_<n> predicts a block of side
++// 1 << (n + 2), i.e. 4, 8, 16 and 32 for n = 0..3.
++#define PRED_PLANAR(n) \
++static void FUNC(pred_planar_ ## n)(uint8_t *src, const uint8_t *top, \
++                                    const uint8_t *left, ptrdiff_t stride) \
++{ \
++    FUNC(pred_planar)(src, top, left, stride, (n) + 2); \
++}
++
++PRED_PLANAR(0)
++PRED_PLANAR(1)
++PRED_PLANAR(2)
++PRED_PLANAR(3)
++
++#undef PRED_PLANAR
++
++#if !PRED_C
++// DC intra prediction for one single-component block of side (1 << log2_size).
++// The block is filled with the rounded mean of the first `size` left and top
++// reference samples; blocks smaller than 32 then have their first row and
++// first column blended with the adjacent reference samples.
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                          const uint8_t *_left,
++                          ptrdiff_t stride, int log2_size)
++{
++    const int n = 1 << log2_size;
++    pixel *src = (pixel *)_src;   // name kept: POS() presumably refers to it
++    const pixel *const above = (const pixel *)_top;
++    const pixel *const beside = (const pixel *)_left;
++    int dc = n;                   // bias so the shift below rounds to nearest
++    pixel4 fill;
++    int k, row, col;
++
++    for (k = 0; k < n; k++)
++        dc += beside[k] + above[k];
++    dc >>= log2_size + 1;         // mean over the 2 * n reference samples
++
++    fill = PIXEL_SPLAT_X4(dc);
++    for (row = 0; row < n; row++) {
++        for (col = 0; col < n; col += 4)
++            AV_WN4P(&POS(col, row), fill);
++    }
++
++    // An earlier form of this test was (c_idx == 0 && size < 32); as there
++    // are now separate functions for y & c only the size is checked.
++    if (n >= 32)
++        return;
++
++    POS(0, 0) = (beside[0] + 2 * dc + above[0] + 2) >> 2;
++    for (col = 1; col < n; col++)
++        POS(col, 0) = (above[col] + 3 * dc + 2) >> 2;
++    for (row = 1; row < n; row++)
++        POS(0, row) = (beside[row] + 3 * dc + 2) >> 2;
++}
++#else
++// DC intra prediction for one block of interleaved two-component samples.
++// Each component is filled with the rounded mean of its own left and top
++// reference samples. Unlike the single-component variant no edge blending is
++// applied afterwards.
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                          const uint8_t *_left,
++                          ptrdiff_t stride, int log2_size)
++{
++    const unsigned int n = (1 << log2_size);
++    c_dst_ptr_t line = (c_dst_ptr_t)_src;
++    const c_src_ptr_t above = (c_src_ptr_t)_top;
++    const c_src_ptr_t beside = (c_src_ptr_t)_left;
++    unsigned int dc[2] = { n, n };   // bias so the shift below rounds to nearest
++    unsigned int k, row, col;
++
++    for (k = 0; k < n; k++) {
++        dc[0] += beside[k][0] + above[k][0];
++        dc[1] += beside[k][1] + above[k][1];
++    }
++    dc[0] >>= log2_size + 1;
++    dc[1] >>= log2_size + 1;
++
++    for (row = 0; row < n; row++) {
++        for (col = 0; col < n; col++) {
++            line[col][0] = dc[0];
++            line[col][1] = dc[1];
++        }
++        line += stride;
++    }
++}
++#endif
++
++// Fixed-size DC entry points: pred_dc_<n> predicts a block of side
++// 1 << (n + 2), i.e. 4, 8, 16 and 32 for n = 0..3.
++#define PRED_DC(n) \
++static void FUNC(pred_dc_ ## n)(uint8_t *src, const uint8_t *top, \
++                                const uint8_t *left, ptrdiff_t stride) \
++{ \
++    FUNC(pred_dc)(src, top, left, stride, (n) + 2); \
++}
++
++PRED_DC(0)
++PRED_DC(1)
++PRED_DC(2)
++PRED_DC(3)
++
++#undef PRED_DC
++
++
++
++
++#if !PRED_C
++// Fill one single-component block of side (1 << log2_size) with the
++// mid-range value 1 << (BIT_DEPTH - 1); no reference samples are read.
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{
++    const int n = (1 << log2_size);
++    pixel *src = (pixel *)_src;   // name kept: POS() presumably refers to it
++    const pixel4 mid = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
++    int row, col;
++
++    for (row = 0; row < n; row++) {
++        // Written four pixels at a time
++        for (col = 0; col < n; col += 4)
++            AV_WN4P(&POS(col, row), mid);
++    }
++}
++#else
++// Fill one block of interleaved two-component samples, side (1 << log2_size),
++// with the mid-range value 1 << (BIT_DEPTH - 1) in both components; no
++// reference samples are read.
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{
++    const unsigned int n = (1 << log2_size);
++    const pixel mid = (1 << (BIT_DEPTH - 1));
++    c_dst_ptr_t line = (c_dst_ptr_t)_src;
++    unsigned int row, col;
++
++    for (row = 0; row < n; row++) {
++        for (col = 0; col < n; col++) {
++            line[col][0] = mid;
++            line[col][1] = mid;
++        }
++        line += stride;
++    }
++}
++#endif
++
++// Fixed-size mid-value fill entry points: pred_dc0_<n> fills a block of side
++// 1 << (n + 2), i.e. 4, 8, 16 and 32 for n = 0..3.
++#define PRED_DC0(n) \
++static void FUNC(pred_dc0_ ## n)(uint8_t *src, ptrdiff_t stride) \
++{ \
++    FUNC(pred_dc0)(src, stride, (n) + 2); \
++}
++
++PRED_DC0(0)
++PRED_DC0(1)
++PRED_DC0(2)
++PRED_DC0(3)
++
++#undef PRED_DC0
++
++
++
++
++#ifndef ANGLE_CONSTS // define the tables only once per translation unit (presumably this template is included more than once - confirm)
++#define ANGLE_CONSTS
++static const int intra_pred_angle[] = { // 33 entries, indexed by (mode - 2) in pred_angular, i.e. modes 2..34; units of 1/32 sample
++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
++};
++static const int inv_angle[] = { // 15 entries, indexed by (mode - 11), i.e. modes 11..25 - exactly the modes whose angle above is negative
++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, // each value is 8192 / angle rounded to nearest (e.g. 8192 / -2 = -4096)
++ -630, -910, -1638, -4096
++};
++#endif
++
++#if !PRED_C
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride,
++ int mode, int size) // mode: angular mode, used as table index (mode - 2) and (mode - 11); size: block side in samples
++{ // Angular intra prediction of one size x size single-component block from the top/left reference arrays
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ // angle is in 1/32-sample units: below, ">> 5" gives the whole-sample offset and "& 31" the fraction
++ int angle = intra_pred_angle[mode - 2];
++ pixel ref_array[3 * MAX_TB_SIZE + 4]; // scratch for a reference line that extends to negative indices
++ pixel *ref_tmp = ref_array + size; // ref_tmp[-size..-1] stays inside ref_array; last >= -size because angle >= -32
++ const pixel *ref;
++ int last = (size * angle) >> 5; // reference offset reached by the last row/column; negative when angle < 0
++
++ if (mode >= 18) { // modes 18 and up: predict row by row from the top reference
++ ref = top - 1; // so that ref[k + 1] == top[k]
++
++ if (angle < 0) // negative angle: rows also need samples projected from the left column
++ {
++ memcpy(ref_tmp + 1, top, size * PW); // ref_tmp[1..size] = top[0..size-1] (PW is defined outside this view; presumably bytes per pixel)
++ ref_tmp[0] = left[-1]; // corner sample
++
++ for (x = last; x <= -1; x++) // fill ref_tmp[last..-1] from left[], position chosen via the inv_angle table
++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
++
++ for (y = 0; y < size; y++) {
++ int idx = ((y + 1) * angle) >> 5; // whole-sample offset for this row
++ int fact = ((y + 1) * angle) & 31; // fractional position, 0..31
++ if (fact) { // between two samples: weighted average of the neighbours, unrolled 4 pixels per iteration
++ for (x = 0; x < size; x += 4) {
++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
++ fact * ref[x + idx + 2] + 16) >> 5;
++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
++ fact * ref[x + 1 + idx + 2] + 16) >> 5;
++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
++ fact * ref[x + 2 + idx + 2] + 16) >> 5;
++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
++ fact * ref[x + 3 + idx + 2] + 16) >> 5;
++ }
++ } else { // exactly on a sample: copy 4 pixels at a time
++ for (x = 0; x < size; x += 4)
++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
++ }
++ }
++ if (mode == 26 && size < 32) { // mode 26 has angle 0 (pure vertical): for blocks under 32 the first column is adjusted by half the left-edge gradient
++ for (y = 0; y < size; y++)
++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
++ }
++
++ } else { // modes below 18: predict column by column from the left reference
++ ref = left - 1; // so that ref[k + 1] == left[k]
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4) // copy left[-1..] into ref_tmp[0..] in groups of 4 (covers at least size + 1 samples)
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
++ // Inv angle <= -256 so top offset >= 0
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
++
++ for (x = 0; x < size; x++) {
++ int idx = ((x + 1) * angle) >> 5; // whole-sample offset for this column
++ int fact = ((x + 1) * angle) & 31; // fractional position, 0..31
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
++ fact * ref[y + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ POS(x, y) = ref[y + idx + 1];
++ }
++ }
++ if (mode == 10 && size < 32) { // mode 10 has angle 0 (pure horizontal): for blocks under 32 the first row is adjusted by half the top-edge gradient
++ for (x = 0; x < size; x += 4) {
++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1));
++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
++ }
++ }
++ }
++}
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride,
++ int mode, int size) // mode: angular mode, used as table index (mode - 2) and (mode - 11); size: block side in two-component elements
++{ // Angular intra prediction of one size x size block of interleaved two-component samples; no mode 10/26 edge adjustment is done in this variant
++ int x, y;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ c_src_ptr_t top = (c_src_ptr_t)_top;
++ c_src_ptr_t left = (c_src_ptr_t)_left;
++ // angle is in 1/32-sample units: below, ">> 5" gives the whole-sample offset and "& 31" the fraction
++ const int angle = intra_pred_angle[mode - 2];
++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; // scratch for a reference line that extends to negative indices
++ c_dst_ptr_t ref_tmp = ref_array + size; // ref_tmp[-size..-1] stays inside ref_array; last >= -size because angle >= -32
++ c_src_ptr_t ref;
++ const int last = (size * angle) >> 5; // reference offset reached by the last row/column; negative when angle < 0
++
++ if (mode >= 18) { // modes 18 and up: predict row by row from the top reference
++ ref = top - 1; // so that ref[k + 1] == top[k]
++ if (angle < 0) { // negative angle: rows also need samples projected from the left column
++ memcpy(ref_tmp + 1, top, size * 2 * PW); // two components per element (PW is defined outside this view; presumably bytes per component)
++ ref_tmp[0][0] = left[-1][0]; // corner sample, both components
++ ref_tmp[0][1] = left[-1][1];
++ for (x = last; x <= -1; x++) // fill ref_tmp[last..-1] from left[], position chosen via the inv_angle table
++ {
++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
++
++ for (y = 0; y < size; y++, src += stride) { // src advances one output row per iteration
++ const int idx = ((y + 1) * angle) >> 5; // whole-sample offset for this row
++ const int fact = ((y + 1) * angle) & 31; // fractional position, 0..31
++ if (fact) { // between two samples: weighted average of the neighbours, per component
++ for (x = 0; x < size; ++x) {
++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++ fact * ref[x + idx + 2][0] + 16) >> 5;
++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++ fact * ref[x + idx + 2][1] + 16) >> 5;
++ }
++ } else { // exactly on a sample: copy the whole row
++ memcpy(src, ref + idx + 1, size * 2 * PW);
++ }
++ }
++ } else { // modes below 18: predict column by column from the left reference
++ ref = left - 1; // so that ref[k + 1] == left[k]
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); // ref_tmp[0..size] = left[-1..size-1]
++ for (x = last; x <= -1; x++) // fill ref_tmp[last..-1] from top[], position chosen via the inv_angle table
++ {
++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
++
++ for (x = 0; x < size; x++, src++) { // src advances one output column per iteration
++ const int idx = ((x + 1) * angle) >> 5; // whole-sample offset for this column
++ const int fact = ((x + 1) * angle) & 31; // fractional position, 0..31
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++ fact * ref[y + idx + 2][0] + 16) >> 5;
++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++ fact * ref[y + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ {
++ src[y * stride][0] = ref[y + idx + 1][0];
++ src[y * stride][1] = ref[y + idx + 1][1];
++ }
++ }
++ }
++ }
++}
++#endif
++
++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
++}
++
++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
++}
++
++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
++}
++
++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
++}
++
++#undef cpel
++#undef c_src_ptr_t
++#undef c_dst_ptr_t
++
++#undef EXTEND
++#undef POS
++#undef PW
++
++#undef filter_light1
++#undef filter_light
++#undef filter_strong
++#undef ref_gen
++
++#ifndef INCLUDED_ONCE
++#define INCLUDED_ONCE
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,155 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/ioctl.h>
++
++#include <linux/ioctl.h>
++
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
++
++#include "rpi_mailbox.h"
++//#include <interface/vctypes/vc_image_structs.h>
++
++/*
++ * Use ioctl to send one mbox property message.
++ *
++ * file_desc: descriptor for the mailbox device (see mbox_open()).
++ * buf:       message buffer; its first 32-bit word holds the total message
++ *            size in bytes (that is how the DEBUG dump below sizes itself).
++ *
++ * Returns the ioctl() result; a negative value is also reported on stdout.
++ */
++
++static int mbox_property(int file_desc, void *buf)
++{
++    const int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
++
++    if (ret_val < 0) {
++        printf("ioctl_set_msg failed:%d\n", ret_val);
++    }
++
++#ifdef DEBUG
++    {
++        // Dump the whole message, one 32-bit word per line, as offset: value.
++        const unsigned *const words = buf;
++        const unsigned n_words = words[0] / 4;
++        unsigned i;
++
++        for (i = 0; i < n_words; i++)
++            printf("%04zx: 0x%08x\n", i * sizeof *words, words[i]); // offset is a size_t, hence %zx
++    }
++#endif
++    return ret_val;
++}
++
++#define GET_VCIMAGE_PARAMS 0x30044
++
++// Send a GET_VCIMAGE_PARAMS property message whose payload is *img, then
++// overwrite *img with the payload area as it stands after the ioctl (this
++// copy-back happens whether or not the ioctl succeeded).
++// Returns the mbox_property() result.
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{
++    enum {
++        HDR_WORDS = 5,   // size, request code, tag, and two length words
++        IMG_WORDS = sizeof(*img) / sizeof(uint32_t)
++    };
++    uint32_t msg[sizeof(*img) / sizeof(uint32_t) + 32];
++    uint32_t * const payload = msg + HDR_WORDS;
++    int rv;
++
++    msg[0] = (HDR_WORDS + IMG_WORDS + 1) * sizeof(uint32_t); // size: everything up to and including the end tag
++    msg[1] = 0; // process request
++    msg[2] = GET_VCIMAGE_PARAMS;
++    msg[3] = sizeof(*img);
++    msg[4] = sizeof(*img);
++    memcpy(payload, img, sizeof(*img));
++    payload[IMG_WORDS] = 0; // End tag
++
++    rv = mbox_property(fd, msg);
++    memcpy(img, payload, sizeof(*img));
++
++    return rv;
++}
++
++
++#define SET_CLOCK_RATE 0x00038002
++#define GET_MAX_CLOCK 0x00030004
++#define CLOCK_HEVC 11
++
++// Send a property message carrying one tag (`command`) with two 32-bit value
++// words taken from *word0 and *word1.
++// On return *word0 and *word1 are loaded from message words 6 and 7 - the
++// second value word and the word where the end tag was written - not from
++// words 5 and 6 where the inputs were placed. mbox_request_clock() relies on
++// this: it takes the value it needs from *word0.
++// Returns the mbox_property() result.
++static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
++{
++    enum { MSG_WORDS = 8 };   // words actually used; the buffer itself is larger
++    uint32_t msg[32];
++    int rv;
++
++    msg[0] = MSG_WORDS * sizeof(msg[0]); // size
++    msg[1] = 0; // process request
++    msg[2] = command;
++    msg[3] = 8; // two 4-byte value words
++    msg[4] = 8;
++    msg[5] = *word0;
++    msg[6] = *word1;
++    msg[7] = 0; // End tag
++
++    rv = mbox_property(fd, msg);
++    *word0 = msg[6];
++    *word1 = msg[7];
++    return rv;
++}
++
++// Open the mailbox character device (DEVICE_FILE_NAME) read-only.
++// Returns the file descriptor, or the negative value from open() on failure,
++// in which case a hint on creating the device node is printed to stdout.
++// Declared with (void) to match the prototype in rpi_mailbox.h.
++int mbox_open(void)
++{
++    // open a char device file used for communicating with kernel mbox driver
++    const int file_desc = open(DEVICE_FILE_NAME, O_RDONLY);
++
++    if (file_desc < 0) {
++        printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++        printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++    }
++    return file_desc;
++}
++
++// Close a descriptor obtained from mbox_open(); the close() result is ignored.
++void mbox_close(int file_desc)
++{
++    close(file_desc);
++}
++
++// Query GET_MAX_CLOCK for CLOCK_HEVC and then issue SET_CLOCK_RATE with the
++// value that query handed back (presumably the maximum rate - confirm against
++// the firmware property documentation).
++// Returns the first non-zero result from mbox_property_generic(), else 0.
++int mbox_request_clock(int fd)
++{
++    unsigned id = CLOCK_HEVC;
++    unsigned value = 0;
++    const int rv = mbox_property_generic(fd, GET_MAX_CLOCK, &id, &value);
++
++    if (rv != 0)
++        return rv;
++
++    // mbox_property_generic() returns the second value word of the message
++    // through its first out-parameter, so the queried value is now in `id`.
++    value = id;
++    id = CLOCK_HEVC;
++    return mbox_property_generic(fd, SET_CLOCK_RATE, &id, &value);
++}
++
++// Issue SET_CLOCK_RATE for CLOCK_HEVC with a rate word of 0 (presumably this
++// drops the earlier request made by mbox_request_clock() - confirm).
++// Returns the mbox_property_generic() result.
++int mbox_release_clock(int fd)
++{
++    unsigned id = CLOCK_HEVC;
++    unsigned rate = 0;
++
++    return mbox_property_generic(fd, SET_CLOCK_RATE, &id, &rate);
++}
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,58 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
++
++/* The image structure: first the "uv" extra-data record used as a member of VC_IMAGE_EXTRA_T. */
++typedef struct vc_image_extra_uv_s {
++ void *u, *v; /* presumably the U and V plane addresses - TODO confirm */
++ int vpitch; /* presumably a pitch for those planes - TODO confirm */
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union { /* format-specific extra data embedded in VC_IMAGE_T; only the uv member is enabled here */
++ VC_IMAGE_EXTRA_UV_T uv;
++// VC_IMAGE_EXTRA_RGBA_T rgba;
++// VC_IMAGE_EXTRA_PAL_T pal;
++// VC_IMAGE_EXTRA_TF_T tf;
++// VC_IMAGE_EXTRA_BAYER_T bayer;
++// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++// VC_IMAGE_EXTRA_CODEC_T codec;
++// VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T; /* the remaining members are left commented out, so the union is the size of the uv record */
++
++
++typedef struct VC_IMAGE_T { /* NOTE(review): uses uint32_t/uint8_t but this header includes no <stdint.h>; every includer must provide it first */
++ unsigned short type; /* should restrict to 16 bits */
++ unsigned short info; /* format-specific info; zero for VC02 behaviour */
++ unsigned short width; /* width in pixels */
++ unsigned short height; /* height in pixels */
++ int pitch; /* pitch of image_data array in bytes */
++ int size; /* number of bytes available in image_data array */
++ void *image_data; /* pixel data */
++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
++ void *metadata; /* metadata header for the image */
++ void *pool_object; /* non-NULL if image was allocated from a vc_pool */
++ int mem_handle; /* the mem handle for relocatable memory storage */
++ int metadata_size; /* size of metadata of each channel in bytes */
++ int channel_offset; /* offset of consecutive channels in bytes */
++ uint32_t video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
++ uint8_t num_channels; /* number of channels (2 for stereo) */
++ uint8_t current_channel;/* the channel this header is currently pointing to */
++ uint8_t linked_multichann_flag;/* Indicates that the header has the linked-multichannel structure */
++ uint8_t is_channel_linked; /* Tracks whether the above structure is being used to link the header
++ into a linked-multichannel image */
++ uint8_t channel_index; /* index of the channel this header represents while
++ it is being linked. */
++ uint8_t _dummy[3]; /* pad struct to 64 bytes */
++} VC_IMAGE_T;
++
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; /* compile-time check: array size is 1 when the struct is 64 bytes, otherwise -1 (a compile error); the fields sum to 64 only with 4-byte pointers */
++
++
++extern int mbox_open(void); /* opens the mailbox device; returns the descriptor, negative on failure */
++extern void mbox_close(int file_desc); /* closes a descriptor returned by mbox_open() */
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img); /* sends *img with the image-params tag and overwrites *img with the reply payload; returns the ioctl result */
++
++int mbox_request_clock(int fd); /* queries the max clock for the HEVC clock id, then issues a set-clock-rate with the value returned; 0 on success */
++int mbox_release_clock(int fd); /* issues a set-clock-rate of 0 for the HEVC clock id; returns the ioctl result */
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_mem.c
+@@ -0,0 +1,326 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++
++#include "config.h"
++
++#include "libavutil/avassert.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++
++#define OPT_PREFER_CMA 0
++
++struct rpi_cache_flush_env_s { // definition of the rpi_cache_flush_env_t that rpi_mem.h only forward-declares; rpi_cache_flush_init() places it in a caller-supplied rpi_cache_buf_t
++ struct vcsm_user_clean_invalid2_s v; // request built up by the rpi_cache_flush_add_* calls and submitted by rpi_cache_flush_execute()
++};
++
++
++// GPU memory alloc fns (internal)
++
++// Release whatever parts of *p were acquired - the ARM mapping first, then
++// the VCSM allocation (the reverse of gpu_malloc_internal()) - and zero *p.
++// Safe on a partially filled or already zeroed descriptor.
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{
++    if (p->arm)
++        vcsm_unlock_ptr(p->arm);
++
++    if (p->vcsm_handle)
++        vcsm_free(p->vcsm_handle);
++
++    // Ensure we crash hard if we try and use this again
++    memset(p, 0, sizeof(*p));
++}
++
++
++// Allocate numbytes (rounded up to a multiple of 256) from VCSM with the
++// given cache type and fill in *p: VCSM handle, VC handle, locked ARM pointer
++// and VC address, acquired in that order.
++// Returns 0 on success. On any failure the step is logged, everything
++// acquired so far is released, *p is zeroed and AVERROR(ENOMEM) is returned.
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++    const int numbytes, const unsigned int cache_type, const char * const name)
++{
++    memset(p, 0, sizeof(*p));
++    p->numbytes = (numbytes + 255) & ~255; // Round up
++
++    // Each step only runs if the previous one succeeded; the first failure
++    // logs and falls through to the common cleanup below.
++    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0)
++        av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
++    else if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
++        av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name);
++    else if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL)
++        av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
++    else if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++        av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
++    else
++        return 0;
++
++    gpu_free_internal(p);
++    return AVERROR(ENOMEM);
++}
++
++// Public gpu fns
++
++// Allocate memory on GPU that will not be cached in ARM's data cache, so it
++// is safe to use without data cache flushing.
++// Fills in structure <p> with the ARM pointer, videocore handle, videocore
++// memory address and numbytes (rounded up by the allocator).
++// Returns 0 on success, AVERROR(ENOMEM) on failure.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++    static const char alloc_name[] = "ffmpeg uncached";
++
++    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, alloc_name);
++}
++
++// Allocate memory on GPU that will be
++//    Cached in ARM L2
++//    Uncached in VPU L2
++// <p> is filled in as for gpu_malloc_uncached().
++// Returns 0 on success, AVERROR(ENOMEM) on failure.
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{
++    static const char alloc_name[] = "ffmpeg cached";
++
++    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, alloc_name);
++}
++
++// Public wrapper: release an allocation made by gpu_malloc_cached() or
++// gpu_malloc_uncached() and zero the descriptor.
++void gpu_free(GPU_MEM_PTR_T * const p)
++{
++    gpu_free_internal(p);
++}
++
++// Undo rpi_mem_gpu_init(): shut down VCSM, then the bcm_host library.
++void rpi_mem_gpu_uninit(void)
++{
++    vcsm_exit();
++
++    bcm_host_deinit();
++}
++
++// Initialise VCSM and bcm_host.
++// VCSM is first initialised with the argument suggested by
++// bcm_host_is_fkms_active() (1 when active, 0 otherwise); if that fails the
++// opposite argument is tried. `flags` is currently ignored.
++// Returns 2 if the first attempt succeeded, 1 if only the second did, or
++// AVERROR(EINVAL) if both failed.
++// NOTE(review): the result records which attempt succeeded, not which back
++// end was requested: when bcm_host_is_fkms_active() is 0, a successful first
++// attempt (made with argument 0) still returns 2, which equals GPU_INIT_CMA
++// in rpi_mem.h. Confirm this is intended.
++int rpi_mem_gpu_init(const unsigned int flags)
++{
++    const int first_choice = bcm_host_is_fkms_active() ? 1 : 0;
++    int use_cma;
++
++    (void)flags;
++
++    if (vcsm_init_ex(first_choice, -1) == 0) {
++        use_cma = 1;
++    } else if (vcsm_init_ex(!first_choice, -1) == 0) {
++        use_cma = 0;
++    } else {
++        return AVERROR(EINVAL);
++    }
++
++    bcm_host_init();
++
++    return use_cma + 1;
++}
++
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
++
++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
++
++// Set up a flush environment inside caller-supplied storage and return it.
++// The environment starts with no queued operations; nothing is allocated.
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
++{
++    const rpi_cache_flush_env_t empty = {.v={.op_count = 0}};
++    rpi_cache_flush_env_t * const env = (rpi_cache_flush_env_t *)buf;
++
++    *env = empty;
++    return env;
++}
++
++// Discard an environment without flushing. The environment lives in the
++// caller's buffer, so there is nothing to release.
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{
++    (void)rfe; // Nothing needed
++}
++
++// Submit every queued operation with vcsm_clean_invalid2() and empty the
++// queue (the queue is emptied even if the call fails).
++// Returns 0 when the queue was empty or the call succeeded, otherwise
++// AVERROR(errno) after logging the failure.
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
++{
++    int rc = 0;
++
++    if (rfe->v.op_count == 0)
++        return 0;
++
++    if (vcsm_clean_invalid2(&rfe->v) != 0) {
++        const int err = errno; // capture before av_log() can disturb it
++        av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
++        rc = AVERROR(err);
++    }
++
++    rfe->v.op_count = 0;
++    return rc;
++}
++
++// Do the accumulated flush. The environment needs no freeing, so this is
++// simply rpi_cache_flush_execute(); its result is returned.
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++    return rpi_cache_flush_execute(rfe);
++}
++
++// Queue one flush operation covering `blocks` blocks of `block_size` bytes,
++// `block_stride` bytes apart, starting `offset0` bytes into gm's ARM mapping.
++// The queue capacity (CACHE_EL_MAX) is only checked by av_assert1(), so
++// callers must not queue more than that between executes.
++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
++{
++    // Claim the next free slot
++    struct vcsm_user_clean_invalid2_block_s * const slot = rfe->v.s + rfe->v.op_count;
++
++    rfe->v.op_count++;
++    av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
++
++    slot->start_address = gm->arm + offset0;
++    slot->block_size = block_size;
++    slot->block_count = blocks;
++    slot->inter_block_stride = block_stride;
++    slot->invalidate_mode = mode;
++}
++
++// Queue a flush of `size` bytes starting `offset` bytes into gm's ARM mapping.
++// A NULL gm or a zero size is ignored. The range is checked against
++// gm->numbytes with av_assert1() only.
++// `mode` is declared as rpi_cache_flush_mode_t to match the prototype in
++// rpi_mem.h; declaring it unsigned int here only compiles where the compiler
++// happens to give the enum that underlying type.
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size)
++{
++    // Deal with empty pointer trivially
++    if (gm == NULL || size == 0)
++        return;
++
++    // Tested individually as well as summed so that overflow is rejected too
++    av_assert1(offset <= gm->numbytes);
++    av_assert1(size <= gm->numbytes);
++    av_assert1(offset + size <= gm->numbytes);
++
++    rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
++}
++
++// Queue a flush of the whole of one GPU allocation (gm->numbytes bytes from
++// the start of its ARM mapping).
++// A NULL gm is ignored, as in rpi_cache_flush_add_gm_range().
++// `mode` is declared as rpi_cache_flush_mode_t to match the prototype in
++// rpi_mem.h.
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode)
++{
++    if (gm == NULL)
++        return;
++
++    rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
++}
++
++
++// Queue a flush of all the GPU memory behind a frame: the single allocation
++// when the frame uses one buffer, otherwise each of the three per-plane
++// allocations.
++// `mode` is declared as rpi_cache_flush_mode_t to match the prototype in
++// rpi_mem.h.
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode)
++{
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++    if (gpu_is_buf1(frame)) {
++        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++    }
++    else
++    {
++        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++        rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++    }
++}
++
++// Flush an area of a frame
++// Width, height, x0, y0 in luma pels
++// uv_shift is the vertical chroma subsampling shift used to derive the chroma
++// rows; do_luma / do_chroma select which planes are queued.
++// Three layouts are handled: three separate plane buffers, one buffer with
++// planar data, and one buffer in sand format (queued as strided blocks).
++// `mode` is declared as rpi_cache_flush_mode_t to match the prototype in
++// rpi_mem.h.
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++  const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{
++    const unsigned int y_offset = frame->linesize[0] * y0;
++    const unsigned int y_size = frame->linesize[0] * height;
++    // Round UV up/down to get everything
++    const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++    const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
++    const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
++
++    // No range assertions against frame->height here: it is the cropped
++    // height, so it is not a usable bound for y0 / height.
++
++    if (!gpu_is_buf1(frame))
++    {
++        // One allocation per plane: offsets are relative to each plane's buffer
++        if (do_luma) {
++            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++        }
++        if (do_chroma) {
++            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++            rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++        }
++    }
++    else if (!av_rpi_is_sand_frame(frame))
++    {
++        // Single allocation, planar: add each plane's offset within the buffer
++        const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++        if (do_luma) {
++            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++        }
++        if (do_chroma) {
++            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++            rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++        }
++    }
++    else
++    {
++        // Sand layout: one block per stride1-wide column slice touched by the
++        // area, with consecutive slices stride1 * stride2 bytes apart
++        const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++        const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++        const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
++        const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);   // x0 rounded down to a slice boundary
++        const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
++        av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
++
++        if (do_chroma)
++        {
++            struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++            b->invalidate_mode = mode;
++            b->block_count = block_count;
++            b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
++            b->block_size = uv_size;
++            b->inter_block_stride = stride1 * stride2;
++        }
++        if (do_luma)
++        {
++            struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++            b->invalidate_mode = mode;
++            b->block_count = block_count;
++            b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
++            b->block_size = y_size;
++            b->inter_block_stride = stride1 * stride2;
++        }
++    }
++}
++
++// Flush the whole of one GPU allocation in a single call: init, add, finish
++// on a temporary environment held on the stack. The finish result is ignored.
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
++{
++    rpi_cache_buf_t storage;
++    rpi_cache_flush_env_t * const env = rpi_cache_flush_init(&storage);
++
++    rpi_cache_flush_add_gm_ptr(env, p, mode);
++    rpi_cache_flush_finish(env);
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_mem.h
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_MEM_H
++#define RPI_MEM_H
++
++typedef struct gpu_mem_ptr_s { // Describes one VCSM allocation; filled in by gpu_malloc_cached/uncached, zeroed by gpu_free
++ unsigned char *arm; // Pointer to memory mapped on ARM side
++ int vc_handle; // Videocore handle of relocatable memory
++ int vcsm_handle; // Handle for use by VCSM
++ int vc; // Address for use in GPU code
++ int numbytes; // Size of memory block (the allocator rounds the request up to a multiple of 256)
++} GPU_MEM_PTR_T;
++
++// General GPU functions
++
++#define GPU_INIT_GPU 1 // these two values match the positive results (use_cma + 1) of rpi_mem_gpu_init()
++#define GPU_INIT_CMA 2
++
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); // 0 on success, AVERROR(ENOMEM) on failure; *p is zeroed on failure
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); // as above, for memory not cached on the ARM side
++extern void gpu_free(GPU_MEM_PTR_T * const p); // releases the allocation and zeroes *p
++int rpi_mem_gpu_init(const unsigned int flags); // flags is currently ignored; returns 1 or 2 on success, AVERROR(EINVAL) on failure
++void rpi_mem_gpu_uninit(void);
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++typedef struct {uint32_t t[33];} rpi_cache_buf_t; // caller-owned storage for a rpi_cache_flush_env_t (see rpi_cache_flush_init); its size fixes how many operations can be queued
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & clear but do not free the env
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum // stored unchanged in the invalidate_mode field of each queued operation
++{
++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 // == INVALIDATE | WRITEBACK
++} rpi_cache_flush_mode_t;
++
++struct AVFrame;
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++ const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_qpu.c
+@@ -0,0 +1,776 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++#include "libavutil/avassert.h"
++
++#include "config.h"
++
++#include <pthread.h>
++#include <time.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include "rpi_mailbox.h"
++#include "rpi_mem.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
++
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
++
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL 0
++
++// QPU "noflush" flags
++// a mixture of flushing & profiling
++
++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
++
++#define vcos_verify_ge0(x) ((x)>=0)
++
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 16384
++
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
++// Odd rows
++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
++};
++
++// Code/constants on GPU
++// Layout of the block that gpu_init() places in GPU memory; vpu_get_fn() and
++// vpu_get_constants() derive addresses from it with offsetof().
++struct GPU
++{
++// unsigned int qpu_code[QPU_CODE_SIZE];
++ unsigned int vpu_code8[VPU_CODE_SIZE]; // receives rpi_hevc_transform8
++ unsigned int vpu_code10[VPU_CODE_SIZE]; // receives rpi_hevc_transform10
++ short transMatrix2even[16*16*2]; // receives rpi_transMatrix2even
++};
++
++#define WAIT_COUNT_MAX 16
++
++// Nested interval timer used by the optional wait tracing.
++typedef struct trace_time_one_s
++{
++ int count; // number of currently open levels (tto_start pushes, tto_end pops)
++ int64_t start[WAIT_COUNT_MAX]; // time at which each open level was started
++ int64_t total[WAIT_COUNT_MAX]; // accumulated time per level
++} trace_time_one_t;
++
++// Aggregate tracing state kept in gpu_env_t when RPI_TRACE_TIME_VPU_QPU_WAIT is set.
++typedef struct trace_time_wait_s
++{
++ unsigned int jcount; // incremented once per vq_wait_delete()
++ int64_t start0; // taken from active.start[0] the first time vq_wait_delete() runs
++ int64_t last_update; // advanced by WAIT_TIME_PRINT_PERIOD each time stats are printed
++ trace_time_one_t active; // started in vq_wait_new(), ended in vq_wait_post()
++ trace_time_one_t wait; // started in vq_wait_wait(), ended in vq_wait_delete()
++} trace_time_wait_t;
++
++// One wait object: a semaphore plus a link for the pool's free list.
++typedef struct vq_wait_s
++{
++ sem_t sem; // posted by vq_wait_post(), waited on by vq_wait_wait()
++ struct vq_wait_s * next; // next free entry; NULL while the object is in use
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16
++// Fixed-size pool of wait objects threaded onto a singly linked free list.
++typedef struct vq_wait_pool_s
++{
++ vq_wait_t * head; // first free entry, NULL when none are free
++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; // backing storage for every entry
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++// Process-wide GPU state; created by gpu_init() and destroyed by gpu_term().
++typedef struct gpu_env_s
++{
++ int open_count; // references: ++ in gpu_lock_ref(), -- in gpu_unlock_unref()
++ int init_count; // balance of vpu_qpu_init() / vpu_qpu_term() calls
++ int vpu_i_cache_flushed; // 0 until the first VPU job is queued, then 1
++ GPU_MEM_PTR_T qpu_code_gm_ptr; // QPU shader code
++ GPU_MEM_PTR_T code_gm_ptr; // struct GPU (VPU code + transform constants)
++ GPU_MEM_PTR_T dummy_gm_ptr; // 0x4000 bytes filled with 0x80
++ vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++// Current CLOCK_MONOTONIC reading expressed in nanoseconds.
++static int64_t ns_time(void)
++{
++    struct timespec now;
++    int64_t ns;
++
++    clock_gettime(CLOCK_MONOTONIC, &now);
++    ns = (int64_t)now.tv_sec * (int64_t)1000000000;
++    ns += now.tv_nsec;
++    return ns;
++}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++// Bring the totals of still-open levels up to 'now', then print one line:
++// idle time (elapsed since start0 minus level-0 total) followed by the totals
++// of levels 0..3, labelled 1..4.
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++    int level;
++
++    // Levels below tto->count are still pending: credit them with the time
++    // elapsed so far and restart their clock so it is not counted twice
++    for (level = 0; level < tto->count; level++) {
++        const int64_t elapsed = now - tto->start[level];
++        tto->total[level] += elapsed;
++        tto->start[level] = now;
++    }
++
++    printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++           prefix,
++           T_ARG(now - start0 - tto->total[0]),
++           T_ARG(tto->total[0]),
++           T_ARG(tto->total[1]),
++           T_ARG(tto->total[2]),
++           T_ARG(tto->total[3]));
++}
++
++
++// Open a new nesting level starting at 'now'; asserts there is room for it.
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++    const int level = tto->count;
++
++    av_assert0(level < WAIT_COUNT_MAX);
++    tto->start[level] = now;
++    tto->count = level + 1;
++}
++
++// Close the innermost open level and add its duration to that level's total.
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++    tto->count -= 1;
++    av_assert0(tto->count >= 0);
++    tto->total[tto->count] += now - tto->start[tto->count];
++}
++
++// Print the job count, the total traced time and the per-level breakdown of
++// the "active" and "wait" timers.
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++    // jcount is unsigned, so it must be printed with %u
++    printf("Jobs:%u, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++    tto_print(&ttw->active, now, ttw->start0, "Active");
++    tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
++
++#endif
++
++// GPU memory alloc fns (internal)
++
++// Release a GPU allocation: drop the ARM mapping if there is one, free the
++// VCSM handle if there is one, then zero the descriptor.
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{
++    const int is_mapped = (p->arm != NULL);
++    const int has_handle = (p->vcsm_handle != 0);
++
++    if (is_mapped) {
++        vcsm_unlock_ptr(p->arm);
++    }
++    if (has_handle) {
++        vcsm_free(p->vcsm_handle);
++    }
++
++    // Ensure we crash hard if we try and use this again
++    memset(p, 0, sizeof(*p));
++}
++
++
++// Allocate numbytes (rounded up to a multiple of 256) of VCSM memory and fill
++// *p with its VCSM handle, VC handle, ARM pointer and VC address.
++// cache_type is OR'ed with 0x80 before being given to vcsm_malloc_cache();
++// name is passed through as the allocation name.
++// Returns 0 on success. On any failure *p is released and zeroed through
++// gpu_free_internal() and AVERROR(ENOMEM) is returned.
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++ const int numbytes, const unsigned int cache_type, const char * const name)
++{
++ memset(p, 0, sizeof(*p));
++ p->numbytes = (numbytes + 255) & ~255; // Round up
++
++ // The steps are chained with || so the first failing step skips the rest
++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
++ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
++ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
++ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++ {
++ gpu_free_internal(p);
++ return AVERROR(ENOMEM);
++ }
++ return 0;
++}
++
++
++// GPU init, free, lock, unlock
++
++// Tear down and free the shared GPU environment.
++// Called from gpu_unlock_unref() when open_count drops to zero.
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
++
++ // We have to hope that everything has terminated...
++ gpu = NULL;
++
++ vc_gpuserv_deinit();
++
++ gpu_free_internal(&ge->code_gm_ptr);
++ gpu_free_internal(&ge->qpu_code_gm_ptr);
++ gpu_free_internal(&ge->dummy_gm_ptr);
++
++ vcsm_exit();
++
++ vq_wait_pool_deinit(&ge->wait_pool);
++
++ free(ge);
++}
++
++
++// Connect to QPU, returns 0 on success.
++// Allocates the shared environment, copies the QPU shader code, the VPU code
++// with its transform constants, and a dummy frame into GPU memory.
++// On failure returns a non-zero error code, leaves *gpu == NULL and releases
++// everything that had been set up so far.
++static int gpu_init(gpu_env_t ** const gpu) {
++    volatile struct GPU* ptr;
++    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++    int rv;
++    *gpu = NULL;
++
++    if (ge == NULL)
++        return -1;
++
++    vq_wait_pool_init(&ge->wait_pool);
++
++    vcsm_init();
++
++    // Now copy over the QPU code into GPU memory
++    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++        goto fail;
++
++    {
++        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++        // Zero the unused tail of the code buffer
++        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++    }
++
++    // And the VPU code
++    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++        goto fail;
++    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++    // Zero everything so we have zeros between the code bits
++    memset((void *)ptr, 0, sizeof(*ptr));
++    {
++        int num_bytes = sizeof(rpi_hevc_transform8);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++    }
++    {
++        int num_bytes = sizeof(rpi_hevc_transform10);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++    }
++    // And the transform coefficients
++    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++    // Generate a dummy "frame" & fill with 0x80
++    // * Could reset to 1 <<bit_depth?
++    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++        goto fail;
++    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++    *gpu = ge;
++    return 0;
++
++fail:
++    // ge was calloc'ed and gpu_malloc_internal() zeroes a descriptor on failure,
++    // so descriptors that hold nothing are all-zero and gpu_free_internal()
++    // skips them. The order mirrors gpu_term().
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++    vcsm_exit();
++    vq_wait_pool_deinit(&ge->wait_pool);
++    free(ge);
++    return rv;
++}
++
++
++
++// Release the global GPU mutex taken by gpu_lock() / gpu_lock_ref().
++static void gpu_unlock(void) {
++ pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Take the global GPU mutex and return the shared environment.
++// Does not create the environment: it is only asserted (av_assert1) to exist.
++static gpu_env_t * gpu_lock(void) {
++ pthread_mutex_lock(&gpu_mutex);
++
++ av_assert1(gpu != NULL);
++ return gpu;
++}
++
++// Take the global GPU mutex and add a reference to the shared environment,
++// creating the environment on first use.
++// Returns the environment with the mutex still held, or NULL with the mutex
++// already released if initialisation failed.
++static gpu_env_t * gpu_lock_ref(void)
++{
++    pthread_mutex_lock(&gpu_mutex);
++
++    if (gpu == NULL && gpu_init(&gpu) != 0) {
++        gpu_unlock();
++        return NULL;
++    }
++
++    gpu->open_count += 1;
++    return gpu;
++}
++
++// Drop one reference (tearing the environment down when it was the last one)
++// and release the global GPU mutex. Must be called with the mutex held.
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++    ge->open_count -= 1;
++    if (ge->open_count == 0) {
++        // gpu_term() frees ge, so it must not be touched after this point
++        gpu_term();
++    }
++
++    gpu_unlock();
++}
++
++// Return the shared environment without taking the mutex; it is only asserted
++// (av_assert1) to exist.
++static inline gpu_env_t * gpu_ptr(void)
++{
++ av_assert1(gpu != NULL);
++ return gpu;
++}
++
++// Return the VC address of the VPU transform code for the given bit depth.
++// Only 8 and 10 are supported; any other value trips av_assert0.
++unsigned int vpu_get_fn(const unsigned int bit_depth) {
++    uint32_t entry = 0;
++
++    // Make sure that the gpu is initialized
++    av_assert1(gpu != NULL);
++
++    if (bit_depth == 8) {
++        entry = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++    } else if (bit_depth == 10) {
++        entry = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++    } else {
++        av_assert0(0);
++    }
++    return entry;
++}
++
++// Return the VC address of the transform coefficient table held in GPU memory.
++unsigned int vpu_get_constants(void) {
++    unsigned int table_vc;
++
++    av_assert1(gpu != NULL);
++    table_vc = gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
++    return table_vc;
++}
++
++// Add a reference to the shared GPU environment, creating it if necessary.
++// There is no return value, so an initialisation failure is not reported.
++void gpu_ref(void)
++{
++    // On failure gpu_lock_ref() has already released the mutex; unlocking
++    // again would unlock a mutex this thread does not hold
++    if (gpu_lock_ref() != NULL)
++        gpu_unlock();
++}
++
++// Drop a reference previously taken with gpu_ref().
++void gpu_unref(void)
++{
++    gpu_unlock_unref(gpu_lock());
++}
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++// Initialise every semaphore (count 0) and chain all entries onto the free
++// list in array order.
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++    vq_wait_t * const first = wp->pool;
++    vq_wait_t * const last = first + (VQ_WAIT_POOL_SIZE - 1);
++    vq_wait_t * w;
++
++    for (w = first; w <= last; ++w) {
++        sem_init(&w->sem, 0, 0);
++        // The final entry terminates the list
++        w->next = (w == last) ? NULL : w + 1;
++    }
++    wp->head = first;
++}
++
++// Destroy every semaphore in the pool and clear all list links.
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++    vq_wait_t * w = wp->pool;
++    vq_wait_t * const end = wp->pool + VQ_WAIT_POOL_SIZE;
++
++    wp->head = NULL;
++    while (w != end) {
++        sem_destroy(&w->sem);
++        w->next = NULL;
++        ++w;
++    }
++}
++
++
++// Take a wait object from the shared pool. This also adds a reference to the
++// GPU environment, which vq_wait_delete() drops again.
++static vq_wait_t * vq_wait_new(void)
++{
++    gpu_env_t * const ge = gpu_lock_ref();
++    vq_wait_t * wait;
++
++    // gpu_lock_ref() returns NULL if GPU initialisation failed; stop with a
++    // diagnostic rather than dereferencing NULL
++    av_assert0(ge != NULL);
++
++    wait = ge->wait_pool.head;
++    // The pool holds a fixed VQ_WAIT_POOL_SIZE entries; head is NULL once they
++    // are all in use
++    av_assert0(wait != NULL);
++    ge->wait_pool.head = wait->next;
++    wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_start(&ge->ttw.active, ns_time());
++#endif
++
++    gpu_unlock();
++    return wait;
++}
++
++// Return a wait object to the pool and drop the GPU reference that
++// vq_wait_new() took. With tracing enabled this also closes the "wait" timer
++// and prints statistics once per WAIT_TIME_PRINT_PERIOD.
++static void vq_wait_delete(vq_wait_t * const wait)
++{
++ gpu_env_t * const ge = gpu_lock();
++ // Push back onto the head of the free list
++ wait->next = ge->wait_pool.head;
++ ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ trace_time_wait_t * const ttw = &ge->ttw;
++ const int64_t now = ns_time();
++ ++ttw->jcount;
++ tto_end(&ttw->wait, now);
++
++ // First completed job: take the trace origin from the first active start
++ if (ttw->start0 == 0)
++ {
++ ttw->start0 = ttw->active.start[0];
++ ttw->last_update = ttw->start0;
++ }
++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++ {
++ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++ ttw_print(ttw, now);
++ }
++ }
++#endif
++ // May tear the whole environment down if this was the last reference
++ gpu_unlock_unref(ge);
++}
++
++// Block until the wait object's semaphore has been posted.
++static void vq_wait_wait(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ const int64_t now = ns_time();
++ gpu_env_t * const ge = gpu_lock();
++ tto_start(&ge->ttw.wait, now);
++ gpu_unlock();
++ }
++#endif
++
++ // Retry when sem_wait() is interrupted by a signal
++ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++ /* loop */;
++}
++
++// Signal completion: post the wait object's semaphore (after closing the
++// "active" timer when tracing is enabled).
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ gpu_env_t *const ge = gpu_lock();
++ tto_end(&ge->ttw.active, ns_time());
++ gpu_unlock();
++ }
++#endif
++
++ sem_post(&wait->sem);
++}
++
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++// Prepare a caller-supplied job environment for use and return it.
++// Only the job count and the queue mask are reset; the job array is left
++// untouched and nothing is allocated.
++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
++{
++    buf->mask = 0;
++    buf->n = 0;
++    return buf;
++}
++
++// Counterpart of vpu_qpu_job_init(). Currently a no-op: the environment lives
++// in caller-supplied storage, so there is nothing to free.
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{
++// memset(vqj, 0, sizeof(*vqj));
++// free(vqj);
++}
++
++// Claim the next free slot in the job array and return it.
++// The capacity check is an av_assert1 made after the count has been bumped.
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++    const unsigned int slot = vqj->n;
++
++    vqj->n = slot + 1;
++    av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
++    return vqj->j + slot;
++}
++
++// Queue a VPU job that runs the code at vpu_code with r0..r5 as arguments.
++// A vpu_code of 0 queues nothing.
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++    const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++    const unsigned regs[6] = {r0, r1, r2, r3, r4, r5};
++    struct gpu_job_s * job;
++    unsigned int k;
++
++    if (vpu_code == 0)
++        return;
++
++    job = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    job->command = EXECUTE_VPU;
++    job->callback.func = 0;
++    job->callback.cookie = NULL;
++
++    // The bottom two bits of the execute address contain no-flush flags.
++    // b0 unset makes the VPU flush its I-cache; as the code is never reloaded
++    // that is only wanted for the first job, hence the flag flips to 1 below.
++    job->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
++    for (k = 0; k != 6; ++k)
++        job->u.v.q[k + 1] = regs[k];
++
++    gpu->vpu_i_cache_flushed = 1;
++}
++
++// Queue a QPU job made of n mail entries (QPU_MAIL_EL_VALS words each) copied
++// from mail. n == 0 queues nothing.
++// The no-flush flags are QPU_FLAGS_xxx values chosen at compile time.
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++#if RPI_TRACE_QPU_PROFILE_ALL
++    const unsigned int flags = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++    const unsigned int flags = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++    struct gpu_job_s * job;
++
++    if (n == 0)
++        return;
++
++    job = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++
++    job->command = EXECUTE_QPU;
++    job->callback.func = 0;
++    job->callback.cookie = NULL;
++
++    job->u.q.jobs = n;
++    job->u.q.noflush = flags;
++    job->u.q.timeout = 5000;
++    memcpy(job->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++}
++
++// Convert callback to sem post
++// v is the vq_wait_t installed as the callback cookie by vpu_qpu_job_add_sync_this()
++static void vpu_qpu_job_callback_wait(void * v)
++{
++ vq_wait_post(v);
++}
++
++// Poke a user-supplied sem
++// v is the sem_t installed as the callback cookie by vpu_qpu_job_add_sync_sem()
++static void vpu_qpu_job_callback_sem(void * v)
++{
++ sem_post((sem_t *)v);
++}
++
++// Arrange for *wait_h to be signalled once everything queued so far is done.
++// If nothing has been queued since the last sync, *wait_h is set to NULL.
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{
++    struct gpu_job_s * job;
++    vq_wait_t * wait;
++
++    if (vqj->mask == 0) {
++        *wait_h = NULL;
++        return;
++    }
++
++    // We are going to want a sync object
++    wait = vq_wait_new();
++
++    // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++    // If we only posted one thing or only QPU jobs
++    if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) {
++        // Reuse the last job: hang the callback directly on it
++        job = vqj->j + (vqj->n - 1);
++        av_assert1(job->callback.func == 0);
++    } else {
++        // Otherwise append an explicit sync job covering the queues used
++        job = new_job(vqj);
++        job->command = EXECUTE_SYNC;
++        job->u.s.mask = vqj->mask;
++    }
++    job->callback.func = vpu_qpu_job_callback_wait;
++    job->callback.cookie = wait;
++
++    vqj->mask = 0;
++    *wait_h = wait;
++}
++
++// Arrange for the caller's semaphore to be posted once everything queued so
++// far is done.
++// Returns 0 if no sync added ('cos Q empty), 1 if sync added
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
++{
++    struct gpu_job_s * job;
++
++    // If nothing on q then just return
++    if (vqj->mask == 0)
++        return 0;
++
++    // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++    // If we only posted one thing or only QPU jobs
++    if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) {
++        // Reuse the last job: hang the callback directly on it
++        job = vqj->j + (vqj->n - 1);
++        av_assert1(job->callback.func == 0);
++    } else {
++        // Otherwise append an explicit sync job covering the queues used
++        job = new_job(vqj);
++        job->command = EXECUTE_SYNC;
++        job->u.s.mask = vqj->mask;
++    }
++    job->callback.func = vpu_qpu_job_callback_sem;
++    job->callback.cookie = sem;
++
++    vqj->mask = 0;
++    return 1;
++}
++
++
++// Submit all queued jobs. Returns 0 when the queue is empty, otherwise the
++// result of vc_gpuserv_execute_code().
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++    return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
++
++// Convenience wrapper: start the queued jobs, then delete the job environment.
++// Returns whatever vpu_qpu_job_start() returned.
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++    const int start_rv = vpu_qpu_job_start(vqj);
++
++    vpu_qpu_job_delete(vqj);
++    return start_rv;
++}
++
++// Wait for the sync object in *wait_h to be signalled, then recycle it.
++// *wait_h is set to NULL before waiting. A NULL wait_h or a NULL *wait_h
++// makes this a no-op.
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++    vq_wait_t * pending;
++
++    if (wait_h == NULL)
++        return;
++
++    pending = *wait_h;
++    if (pending == NULL)
++        return;
++
++    *wait_h = NULL;
++    vq_wait_wait(pending);
++    vq_wait_delete(pending);
++}
++
++// Take a reference on the GPU environment and, on the first call, start the
++// gpuserv service. Returns 0 on success, -1 if the environment could not be
++// created.
++int vpu_qpu_init()
++{
++    gpu_env_t * const ge = gpu_lock_ref();
++
++    if (ge == NULL)
++        return -1;
++
++    if (ge->init_count == 0) {
++        vc_gpuserv_init();
++    }
++    ge->init_count++;
++
++    gpu_unlock();
++    return 0;
++}
++
++// Undo one vpu_qpu_init(): when the last init is released the gpuserv service
++// is shut down (and trace totals are printed if tracing is enabled); the
++// reference on the GPU environment is then dropped.
++void vpu_qpu_term()
++{
++ gpu_env_t * const ge = gpu_lock();
++
++ if (--ge->init_count == 0) {
++ vc_gpuserv_deinit();
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ ttw_print(&ge->ttw, ns_time());
++#endif
++ }
++
++ // May free ge if this was the last reference
++ gpu_unlock_unref(ge);
++}
++
++// Translate the address of a shader entry point (a pointer into
++// ff_hevc_rpi_shader) into the VC address of the same offset within the copy
++// of the shader held in GPU memory.
++uint32_t qpu_fn(const int * const mc_fn)
++{
++ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
++}
++
++// Return the VC address of the dummy frame buffer that gpu_init() filled with 0x80.
++uint32_t qpu_dummy(void)
++{
++ return gpu->dummy_gm_ptr.vc;
++}
++
++// Fill *qf with the QPU code addresses of the motion-compensation filters for
++// the requested bit depth.
++// Returns 0 on success, or -1 if bit_depth is neither 8 nor 10, in which case
++// *qf holds only the dummy values set below.
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
++{
++    // Dummy values we can catch with emulation
++    qf->y_pxx = ~1U;
++    qf->y_bxx = ~2U;
++    qf->y_p00 = ~3U;
++    qf->y_b00 = ~4U;
++    qf->c_pxx = ~5U;
++    qf->c_bxx = ~6U;
++    // Give c_pxx_l1 a recognisable value too so that no field is left
++    // uninitialised on the failure path
++    qf->c_pxx_l1 = ~7U;
++
++    switch (bit_depth) {
++    case 8:
++        qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++        qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++        qf->y_p00 = qpu_fn(mc_filter_y_p00);
++        qf->y_b00 = qpu_fn(mc_filter_y_b00);
++        qf->c_pxx = qpu_fn(mc_filter_c_p);
++        qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++        qf->c_bxx = qpu_fn(mc_filter_c_b);
++        break;
++    case 10:
++        qf->c_pxx = qpu_fn(mc_filter_c10_p);
++        qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++        qf->c_bxx = qpu_fn(mc_filter_c10_b);
++        qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++        qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++        qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++        qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++        break;
++    default:
++        return -1;
++    }
++    return 0;
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,103 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#pragma GCC diagnostic ignored "-Wstrict-prototypes"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s
++#pragma GCC diagnostic pop
++
++// QPU specific functions
++
++// QPU code addresses of the motion-compensation filters, as filled in by
++// rpi_hevc_qpu_init_fn() for one bit depth. c_* fields come from the
++// mc_filter_c* shader entry points, y_* fields from mc_filter_y*.
++typedef struct HEVCRpiQpu {
++ uint32_t c_pxx;
++ uint32_t c_pxx_l1;
++ uint32_t c_bxx;
++ uint32_t y_pxx;
++ uint32_t y_bxx;
++ uint32_t y_p00;
++ uint32_t y_b00;
++} HEVCRpiQpu;
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
++
++uint32_t qpu_fn(const int * const mc_fn);
++uint32_t qpu_dummy(void);
++
++#define QPU_N_GRP 4
++#define QPU_N_MAX 12
++
++#define QPU_MAIL_EL_VALS 2
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
++
++// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++#define VPU_QPU_JOB_MAX 4
++// Caller-owned batch of GPU jobs built up by the vpu_qpu_job_add_* functions.
++struct vpu_qpu_job_env_s
++{
++ unsigned int n; // number of entries of j[] in use
++ unsigned int mask; // queues used since the last sync was added; cleared by the add_sync functions
++ struct gpu_job_s j[VPU_QPU_JOB_MAX]; // the jobs themselves
++};
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
++
++// Waits for the job associated with *wait_h to complete; *wait_h is set to NULL after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++void gpu_ref(void);
++void gpu_unref(void);
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpi_zc.c
+@@ -0,0 +1,1227 @@
++#include "config.h"
++
++#include "libavcodec/avcodec.h"
++#include "rpi_mem.h"
++#include "rpi_mailbox.h"
++#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++#include <pthread.h>
++
++#include "libavutil/buffer_internal.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#define TRACE_ALLOC 0
++#define DEBUG_ALWAYS_KEEP_LOCKED 0
++
++struct ZcPoolEnt;
++
++// Free list of same-sized buffers available for reuse.
++typedef struct ZcPool
++{
++ size_t numbytes; // size the pool currently serves; set to -1 / ~0U when reset
++ struct ZcPoolEnt * head; // first free entry, NULL when empty
++ pthread_mutex_t lock; // taken around head / numbytes in get_ent and put_ent
++} ZcPool;
++
++// One VCSM-backed buffer belonging to a ZcPool.
++typedef struct ZcPoolEnt
++{
++ size_t numbytes; // allocated size (after rounding in zc_pool_ent_alloc)
++
++ unsigned int vcsm_handle; // set at allocation time
++ unsigned int vc_handle; // looked up on first request, 0 until then
++ void * map_arm; // ARM mapping, obtained on first request, NULL until then
++ unsigned int map_vc; // VC address, looked up on first request, 0 until then
++
++ struct ZcPoolEnt * next; // free-list link
++ struct ZcPool * pool; // owning pool
++} ZcPoolEnt;
++
++// NOTE(review): the field names mirror AVCodecContext members, so this is
++// presumably a saved copy of the values in force before ZC was installed -
++// confirm against the init/uninit code, which is outside this section.
++typedef struct ZcOldCtxVals
++{
++ int thread_safe_callbacks;
++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++ void * opaque;
++} ZcOldCtxVals;
++
++// Zero-copy environment: a reference count, the saved context values and the
++// buffer-allocation callbacks together with their opaque pool pointer.
++// NOTE(review): how refcount and pool_size are maintained is not visible in
++// this section - confirm before relying on them.
++typedef struct AVZcEnv
++{
++ unsigned int refcount;
++ ZcOldCtxVals old;
++
++ void * pool_env; // opaque pointer, presumably handed to alloc_buf / free_pool - confirm
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf;
++ av_rpi_zc_free_pool_fn_t * free_pool;
++
++ unsigned int pool_size;
++} ZcEnv;
++
++// Pairs an opaque buffer pointer with the function table used to operate on it.
++// NOTE(review): the units and meaning of offset are not visible in this section.
++typedef struct ZcUserBufEnv {
++ void * v; // opaque buffer pointer, presumably passed to the fn callbacks - confirm
++ const av_rpi_zc_buf_fn_tab_t * fn;
++ size_t numbytes;
++ int offset;
++} ZcUserBufEnv;
++
++#define ZC_BUF_INVALID 0
++#define ZC_BUF_VALID 1
++#define ZC_BUF_NEVER 2
++
++// Per-buffer zero-copy state.
++// NOTE(review): none of the code using this struct is visible in this section;
++// the field notes below are assumptions to be confirmed.
++typedef struct ZcBufEnv {
++ GPU_MEM_PTR_T gmem;
++ AVZcEnvPtr zc;
++ int is_valid; // presumably one of ZC_BUF_INVALID / ZC_BUF_VALID / ZC_BUF_NEVER - confirm
++ AVBufferRef * user;
++ AVRpiZcFrameGeometry geo;
++ size_t size_y;
++ size_t size_c;
++ size_t size_pic;
++ ssize_t offset;
++ pthread_mutex_t lock;
++ pthread_cond_t cond;
++} ZcBufEnv;
++
++
++
++
++
++
++#define ALLOC_PAD 0
++#define ALLOC_ROUND 0x1000
++#define STRIDE_ROUND 64
++#define STRIDE_OR 0
++
++#define DEBUG_ZAP0_BUFFERS 0
++
++// Return 1 if format lies in the AV_PIX_FMT_SAND128..AV_PIX_FMT_SAND64_16
++// range or is one of the two AV_PIX_FMT_RPI4_* formats, otherwise 0.
++static inline int av_rpi_is_sand_format(const int format)
++{
++    const int in_sand_range = (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
++    const int is_rpi4 = (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10);
++
++    return in_sand_range || is_rpi4;
++}
++
++// Frame-level wrapper: tests the frame's pixel format with av_rpi_is_sand_format().
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++    const int fmt = frame->format;
++
++    return av_rpi_is_sand_format(fmt);
++}
++
++//----------------------------------------------------------------------------
++//
++// Internal pool stuff
++
++// Pool entry functions
++
++// Allocate a pool entry backed by a fresh VCSM allocation of at least req_size
++// bytes and tie it to pool. Returns NULL on failure.
++// Only the VCSM handle is obtained here; the VC handle and the ARM / VC
++// mappings stay zero / NULL.
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
++{
++    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
++
++    // Add ALLOC_PAD, then round up to a multiple of ALLOC_ROUND (4k)
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
++    if (zp == NULL) {
++        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
++        goto fail0;
++    }
++
++    // The 0x80 here maps all pages here rather than waiting for lazy mapping
++    // BEWARE that in GPU land a later unlock/lock pair will put us back into
++    // lazy mode - which will also break cache invalidate calls.
++    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
++    {
++        // alloc_size is unsigned, so it must be printed with %u
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
++        goto fail1;
++    }
++
++#if TRACE_ALLOC
++    printf("%s: Alloc %#x bytes @ h=%u\n", __func__, alloc_size, zp->vcsm_handle);
++#endif
++
++    zp->numbytes = alloc_size;
++    zp->pool = pool;
++    return zp;
++
++fail1:
++    av_free(zp);
++fail0:
++    return NULL;
++}
++
++// Release a pool entry: drop its ARM mapping if one was made, free the VCSM
++// allocation and free the entry itself.
++static void zc_pool_ent_free(ZcPoolEnt * const zp)
++{
++#if TRACE_ALLOC
++    // numbytes is a size_t and the handle is unsigned: use matching specifiers
++    printf("%s: Free %#zx bytes @ h=%u\n", __func__, zp->numbytes, zp->vcsm_handle);
++#endif
++
++    if (zp->vcsm_handle != 0)
++    {
++        // VC addr & handle need no dealloc
++        if (zp->map_arm != NULL)
++            vcsm_unlock_hdl(zp->vcsm_handle);
++        vcsm_free(zp->vcsm_handle);
++    }
++    av_free(zp);
++}
++
++//----------------------------------------------------------------------------
++//
++// Pool functions
++
++// Free every entry on a singly linked list of pool entries.
++static void zc_pool_free_ent_list(ZcPoolEnt * p)
++{
++    ZcPoolEnt * following;
++
++    for (; p != NULL; p = following) {
++        // Read the link before the entry is freed
++        following = p->next;
++        zc_pool_ent_free(p);
++    }
++}
++
++// Detach and free every entry in the pool and mark the pool as serving no
++// size. Does not take pool->lock itself.
++static void zc_pool_flush(ZcPool * const pool)
++{
++    ZcPoolEnt * p = pool->head;
++    pool->head = NULL;
++    // Same "no size" sentinel as zc_pool_new() / zc_pool_delete(); ~0U would
++    // be a different value wherever size_t is wider than unsigned int
++    pool->numbytes = -1;
++    zc_pool_free_ent_list(p);
++}
++
++// Get an entry able to hold req_bytes: reuse one from the pool when the pool's
++// current size is close enough, otherwise allocate a new one.
++// Returns NULL if a new entry was needed and could not be allocated.
++// NOTE(review): pool->numbytes is set to the unrounded req_bytes here, whereas
++// zc_pool_ent_alloc() records the size rounded up to ALLOC_ROUND in the entry
++// and zc_pool_put_ent() recycles only on exact equality - so entries look
++// recyclable only when req_bytes is already a multiple of ALLOC_ROUND. Confirm
++// against the callers' sizes.
++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
++{
++ ZcPoolEnt * zp = NULL;
++ ZcPoolEnt * flush_list = NULL;
++ size_t numbytes;
++
++ pthread_mutex_lock(&pool->lock);
++
++ numbytes = pool->numbytes;
++
++ // If size isn't close then dump the pool
++ // Close in this context means within 128k
++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
++ {
++ flush_list = pool->head;
++ pool->head = NULL;
++ pool->numbytes = numbytes = req_bytes;
++ }
++ else if (pool->head != NULL)
++ {
++ zp = pool->head;
++ pool->head = zp->next;
++ }
++
++ pthread_mutex_unlock(&pool->lock);
++
++ // Stale entries are freed and any new entry allocated outside the lock
++ zc_pool_free_ent_list(flush_list);
++
++ if (zp == NULL)
++ zp = zc_pool_ent_alloc(pool, numbytes);
++
++ return zp;
++}
++
++// Hand an entry back to its pool. It is pushed onto the free list when its
++// size matches the size the pool currently serves; otherwise it is freed.
++// A NULL zp is ignored.
++static void zc_pool_put_ent(ZcPoolEnt * const zp)
++{
++    ZcPool * pool;
++    int recycle;
++
++    if (zp == NULL)
++        return;
++
++    pool = zp->pool;
++
++    pthread_mutex_lock(&pool->lock);
++#if TRACE_ALLOC
++    // Both sizes are size_t: use %zx
++    printf("%s: Recycle %#zx, %#zx\n", __func__, pool->numbytes, zp->numbytes);
++#endif
++
++    recycle = (pool->numbytes == zp->numbytes);
++    if (recycle)
++    {
++        zp->next = pool->head;
++        pool->head = zp;
++    }
++    pthread_mutex_unlock(&pool->lock);
++
++    // Free outside the lock
++    if (!recycle)
++        zc_pool_ent_free(zp);
++}
++
++// Create an empty pool that serves no size yet. Returns NULL if the
++// allocation fails.
++static ZcPool *
++zc_pool_new(void)
++{
++    ZcPool * const pool = av_mallocz(sizeof(ZcPool));
++
++    if (pool != NULL)
++    {
++        pool->head = NULL;
++        pool->numbytes = -1;
++        pthread_mutex_init(&pool->lock, NULL);
++    }
++    return pool;
++}
++
++// Free every entry still in the pool, destroy its mutex and free the pool.
++// A NULL pool is ignored.
++static void
++zc_pool_delete(ZcPool * const pool)
++{
++    if (pool == NULL)
++        return;
++
++    pool->numbytes = -1;
++    zc_pool_flush(pool);
++    pthread_mutex_destroy(&pool->lock);
++    av_free(pool);
++}
++
++//============================================================================
++//
++// ZC implementation using above pool implementation
++//
++// Fn table fns...
++
++// .free entry of the function table: v is a ZcPoolEnt, returned to its pool.
++static void zc_pool_free_v(void * v)
++{
++ zc_pool_put_ent(v);
++}
++
++// .vcsm_handle entry of the function table: v is a ZcPoolEnt; returns its VCSM handle.
++static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
++{
++ ZcPoolEnt * zp = v;
++ return zp->vcsm_handle;
++}
++
++// .vc_handle entry of the function table: return the entry's VC handle,
++// looking it up and caching it on first use. Returns 0 (after logging) if the
++// lookup fails.
++static unsigned int zc_pool_ent_vc_handle_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    if (ent->vc_handle != 0)
++        return ent->vc_handle;
++
++    ent->vc_handle = vcsm_vc_hdl_from_hdl(ent->vcsm_handle);
++    if (ent->vc_handle == 0)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
++               __func__, ent->vcsm_handle);
++
++    return ent->vc_handle;
++}
++
++// Fn table "map_arm" entry.
++// The ARM-side mapping is obtained with vcsm_lock() on first use and then
++// remembered in the entry.  Returns NULL (after logging) on failure.
++static void * zc_pool_ent_map_arm_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    // Already mapped on an earlier call
++    if (ent->map_arm != NULL)
++        return ent->map_arm;
++
++    ent->map_arm = vcsm_lock(ent->vcsm_handle);
++    if (ent->map_arm == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
++               __func__, ent->vcsm_handle);
++    }
++
++    return ent->map_arm;
++}
++
++// Fn table "map_vc" entry.
++// The VC address is looked up from the VCSM handle on first use and then
++// remembered in the entry.  Returns 0 (after logging) on failure.
++static unsigned int zc_pool_ent_map_vc_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    // Already resolved on an earlier call
++    if (ent->map_vc != 0)
++        return ent->map_vc;
++
++    ent->map_vc = vcsm_vc_addr_from_hdl(ent->vcsm_handle);
++    if (ent->map_vc == 0)
++    {
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
++               __func__, ent->vcsm_handle);
++    }
++
++    return ent->map_vc;
++}
++
++// Callback table passed to av_rpi_zc_buf() for buffers taken from the
++// internal pool; each callback receives the ZcPoolEnt as its v pointer
++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {
++ .free = zc_pool_free_v,
++ .vcsm_handle = zc_pool_ent_vcsm_handle_v,
++ .vc_handle = zc_pool_ent_vc_handle_v,
++ .map_arm = zc_pool_ent_map_arm_v,
++ .map_vc = zc_pool_ent_map_vc_v,
++};
++
++// ZC Env fns
++
++// Pool-free callback registered with av_rpi_zc_env_alloc()
++// Deletes the internal pool and drops the GPU memory init taken in
++// av_rpi_zc_int_env_alloc()
++// All buffers guaranteed freed by now
++static void
++zc_pool_delete_v(void * v)
++{
++    ZcPool * const pool = v;
++
++    zc_pool_delete(pool);
++    rpi_mem_gpu_uninit();
++}
++
++// Allocate a new ZC buffer
++//
++// Buffer-alloc callback registered with av_rpi_zc_env_alloc().
++//
++// Parameters:
++//   v     the ZcPool created by zc_pool_new()
++//   size  number of bytes wanted
++//   geo   frame geometry - unused by this allocator
++// Returns:
++//   AVBufferRef wrapping a pool entry, or NULL on failure.  If the wrapper
++//   cannot be created the pool entry is handed back to the pool.
++static AVBufferRef *
++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++    ZcPool * const pool = v;
++    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
++    AVBufferRef * buf;
++
++    (void)geo; // geo ignored here
++
++    if (zp == NULL) {
++        // size is a size_t so it must be printed with %zu, not %d
++        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%zu) failed\n", size);
++        goto fail0;
++    }
++
++    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
++        goto fail2;
++    }
++
++    return buf;
++
++fail2:
++    zc_pool_put_ent(zp);
++fail0:
++    return NULL;
++}
++
++// Init wrappers - the public fns
++
++// Create a ZC environment backed by the internal pool allocator.
++// Initialises GPU memory, creates a pool and wraps both in a ZcEnv.
++// Returns NULL on failure, with everything set up so far undone.
++AVZcEnvPtr
++av_rpi_zc_int_env_alloc(void * logctx)
++{
++    ZcPool * pool_env;
++
++    if (rpi_mem_gpu_init(0) < 0)
++        return NULL;
++
++    pool_env = zc_pool_new();
++    if (pool_env != NULL)
++    {
++        ZcEnv * const zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v);
++
++        if (zc != NULL)
++            return zc;
++
++        // Env creation failed - the pool is still ours to delete
++        zc_pool_delete(pool_env);
++    }
++
++    rpi_mem_gpu_uninit();
++    return NULL;
++}
++
++// Release the environment referenced by *zcp and set *zcp to NULL.
++// A NULL *zcp is allowed; zcp itself must not be NULL.
++void
++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)
++{
++    const AVZcEnvPtr env = *zcp;
++
++    // Clear the caller's pointer before dropping our reference
++    *zcp = NULL;
++
++    if (env == NULL)
++        return;
++
++    av_rpi_zc_env_release(env);
++}
++
++//============================================================================
++//
++// Geometry
++//
++// This is a separate chunk from the rest
++
++// Get mailbox fd - should be in a lock when called
++// The fd is opened on the first call and kept in a static for later calls;
++// if the open returns -1 the next call tries again
++// Rely on process close to close it
++static int mbox_fd(void)
++{
++    static int fd = -1;
++
++    if (fd == -1)
++        fd = mbox_open();
++
++    return fd;
++}
++
++// Work out the buffer geometry (strides, plane heights, stripe layout) for
++// a frame of the given pixel format and size.
++//
++// The planar formats are calculated locally.  The sand formats query the
++// mailbox for the layout; the most recent answer for each case is kept in a
++// function-local static so repeated requests for the same size do not pay
++// the mailbox overhead again.
++//
++// Parameters:
++//   format        pixel format (AV_PIX_FMT_xxx)
++//   video_width   frame width
++//   video_height  frame height
++// Returns:
++//   Geometry by value.  For a format not handled here only format,
++//   video_width and video_height are set; every other field is zero.
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format, const unsigned int video_width, const unsigned int video_height)
++{
++    // A single lock shared by every sand case.  It protects the per-case
++    // cached images and, because mbox_fd() must be called under a lock, it
++    // has to be the same lock whichever format gets there first - so the
++    // cases below must not declare locks of their own.
++    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
++
++    AVRpiZcFrameGeometry geo = {
++        .format = format,
++        .video_width = video_width,
++        .video_height = video_height
++    };
++
++    switch (format)
++    {
++        case AV_PIX_FMT_YUV420P:
++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_YUV420P10:
++            geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_SAND128:
++        case AV_PIX_FMT_RPI4_8:
++        {
++            const unsigned int stripe_w = 128;
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.stripe_is_yc = 1;
++            if (geo.height_y * stripe_w > img.pitch)
++            {
++                // "tall" sand - all C blocks now follow Y
++                geo.height_y = img.pitch / stripe_w;
++                geo.height_c = geo.height_y;
++                geo.stripe_is_yc = 0;
++            }
++            geo.planes_c = 1;
++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_RPI4_10:
++        {
++            const unsigned int stripe_w = 128;  // bytes
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV10COL,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_SAND64_16:
++        case AV_PIX_FMT_SAND64_10:
++        {
++            const unsigned int stripe_w = 128;  // bytes
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV_16,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++            break;
++        }
++
++        default:
++            break;
++    }
++    return geo;
++}
++
++//============================================================================
++//
++// ZC Env fns
++//
++// Frame copy fns
++
++// Copy a three-plane frame into a newly allocated ZC buffer.
++//
++// Plane 0 is copied as src->height rows of src->width bytes; planes 1 and 2
++// as height / 2 rows of width / 2 bytes.
++// NOTE(review): that matches 8-bit 4:2:0 planar data - confirm callers only
++// get here with such frames.
++//
++// Parameters:
++//   zc   ZC environment to allocate the destination from
++//   src  frame to copy
++// Returns:
++//   The buffer holding the copy, or NULL if it could not be allocated.
++static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    AVFrame dest_frame;
++    AVFrame * const dest = &dest_frame;
++    unsigned int i;
++    uint8_t * psrc, * pdest;
++
++    dest->format = src->format;
++    dest->width = src->width;
++    dest->height = src->height;
++
++    if (av_rpi_zc_get_buffer(zc, dest) != 0)
++        return NULL;
++
++    if (av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0)
++    {
++        // get_buffer succeeded so dest holds a placeholder buffer; dest lives
++        // on our stack, so nobody else can release that buffer if we don't
++        av_buffer_unref(&dest->buf[0]);
++        return NULL;
++    }
++
++    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
++         i != dest->height;
++         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
++    {
++        memcpy(pdest, psrc, dest->width);
++    }
++    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++
++    return dest->buf[0];
++}
++
++
++// Placeholder for a YUV420P10 -> SAND128 converting copy.
++// Not implemented: hits assert(0) and, where assert() is compiled out,
++// returns NULL.
++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
++ const AVFrame * const src)
++{
++ assert(0);
++ return NULL;
++}
++
++
++// Placeholder for a SAND64 (16-bit container) -> SAND128 converting copy.
++// Not implemented: hits assert(0) and, where assert() is compiled out,
++// returns NULL.
++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
++ const AVFrame * const src, const unsigned int src_bits)
++{
++ assert(0);
++ return NULL;
++}
++
++//----------------------------------------------------------------------------
++//
++// Public info extraction calls
++
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
++
++// Get the ZcBufEnv behind an AVBufferRef.
++// Returns NULL if buf is NULL or is not one of our buffers.
++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
++{
++    if (buf == NULL)
++        return NULL;
++
++    // Kludge where we check the free fn to check this is really
++    // one of our buffers - can't think of a better way
++    if (buf->buffer->free != zc_buf_env_free_cb)
++        return NULL;
++
++    return av_buffer_get_opaque(buf);
++}
++
++// Get the GPU memory descriptor behind an AVBufferRef.
++// Returns NULL if buf is NULL or is not one of our buffers.
++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++
++    // Test explicitly rather than taking a member address through a
++    // possibly-NULL pointer and relying on gmem being the first element
++    return zbe == NULL ? NULL : &zbe->gmem;
++}
++
++// VCSM handle of the buffer, or 0 if fr_ref is not one of our buffers
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const gm = pic_gm_ptr(fr_ref);
++
++    if (gm == NULL)
++        return 0;
++
++    return gm->vcsm_handle;
++}
++
++// VC handle of the buffer, or -1 if fr_ref is not one of our buffers
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const gm = pic_gm_ptr(fr_ref);
++
++    if (gm == NULL)
++        return -1;
++
++    return gm->vc_handle;
++}
++
++// Offset recorded for the buffer, or 0 if fr_ref is not one of our buffers
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const env = pic_zbe_ptr(fr_ref);
++
++    if (env == NULL)
++        return 0;
++
++    return env->offset;
++}
++
++// Picture size (size_pic) of the buffer, or 0 if fr_ref is not one of ours
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const env = pic_zbe_ptr(fr_ref);
++
++    if (env == NULL)
++        return 0;
++
++    return env->size_pic;
++}
++
++// Allocated byte count of the buffer, or 0 if fr_ref is not one of ours
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const gm = pic_gm_ptr(fr_ref);
++
++    if (gm == NULL)
++        return 0;
++
++    return gm->numbytes;
++}
++
++// Geometry stored with the buffer, or NULL if fr_ref is not one of ours
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const env = pic_zbe_ptr(fr_ref);
++
++    if (env == NULL)
++        return NULL;
++
++    return &env->geo;
++}
++
++// Get a ZC buffer reference for a frame.
++//
++// Parameters:
++//   logctx           context passed to av_log
++//   zc               ZC environment; must be non-NULL if maycopy is set
++//   frame            frame to reference
++//   expected_format  pixel format the caller wants
++//   maycopy          if non-zero a frame that cannot be referenced directly
++//                    is copied into a new ZC buffer instead
++// Returns:
++//   A new reference (release with av_rpi_zc_unref), or NULL if the frame is
++//   unsuitable and copying is not allowed or the copy fails.
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
++{
++ av_assert0(!maycopy || zc != NULL);
++
++ // Only YUV420P, YUV420P10 and the sand formats are handled at all
++ if (frame->format != AV_PIX_FMT_YUV420P &&
++ frame->format != AV_PIX_FMT_YUV420P10 &&
++ !av_rpi_is_sand_frame(frame))
++ {
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
++ return NULL;
++ }
++
++ // More than one buffer, or not the format asked for: cannot be
++ // referenced as-is
++ if (frame->buf[1] != NULL || frame->format != expected_format)
++ {
++#if RPI_ZC_SAND_8_IN_10_BUF
++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
++ {
++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
++ }
++#endif
++
++ if (maycopy)
++ {
++ if (frame->buf[1] != NULL)
++ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
++ else
++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
++
++ // The two converting copies are stubs in this file (assert(0)
++ // then return NULL)
++ switch (frame->format)
++ {
++ case AV_PIX_FMT_YUV420P10:
++ return zc_420p10_to_sand128(zc, frame);
++
++ case AV_PIX_FMT_SAND64_10:
++ return zc_sand64_16_to_sand128(zc, frame, 10);
++
++ default:
++ return zc_copy(zc, frame);
++ }
++ }
++ else
++ {
++ if (frame->buf[1] != NULL)
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
++ else
++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
++ return NULL;
++ }
++ }
++
++ // Right shape and format, but the buffer was not allocated by ZC
++ if (pic_gm_ptr(frame->buf[0]) == NULL)
++ {
++ if (maycopy)
++ {
++ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
++ return zc_copy(zc, frame);
++ }
++ else
++ {
++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
++ return NULL;
++ }
++ }
++
++ // One of ours already - just take another reference
++ return av_buffer_ref(frame->buf[0]);
++}
++
++// Drop a reference obtained from av_rpi_zc_ref().  NULL is a no-op.
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
++{
++    if (fr_ref == NULL)
++        return;
++
++    // fr_ref is our by-value copy, so only the local is cleared
++    av_buffer_unref(&fr_ref);
++}
++
++//----------------------------------------------------------------------------
++
++// Extract user environment from an AVBufferRef
++// Returns the v pointer given to av_rpi_zc_buf(), or NULL if buf is not
++// one of our buffers or has no user buffer attached yet
++void * av_rpi_zc_buf_v(AVBufferRef * const buf)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++    const ZcUserBufEnv * zub;
++
++    if (zbe == NULL || zbe->user == NULL)
++        return NULL;
++
++    zub = (const ZcUserBufEnv *)zbe->user->data;
++    if (zub == NULL)
++        return NULL;
++
++    return zub->v;
++}
++
++// AV buffer pre-free callback
++// Free callback for the AVBuffer made by av_rpi_zc_buf(): calls the user's
++// free fn (if the table has one) and then frees the ZcUserBufEnv
++static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
++{
++    ZcUserBufEnv * const zub = opaque;
++
++    if (zub == NULL)
++        return;
++
++    if (zub->fn->free)
++        zub->fn->free(zub->v);
++
++    av_free(zub);
++}
++
++// Free callback for the AVBuffer made by av_rpi_zc_get_buffer(): drops the
++// user buffer, releases the env reference held by the buffer, destroys the
++// cond & lock and frees the ZcBufEnv
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
++{
++    ZcBufEnv * const zbe = opaque;
++
++    if (zbe == NULL)
++        return;
++
++    av_buffer_unref(&zbe->user);
++
++    if (zbe->zc != NULL)
++        av_rpi_zc_env_release(zbe->zc);
++
++    pthread_cond_destroy(&zbe->cond);
++    pthread_mutex_destroy(&zbe->lock);
++    av_free(zbe);
++}
++
++
++// Wrap the user's ZC buffer description in an AV Buffer.
++// Nothing is resolved here - the fn table is called later, when the
++// handles / mappings are wanted.
++//
++// Parameters:
++//   numbytes     size of the buffer
++//   addr_offset  offset to the first usable byte
++//   v            pointer passed to the callbacks
++//   fn_tab       callback table
++// Returns NULL on allocation failure (the free callback is not called)
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
++{
++    ZcUserBufEnv * const zub = av_malloc(sizeof(ZcUserBufEnv));
++    AVBufferRef * buf;
++
++    if (zub == NULL)
++        return NULL;
++
++    zub->offset = addr_offset;
++    zub->numbytes = numbytes;
++    zub->v = v;
++    zub->fn = fn_tab;
++
++    buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0);
++    if (buf != NULL)
++        return buf;
++
++    av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
++    av_free(zub);
++    return NULL;
++}
++
++// Turn a placeholder buffer from av_rpi_zc_get_buffer() into a usable one.
++//
++// Parameters:
++//   buf         buffer to resolve
++//   alloc_mode  ZC_RESOLVE_FAIL        return EAGAIN if not yet valid
++//               ZC_RESOLVE_WAIT_VALID  block until marked valid or broken
++//               ZC_RESOLVE_ALLOC       allocate backing memory if needed
++//               ZC_RESOLVE_ALLOC_VALID as ALLOC, and mark the buffer valid
++// Returns:
++//   0                success - buf->data / buf->size describe the picture
++//   AVERROR(EINVAL)  not one of our buffers, or marked broken
++//   AVERROR(EAGAIN)  not valid yet (ZC_RESOLVE_FAIL only)
++//   AVERROR(ENOMEM)  allocation or mapping failed
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
++        return AVERROR(EAGAIN);
++
++    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
++    {
++        pthread_mutex_lock(&zbe->lock);
++        while (!zbe->is_valid)
++            pthread_cond_wait(&zbe->cond, &zbe->lock);
++        pthread_mutex_unlock(&zbe->lock);
++    }
++
++    if (zbe->is_valid == ZC_BUF_NEVER)
++        return AVERROR(EINVAL);
++
++    // Do alloc if we need it
++    if (zbe->user == NULL)
++    {
++        ZcEnv * const zc = zbe->zc;
++        const ZcUserBufEnv * zub;
++
++        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
++
++        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++            goto fail;
++        }
++        zub = (const ZcUserBufEnv *)zbe->user->data;
++
++        // Track
++
++        zbe->offset = zub->offset;
++        zbe->gmem.numbytes = zub->numbytes;
++        if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL)
++        {
++            // gmem.vcsm_handle has not been fetched yet at this point, so
++            // ask the callback for the handle instead of logging a stale value
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zub->fn->vcsm_handle(zub->v));
++            goto fail;
++        }
++
++        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
++            goto fail;
++        }
++
++        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++
++        // Point the underlying AVBuffer at the real picture memory
++        buf->buffer->data = zbe->gmem.arm + zbe->offset;
++        buf->buffer->size = zbe->size_pic;
++
++        // In this mode we shouldn't have anyone waiting for us
++        // so no need to signal
++        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
++            zbe->is_valid = 1;
++    }
++
++    // Just overwrite - no point in testing
++    buf->data = zbe->gmem.arm + zbe->offset;
++    buf->size = zbe->size_pic;
++    return 0;
++
++fail:
++    // Drops the user buffer (if any) so a later call can try again
++    av_buffer_unref(&zbe->user);
++    return AVERROR(ENOMEM);
++}
++
++// Resolve frame->buf[0] (see av_rpi_zc_resolve_buffer for may_alloc) and,
++// if the frame's data pointers have not been filled in yet, derive
++// linesize[] / data[] from the geometry stored with the buffer.
++// Returns 0 on success or the error from av_rpi_zc_resolve_buffer.
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
++{
++    // Do alloc if we need it
++    const int rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc);
++    const ZcBufEnv * zbe;
++    const AVRpiZcFrameGeometry * geo;
++
++    if (rv != 0)
++        return rv;
++
++    // Pointers already imported - nothing more to do
++    if (frame->data[0] != NULL)
++        return 0;
++
++    // If we are a framebuf copy then the alloc can be done but we haven't
++    // imported its results yet
++    zbe = pic_zbe_ptr(frame->buf[0]);
++    geo = &zbe->geo;
++
++    frame->linesize[0] = geo->stride_y;
++    frame->linesize[1] = geo->stride_c;
++    frame->linesize[2] = geo->stride_c;
++
++    // abuse: linesize[3] = "stripe stride"
++    // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++    // In a general case this makes the calculation an xor and multiply rather
++    // than a divide and multiply
++    if (geo->stripes > 1)
++    {
++        if (geo->stripe_is_yc)
++            frame->linesize[3] = geo->height_y + geo->height_c;
++        else
++            frame->linesize[3] = geo->height_y;
++    }
++
++    frame->data[0] = frame->buf[0]->data;
++
++    // Chroma follows the luma of the first stripe when stripes are Y then C,
++    // otherwise it follows the luma of every stripe
++    if (geo->stripe_is_yc)
++        frame->data[1] = frame->data[0] + zbe->size_y;
++    else
++        frame->data[1] = frame->data[0] + zbe->size_y * geo->stripes;
++
++    if (geo->planes_c > 1)
++        frame->data[2] = frame->data[1] + zbe->size_c;
++
++    frame->extended_data = frame->data;
++    // Leave extended buf alone
++
++    return 0;
++}
++
++// Mark the frame's buffer as valid and wake anything waiting for it in
++// av_rpi_zc_resolve_buffer(ZC_RESOLVE_WAIT_VALID).
++// Returns 0, or AVERROR(EINVAL) if buf[0] is not one of our buffers.
++int av_rpi_zc_set_valid_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // The waiter tests is_valid with zbe->lock held.  The flag has to change
++    // under the same lock, otherwise the change + broadcast can land between
++    // the waiter's test and its pthread_cond_wait() and the wakeup is lost.
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_VALID;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++// Mark the frame's buffer as never going to be valid and wake anything
++// waiting for it in av_rpi_zc_resolve_buffer(ZC_RESOLVE_WAIT_VALID), which
++// then fails with EINVAL.
++// Returns 0, or AVERROR(EINVAL) if buf[0] is not one of our buffers.
++int av_rpi_zc_set_broken_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    // The waiter tests is_valid with zbe->lock held.  The flag has to change
++    // under the same lock, otherwise the change + broadcast can land between
++    // the waiter's test and its pthread_cond_wait() and the wakeup is lost.
++    pthread_mutex_lock(&zbe->lock);
++    zbe->is_valid = ZC_BUF_NEVER;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++
++    return 0;
++}
++
++// Store the decoder's pool size in the ZC environment so that it can be
++// read back with av_rpi_zc_get_decoder_pool_size()
++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
++{
++ zc->pool_size = pool_size;
++}
++
++// Read back the pool size last stored with
++// av_rpi_zc_set_decoder_pool_size() (0 if it has never been set, as the
++// env is created with pool_size = 0)
++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
++{
++ return zc->pool_size;
++}
++
++// Attach a placeholder ZC buffer to a frame.
++//
++// In the active (#if 1) branch no picture memory is allocated here: the
++// frame gets an AVBuffer wrapping a ZcBufEnv that records the geometry and
++// sizes worked out from frame->format / width / height.  The memory itself
++// is obtained later by av_rpi_zc_resolve_buffer / av_rpi_zc_resolve_frame.
++//
++// All of frame->buf[], data[] and linesize[] are cleared first.
++// Returns 0 on success or AVERROR(ENOMEM).
++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
++{
++#if 1
++ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
++
++ // Clear before the allocation check so the frame is in a known state
++ // on every return path
++ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++ frame->buf[i] = NULL;
++ frame->data[i] = NULL;
++ frame->linesize[i] = 0;
++ }
++
++ if (zbe == NULL)
++ return AVERROR(ENOMEM);
++
++ // zc_buf_env_free_cb is also how pic_zbe_ptr() recognises our buffers
++ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
++ {
++ av_free(zbe);
++ return AVERROR(ENOMEM);
++ }
++
++ pthread_mutex_init(&zbe->lock, NULL);
++ pthread_cond_init(&zbe->cond, NULL);
++ // The buffer holds a reference on the env; dropped in zc_buf_env_free_cb
++ zbe->zc = zc;
++ atomic_fetch_add(&zc->refcount, 1);
++
++ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use
++ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
++ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
++ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
++
++#else
++ // Compiled out: older path that allocated the picture memory immediately
++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
++ const unsigned int size_y = geo.stride_y * geo.height_y;
++ const unsigned int size_c = geo.stride_c * geo.height_c;
++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
++ AVBufferRef * buf;
++ unsigned int i;
++
++// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
++
++ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
++ {
++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++ return AVERROR(ENOMEM);
++ }
++
++ // Track
++ atomic_fetch_add(&zc->refcount, 1);
++ pic_zbe_ptr(buf)->zc = zc;
++
++ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++ frame->buf[i] = NULL;
++ frame->data[i] = NULL;
++ frame->linesize[i] = 0;
++ }
++
++ frame->buf[0] = buf;
++
++ frame->linesize[0] = geo.stride_y;
++ frame->linesize[1] = geo.stride_c;
++ frame->linesize[2] = geo.stride_c;
++ // abuse: linesize[3] = "stripe stride"
++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++ // In a general case this makes the calculation an xor and multiply rather
++ // than a divide and multiply
++ if (geo.stripes > 1)
++ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
++
++ frame->data[0] = buf->data;
++ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
++ if (geo.planes_c > 1)
++ frame->data[2] = frame->data[1] + size_c;
++
++ frame->extended_data = frame->data;
++ // Leave extended buf alone
++
++#if RPI_ZC_SAND_8_IN_10_BUF != 0
++ // *** If we intend to use this for real we will want a 2nd buffer pool
++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
++#endif
++#endif
++
++ return 0;
++}
++
++// Drop one reference on the env.  When the last reference goes the pool is
++// freed through the registered free_pool callback and the env itself freed.
++void av_rpi_zc_env_release(const AVZcEnvPtr zc)
++{
++    // atomic_fetch_add returns the count as it was before the decrement
++    if (atomic_fetch_add(&zc->refcount, -1) != 1)
++        return;
++
++    // was 1, now 0
++    zc->free_pool(zc->pool_env);
++    av_free(zc);
++}
++
++// Create a ZC environment around a caller-supplied pool.
++//
++// Parameters:
++//   logctx        context passed to av_log
++//   pool_env      opaque pool pointer handed to both callbacks
++//   alloc_buf_fn  called to allocate a buffer for a frame
++//   free_pool_fn  called when the last reference on the env is released
++// Returns the env with a refcount of 1, or NULL if allocation fails
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++    void * pool_env,
++    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
++
++    if (zc != NULL)
++    {
++        *zc = (ZcEnv){
++            .refcount = ATOMIC_VAR_INIT(1),
++            .pool_env = pool_env,
++            .alloc_buf = alloc_buf_fn,
++            .free_pool = free_pool_fn,
++            .pool_size = 0
++        };
++        return zc;
++    }
++
++    av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
++    return NULL;
++}
++
++//============================================================================
++//
++// External ZC initialisation
++
++#define RPI_GET_BUFFER2 1
++
++
++// get_buffer2 replacement installed by av_rpi_zc_init2().
++// Allocates ZC buffers for YUV420P and sand frames when the codec has
++// AV_CODEC_CAP_DR1; everything else goes to avcodec_default_get_buffer2.
++// s->opaque is the ZcEnv set up by av_rpi_zc_init2().
++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
++{
++#if !RPI_GET_BUFFER2
++ return avcodec_default_get_buffer2(s, frame, flags);
++#else
++ int rv;
++
++ // Codec does not support caller-allocated buffers
++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
++ {
++// printf("Do default alloc: format=%#x\n", frame->format);
++ rv = avcodec_default_get_buffer2(s, frame, flags);
++ }
++ else if (frame->format == AV_PIX_FMT_YUV420P ||
++ av_rpi_is_sand_frame(frame))
++ {
++ // Placeholder first, then allocate and mark valid straight away
++ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
++ }
++ else
++ {
++ rv = avcodec_default_get_buffer2(s, frame, flags);
++ }
++
++#if 0
++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
++ frame->format, frame->width, frame->height,
++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
++ frame->data[0], frame->data[1], frame->data[2],
++ frame->buf[0], frame->buf[1], frame->buf[2],
++ av_buffer_get_opaque(frame->buf[0]));
++#endif
++ return rv;
++#endif
++}
++
++// Non-zero if ZC is installed on this context, i.e. its get_buffer2 is
++// the ZC allocator set by av_rpi_zc_init2()
++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{
++ return s->get_buffer2 == zc_get_buffer2;
++}
++
++// Install ZC on a codec context.
++// Creates an env around the caller's pool, saves the context's current
++// opaque / get_buffer2 / thread_safe_callbacks in it and replaces them with
++// the ZC versions.  Asserts that ZC is not already installed.
++// Returns 0 on success or AVERROR(ENOMEM).
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++    void * pool_env,
++    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * zc;
++
++    av_assert0(!av_rpi_zc_in_use(s));
++
++    zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn);
++    if (zc == NULL)
++        return AVERROR(ENOMEM);
++
++    // Remember what we are about to overwrite so uninit2 can put it back
++    zc->old = (ZcOldCtxVals){
++        .opaque = s->opaque,
++        .get_buffer2 = s->get_buffer2,
++        .thread_safe_callbacks = s->thread_safe_callbacks
++    };
++
++    s->thread_safe_callbacks = 1;
++    s->get_buffer2 = zc_get_buffer2;
++    s->opaque = zc;
++
++    return 0;
++}
++
++// Remove ZC from a codec context.
++// Puts back the values saved by av_rpi_zc_init2() and drops the context's
++// reference on the env.  Asserts that ZC is currently installed.
++void av_rpi_zc_uninit2(struct AVCodecContext * const s)
++{
++    ZcEnv * const env = s->opaque;
++
++    av_assert0(av_rpi_zc_in_use(s));
++
++    s->thread_safe_callbacks = env->old.thread_safe_callbacks;
++    s->opaque = env->old.opaque;
++    s->get_buffer2 = env->old.get_buffer2;
++
++    av_rpi_zc_env_release(env);
++}
++
+--- /dev/null
++++ b/libavcodec/rpi_zc.h
+@@ -0,0 +1,228 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef LIBAVCODEC_RPI_ZC_H
++#define LIBAVCODEC_RPI_ZC_H
++
++// Zero-Copy frame code for RPi
++// RPi needs Y/U/V planes to be contiguous for display. By default
++// ffmpeg will allocate separated planes so a memcpy is needed before
++// display. This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
++
++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
++// 0 disables
++// *** This option still in development
++// Only works if SAO active
++// Allocates buffers that are twice the required size
++#define RPI_ZC_SAND_8_IN_10_BUF 0
++
++struct AVBufferRef;
++struct AVFrame;
++struct AVCodecContext;
++enum AVPixelFormat;
++
++// "Opaque" pointer to whatever we are using as a buffer reference
++typedef struct AVBufferRef * AVRpiZcRefPtr;
++
++struct AVZcEnv;
++typedef struct AVZcEnv * AVZcEnvPtr;
++
++// Buffer layout for one frame, as returned by av_rpi_zc_frame_geometry()
++typedef struct AVRpiZcFrameGeometry
++{
++ unsigned int stride_y; // Luma stride (bytes)
++ unsigned int height_y; // Luma height (lines)
++ unsigned int stride_c; // Chroma stride (bytes)
++ unsigned int height_c; // Chroma height (lines)
++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
++ unsigned int stripes; // Number of stripes (sand)
++ unsigned int bytes_per_pel; // Set to 1 or 2 by av_rpi_zc_frame_geometry()
++ int stripe_is_yc; // A single stripe is Y then C (false for tall sand)
++
++ int format; // Requested format
++ unsigned int video_width; // Requested width
++ unsigned int video_height; // Requested height
++} AVRpiZcFrameGeometry;
++
++// Get expected MMAL geometry for a given format, width & height
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++ const int format,
++ const unsigned int video_width, const unsigned int video_height);
++
++//----------------------------------------------------------------------------
++//
++// Calls that extract info from a ZC frame whether internally or externally
++// allocated
++
++// Generate a ZC reference to the buffer(s) in this frame
++// If the buffer doesn't appear to be one allocated by ZC
++// then the behaviour depends on maycopy:
++// If maycopy=0 then return NULL
++// If maycopy=1 && the src frame is in a form where we can easily copy
++// the data, then allocate a new buffer and copy the data into it
++// Otherwise return NULL
++// If maycopy == 0 then ZC may be NULL
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
++
++// Unreference the buffer refed/allocated by _zc_ref
++// If fr_ref is NULL then this will NOP
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
++
++// Get the vc_handle from the frame ref
++// Returns -1 if ref doesn't look valid
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get the vcsm_handle from the frame ref
++// Returns 0 if ref doesn't look valid
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
++// Get the number of bytes allocated from the frame ref
++// Returns 0 if ref doesn't look valid
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
++// Geometry this frame was allocated with
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
++
++//----------------------------------------------------------------------------
++//
++// Calls for external frame allocation
++
++// Callbacks registered in av_rpi_zc_init2
++
++// Callback to allocate a buf for a frame
++// The frame itself is generated in the calling code
++//
++// Parameters:
++// pool_env value passed to av_rpi_zc_init2
++// size size wanted
++// geo geometry of the frame to be allocated
++// Returns:
++// NULL Alloc failed
++// ptr AVBufferRef* of allocated buffer
++// In most cases av_rpi_zc_buf will be called by this function
++// and this will be the buf returned by that.
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
++ const AVRpiZcFrameGeometry * geo);
++
++// Callback once ffmpeg is completely done with this pool
++// Called once all allocated buffers have been derefed and ffmpeg's ref to this
++// pool has been dropped
++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
++
++// Init ZC into a context
++// Sets opaque, get_buffer2, thread_safe_callbacks
++// Use if you want to allocate your own pools and/or create ZC buffers for
++// all decoders
++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
++// apart by av_rpi_zc_xxx calls without this
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn);
++
++// Free ZC from a context
++void av_rpi_zc_uninit2(struct AVCodecContext * const s);
++
++// Get minimum pool size in frames - valid by the time the first alloc request
++// occurs. Takes into account thread requests and DPB sizes derived from SPS
++// rather than just adding a worst case DPB size.
++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
++
++typedef struct av_rpi_zc_buf_fn_tab_s {
++ // This AVBuffer is being freed by ffmpeg - return memory
++ // to external pool. Memory may be, but need not be, unmapped.
++ // v is the ptr passed in av_rpi_zc_buf
++ void (* free)(void * v);
++
++ // Return appropriate handles / mappings
++ // v is the ptr passed in av_rpi_zc_buf
++ unsigned int (* vcsm_handle)(void * v);
++ unsigned int (* vc_handle)(void * v);
++ void * (* map_arm)(void * v);
++ unsigned int (* map_vc)(void * v);
++} av_rpi_zc_buf_fn_tab_t;
++
++// Allocate a ZC AVBufferRef and set its callback table
++// Doesn't take a buffer address directly - relies on callbacks to return
++// addresses as they are required. Mappings need not be generated until
++// the map callbacks are called but they should persist from then until
++// the buffer is freed.
++//
++// Parameters:
++// numbytes Size of the buffer
++// addr_offset Offset to first usable byte of buffer (for alignment)
++// normally 0
++// v Pointer passed to callbacks
++// fn_tab Function table
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
++
++// Get the v ptr that was passed to av_rpi_zc_buf
++void * av_rpi_zc_buf_v(AVBufferRef * const buf);
++
++//----------------------------------------------------------------------------
++//
++// Mostly internal calls but might possibly be wanted by outside code
++
++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
++
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
++
++// Get buffer generates placeholders for later alloc
++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
++// Resolve actually does the alloc (noop if already alloced)
++// Set data pointers on a buffer/frame that was copied before the alloc
++// occurred
++#define ZC_RESOLVE_FAIL 0 // return error on invalid
++#define ZC_RESOLVE_ALLOC 1 // alloc as invalid
++#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid
++#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
++
++int av_rpi_zc_set_valid_frame(AVFrame * const frame);
++int av_rpi_zc_set_broken_frame(AVFrame * const frame);
++
++
++
++
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++ void * pool_env,
++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++ av_rpi_zc_free_pool_fn_t * free_pool_fn);
++void av_rpi_zc_env_release(const AVZcEnvPtr zc);
++
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/rpi_zc_frames.h
+@@ -0,0 +1,142 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_ZC_FRAMES_H
++#define RPI_ZC_FRAMES_H
++
++#define RPI_ONE_BUF 1
++
++#include "rpi_mem.h" // for GPU_MEM_PTR_T
++#include "libavutil/frame.h"
++
++#if !RPI_ONE_BUF
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++ return p->vc;
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++ return p->vc;
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++ return p->vc;
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++}
++
++#else
++
++// Returns 1 when frame->buf[1] is NULL, 0 otherwise.
++// get_vc_address3 uses this to decide whether every plane is described by
++// the single buffer in buf[0].
++static inline int gpu_is_buf1(const AVFrame * const frame)
++{
++    return !frame->buf[1];
++}
++
++// Returns the opaque pointer attached to frame->buf[0] (fetched with
++// av_buffer_get_opaque), typed as a GPU_MEM_PTR_T descriptor.
++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
++{
++ return av_buffer_get_opaque(frame->buf[0]);
++}
++
++// Returns the opaque pointer attached to frame->buf[n] (fetched with
++// av_buffer_pool_buffer_get_opaque), typed as a GPU_MEM_PTR_T descriptor.
++// n is not range checked here.
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
++{
++ return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
++}
++
++// Translate the ARM-side pointer frame->data[n] into the matching `vc`
++// address.
++// The descriptor comes from buf[0] when the frame has no buf[1], otherwise
++// from buf[n]; the result is the descriptor's vc base plus the byte offset
++// of data[n] from the descriptor's arm base.
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++    const GPU_MEM_PTR_T * gm;
++
++    if (gpu_is_buf1(frame))
++        gm = gpu_buf1_gmem(frame);
++    else
++        gm = gpu_buf3_gmem(frame, n);
++
++    return gm->vc + (frame->data[n] - gm->arm);
++}
++
++
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ return get_vc_address3(frame, 0);
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ return get_vc_address3(frame, 1);
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.numbytes = frame->data[1] - frame->data[0];
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[1] - frame->data[0];
++ g.vc += frame->data[1] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 1);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[2] - frame->data[0];
++ g.vc += frame->data[2] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
++
++#endif
+--- /dev/null
++++ b/libavcodec/rpivid_hevc.c
+@@ -0,0 +1,2128 @@
++// FFMPEG HEVC decoder hardware accelerator
++// Andrew Holme, Argon Design Ltd
++// Copyright (c) June 2017 Raspberry Pi Ltd
++
++#include <stdio.h>
++#include <fcntl.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <unistd.h>
++#include <sys/mman.h>
++
++#include "fftools/ffmpeg.h"
++#include "libavutil/avassert.h"
++#include "libavutil/imgutils.h"
++#include "avcodec.h"
++#include "hwconfig.h"
++#include "decode.h"
++
++#include "hevc.h"
++#include "hevcdec.h"
++#include "rpi_zc.h"
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++#include "rpi_mailbox.h"
++
++
++#define OPT_PHASE_TIMING 0 // Generate stats for phase usage
++
++#define OPT_EMU 0
++
++#define TRACE_DEV 0
++#define TRACE_ENTRY 0
++
++#define NUM_SCALING_FACTORS 4064
++
++#define AXI_BASE64 0
++
++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
++
++#define RPIVID_COL_PICS 17 // 16 ref & current
++
++#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1)
++#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size
++
++#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2)
++#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// Register offsets
++
++#define RPI_SPS0 0
++#define RPI_SPS1 4
++#define RPI_PPS 8
++#define RPI_SLICE 12
++#define RPI_TILESTART 16
++#define RPI_TILEEND 20
++#define RPI_SLICESTART 24
++#define RPI_MODE 28
++#define RPI_LEFT0 32
++#define RPI_LEFT1 36
++#define RPI_LEFT2 40
++#define RPI_LEFT3 44
++#define RPI_QP 48
++#define RPI_CONTROL 52
++#define RPI_STATUS 56
++#define RPI_VERSION 60
++#define RPI_BFBASE 64
++#define RPI_BFNUM 68
++#define RPI_BFCONTROL 72
++#define RPI_BFSTATUS 76
++#define RPI_PUWBASE 80
++#define RPI_PUWSTRIDE 84
++#define RPI_COEFFWBASE 88
++#define RPI_COEFFWSTRIDE 92
++#define RPI_SLICECMDS 96
++#define RPI_BEGINTILEEND 100
++#define RPI_TRANSFER 104
++#define RPI_CFBASE 108
++#define RPI_CFNUM 112
++#define RPI_CFSTATUS 116
++
++#define RPI_PURBASE 0x8000
++#define RPI_PURSTRIDE 0x8004
++#define RPI_COEFFRBASE 0x8008
++#define RPI_COEFFRSTRIDE 0x800C
++#define RPI_NUMROWS 0x8010
++#define RPI_CONFIG2 0x8014
++#define RPI_OUTYBASE 0x8018
++#define RPI_OUTYSTRIDE 0x801C
++#define RPI_OUTCBASE 0x8020
++#define RPI_OUTCSTRIDE 0x8024
++#define RPI_STATUS2 0x8028
++#define RPI_FRAMESIZE 0x802C
++#define RPI_MVBASE 0x8030
++#define RPI_MVSTRIDE 0x8034
++#define RPI_COLBASE 0x8038
++#define RPI_COLSTRIDE 0x803C
++#define RPI_CURRPOC 0x8040
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Unused but left here to illustrate the differences between FFmpeg's prob
++// structure and the rpivid one
++
++struct FFM_PROB {
++ uint8_t sao_merge_flag [ 1];
++ uint8_t sao_type_idx [ 1];
++ uint8_t split_coding_unit_flag [ 3];
++ uint8_t cu_transquant_bypass_flag [ 1];
++ uint8_t skip_flag [ 3];
++ uint8_t cu_qp_delta [ 3];
++ uint8_t pred_mode_flag [ 1];
++ uint8_t part_mode [ 4];
++ uint8_t prev_intra_luma_pred_flag [ 1];
++ uint8_t intra_chroma_pred_mode [ 2];
++ uint8_t merge_flag [ 1];
++ uint8_t merge_idx [ 1];
++ uint8_t inter_pred_idc [ 5];
++ uint8_t ref_idx_l0 [ 2];
++ uint8_t ref_idx_l1 [ 2];
++ uint8_t abs_mvd_greater0_flag [ 2];
++ uint8_t abs_mvd_greater1_flag [ 2];
++ uint8_t mvp_lx_flag [ 1];
++ uint8_t no_residual_data_flag [ 1];
++ uint8_t split_transform_flag [ 3];
++ uint8_t cbf_luma [ 2];
++ uint8_t cbf_cb_cr [ 4];
++ uint8_t transform_skip_flag/*[][]*/ [ 2];
++ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2];
++ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2];
++ uint8_t last_significant_coeff_x_prefix [18];
++ uint8_t last_significant_coeff_y_prefix [18];
++ uint8_t significant_coeff_group_flag [ 4];
++ uint8_t significant_coeff_flag [44];
++ uint8_t coeff_abs_level_greater1_flag [24];
++ uint8_t coeff_abs_level_greater2_flag [ 6];
++ uint8_t log2_res_scale_abs [ 8];
++ uint8_t res_scale_sign_flag [ 2];
++ uint8_t cu_chroma_qp_offset_flag [ 1];
++ uint8_t cu_chroma_qp_offset_idx [ 1];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_PROB {
++ uint8_t SAO_MERGE_FLAG [ 1];
++ uint8_t SAO_TYPE_IDX [ 1];
++ uint8_t SPLIT_FLAG [ 3];
++ uint8_t CU_SKIP_FLAG [ 3];
++ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1];
++ uint8_t PRED_MODE [ 1];
++ uint8_t PART_SIZE [ 4];
++ uint8_t INTRA_PRED_MODE [ 1];
++ uint8_t CHROMA_PRED_MODE [ 1];
++ uint8_t MERGE_FLAG_EXT [ 1];
++ uint8_t MERGE_IDX_EXT [ 1];
++ uint8_t INTER_DIR [ 5];
++ uint8_t REF_PIC [ 2];
++ uint8_t MVP_IDX [ 1];
++ uint8_t MVD [ 2];
++ uint8_t QT_ROOT_CBF [ 1];
++ uint8_t TRANS_SUBDIV_FLAG [ 3];
++ uint8_t QT_CBF [ 6];
++ uint8_t DQP [ 2];
++ uint8_t ONE_FLAG [24];
++ uint8_t LASTX [18];
++ uint8_t LASTY [18];
++ uint8_t SIG_CG_FLAG [ 4];
++ uint8_t ABS_FLAG [ 6];
++ uint8_t TRANSFORMSKIP_FLAG [ 2];
++ uint8_t SIG_FLAG [42];
++ uint8_t SIG_FLAG_unused [ 2];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++// One queued register write. p1_apb_write() appends these to
++// dec_env_t.cmd_fifo.
++struct RPI_CMD {
++ uint32_t addr; // addr argument of p1_apb_write()
++ uint32_t data; // data argument of p1_apb_write()
++} __attribute__((packed));
++
++// One queued data block. p1_axi_write() appends these to
++// dec_env_t.bit_fifo.
++struct RPI_BIT {
++ int cmd; // cmd_idx argument of p1_axi_write()
++ const void *ptr; // ptr argument of p1_axi_write(); not copied, only referenced
++ int len; // len argument of p1_axi_write()
++};
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_T;
++
++// Actual addressability is 38bits but we can only alloc in the bottom 32
++// currently - when passed to rpivid h/w the address is always >> 6 so will
++// fit in 32 bit there
++// At some point we may want to make this uint64_t
++typedef uint32_t vid_vc_addr_t;
++
++typedef enum rpivid_decode_state_e {
++ RPIVID_DECODE_NEW = 0,
++ RPIVID_DECODE_START,
++ RPIVID_DECODE_SLICE,
++ RPIVID_DECODE_END,
++} rpivid_decode_state_t;
++
++#define RPI_PROB_VALS 154U
++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
++
++// Per-decode working state (typedef'd as dec_env_t).
++typedef struct dec_env_s {
++ const AVCodecContext * avctx;
++
++ rpivid_decode_state_t state;
++ unsigned int decode_order;
++
++ int phase_no; // Current phase (i.e. the last one we waited for)
++ // NOTE(review): presumably links this env into a phase_wait_env_t.q
++ // list - confirm against the phase wait code
++ struct dec_env_s * phase_wait_q_next;
++ sem_t phase_wait;
++
++ // Growable arrays appended to by p1_axi_write() / p1_apb_write().
++ // *_len is the number of entries in use, *_max the allocated capacity
++ // (doubled whenever the array is full).
++ struct RPI_BIT *bit_fifo;
++ struct RPI_CMD *cmd_fifo;
++ unsigned int bit_len, bit_max;
++ unsigned int cmd_len, cmd_max;
++ unsigned int num_slice_msgs;
++ unsigned int PicWidthInCtbsY;
++ unsigned int PicHeightInCtbsY;
++ unsigned int dpbno_col;
++ // Value written to RPI_SLICESTART; new_slice_segment() recomputes it
++ // only for slice segments that are not dependent
++ uint32_t reg_slicestart;
++ // CTB column / row recorded by wpp_entry_point()
++ unsigned int wpp_entry_x;
++ unsigned int wpp_entry_y;
++
++ const uint8_t * nal_buffer;
++ size_t nal_size;
++
++ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3];
++ // Expanded scaling matrices, filled by populate_scaling_factors() and
++ // queued to the h/w by WriteScalingFactors()
++ uint8_t scaling_factors[NUM_SCALING_FACTORS];
++// unsigned int RefPicList[2][HEVC_MAX_REFS];
++} dec_env_t;
++
++#define RPIVID_PHASES 3
++#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order
++#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order
++
++#if OPT_PHASE_TIMING
++static const unsigned int time_thresholds[8] = {
++ 10, 15, 20, 30, 45, 60, 75, 90
++};
++#endif
++
++typedef struct phase_wait_env_s {
++ unsigned int last_order;
++ dec_env_t * q;
++#if OPT_PHASE_TIMING
++ uint64_t phase_time;
++ uint64_t max_phase_time;
++ uint64_t time_in_phase;
++ uint64_t time_out_phase;
++ unsigned int max_time_decode_order;
++ unsigned int time_bins[9];
++ unsigned int time_bins3[9];
++ unsigned int time_bins5[9];
++ uint64_t time_stash[16];
++ unsigned int i3;
++#endif
++} phase_wait_env_t; // Single linked list of threads waiting for this phase
++
++// State shared by all decodes on this device (typedef'd as RPI_T).
++typedef struct RPI_T {
++ atomic_int ref_count;
++ sem_t ref_zero;
++
++ dec_env_t ** dec_envs;
++ AVZcEnvPtr zc;
++
++ pthread_mutex_t phase_lock;
++ phase_wait_env_t phase_reqs[RPIVID_PHASES]; // one wait queue per phase
++
++ // Register window accessed through apb_read() / apb_write*(), which
++ // index it as 32-bit words (byte offset >> 2)
++ volatile uint32_t * regs;
++ // Word 0 is polled and written back by int_wait()
++ volatile uint32_t * ints;
++
++ GPU_MEM_PTR_T gcolbuf;
++ unsigned int col_stride;
++ size_t col_picsize;
++
++ unsigned int bitbuf_no;
++ sem_t bitbuf_sem;
++ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS];
++
++ unsigned int max_pu_msgs;
++ unsigned int coeffbuf_no;
++ sem_t coeffbuf_sem;
++ GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS];
++
++ unsigned int decode_order;
++ int mbox_fd;
++ int gpu_init_type;
++} RPI_T;
++
++#if OPT_PHASE_TIMING
++// Current CLOCK_MONOTONIC time in microseconds
++static uint64_t tus64(void)
++{
++    struct timespec now;
++    uint64_t usec;
++
++    clock_gettime(CLOCK_MONOTONIC, &now);
++    usec = (uint64_t)now.tv_sec * 1000000;
++    usec += now.tv_nsec / 1000;
++    return usec;
++}
++#endif
++
++// Round x up to the next multiple of 64 (values already on a multiple of
++// 64 are returned unchanged)
++static inline unsigned int rnd64(unsigned int x)
++{
++    const unsigned int align_mask = 63;
++
++    return (x + align_mask) & ~align_mask;
++}
++
++// sem_wait() that restarts when interrupted by a signal.
++// Returns 0 on success, or the failing sem_wait() result for any error
++// other than EINTR (errno is left as sem_wait set it).
++static inline int rpi_sem_wait(sem_t * const sem)
++{
++    for (;;) {
++        const int rv = sem_wait(sem);
++
++        if (rv == 0 || errno != EINTR)
++            return rv;
++    }
++}
++
++//============================================================================
++
++#define REGS_NAME "/dev/rpivid-hevcmem"
++#define REGS_SIZE 0x10000
++#define INTS_NAME "/dev/rpivid-intcmem"
++#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway
++
++// Map a device node into this process, read/write and shared.
++//
++// Parameters:
++//  avctx     logging context for av_log
++//  dev_name  path of the device node to open
++//  size      number of bytes to map, starting at offset 0
++// Returns:
++//  pointer to the mapping, or NULL (after logging a warning) if either the
++//  open or the mmap failed
++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
++{
++    void * dev_map;
++    int dev_fd;
++
++    if ((dev_fd = open(dev_name, O_RDWR|O_SYNC)) < 0) {
++        av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
++        return NULL;
++    }
++
++    // Now map it
++    dev_map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0);
++
++    close(dev_fd); // No longer need the FD
++
++    if (dev_map == MAP_FAILED) {
++        // Newline terminated like the other log messages, and names the
++        // device so the two mappings can be told apart
++        av_log(avctx, AV_LOG_WARNING, "mapping %s failed\n", dev_name);
++        return NULL;
++    }
++
++    return (volatile uint32_t *)dev_map;
++}
++
++// Unmap a mapping (such as one returned by map_dev) and NULL the caller's
++// pointer. Does nothing if *p_gpio_map is already NULL.
++//
++// Parameters:
++//  p_gpio_map  address of the pointer to the mapping
++//  size        size in bytes passed to munmap
++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
++{
++    volatile uint32_t * const mapping = *p_gpio_map;
++
++    if (mapping == NULL)
++        return;
++
++    *p_gpio_map = NULL;
++    munmap((void *)mapping, size);
++}
++
++#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing!
++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
++
++// Write a vc address to the register at byte offset addr.
++// The value stored is MANGLE64(data): the top two bits of the address are
++// cleared and the result is shifted right by 6.
++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
++{
++    const uint32_t reg_val = MANGLE64(data);
++
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, reg_val);
++#endif
++
++    rpi->regs[addr >> 2] = reg_val;
++}
++
++// Write a length to the register at byte offset addr, stored as
++// data >> 6 (truncating - no rounding up is applied).
++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
++{
++    const unsigned int len64 = data >> 6; // ?? rnd64 - but not currently needed
++
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, len64);
++#endif
++
++    rpi->regs[addr >> 2] = len64;
++}
++
++// Write data unmodified to the 32-bit register at byte offset addr
++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
++{
++    volatile uint32_t * const reg = rpi->regs + (addr >> 2);
++
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, data);
++#endif
++
++    *reg = data;
++}
++
++// Read and return the 32-bit register at byte offset addr
++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
++{
++    const volatile uint32_t * const reg = rpi->regs + (addr >> 2);
++    const uint32_t val = *reg;
++
++#if TRACE_DEV
++    printf("R %x (=%x)\n", addr, val);
++#endif
++
++    return val;
++}
++
++#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001
++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002
++#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004
++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008
++#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010
++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020
++#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040
++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080
++
++// Wait for the "done" bit of the given phase to appear in word 0 of
++// rpi->ints, polling once per millisecond, then write the last value read
++// back with the phase's "reset" bit cleared.
++// phase == 1 waits on ARG_IC_ICTRL_ACTIVE1_INT_SET and masks out
++// ARG_IC_ICTRL_ACTIVE2_INT_SET in the write-back; any other phase value
++// does the reverse.
++// There is no timeout: this never returns if the bit is never set.
++// NOTE(review): the write-back leaves the done bit set, which presumably
++// acknowledges it (write-1-to-clear) - confirm against the h/w docs
++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
++{
++ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;
++ const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
++ uint32_t ival;
++ // ival keeps the value from the final poll for the write-back below
++ while (((ival = rpi->ints[0]) & mask_done) == 0) {
++ usleep(1000);
++ }
++ rpi->ints[0] = ival & mask_reset;
++}
++
++#if TRACE_DEV && 0
++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
++ int i;
++
++ for (i=0; i<num; i++)
++ {
++ if ((i%4)==0)
++ printf("%08x: ", 0x7eb00000 + addr + 4*i);
++
++ printf("%08x", rpi->regs[(addr>>2)+i]);
++
++ if ((i%4)==3 || i+1 == num)
++ printf("\n");
++ else
++ printf(" ");
++ }
++}
++
++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
++ int i;
++
++ for (i=0; i<size>>2; i++)
++ {
++ if ((i%4)==0)
++ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
++
++ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
++
++ if ((i%4)==3 || i+1 == size>>2)
++ printf("\n");
++ else
++ printf(" ");
++ }
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Round a size up to the next value of the form 3 << n or 4 << n.
++// The result is always strictly greater than x, e.g. 256 -> 384,
++// 384 -> 512, 1000 -> 1024. Every size below 256 maps to 768 (3 << 8).
++// NOTE(review): av_log2 takes an unsigned int, so on targets where size_t
++// is wider than that the value of x is presumably truncated before the
++// log is taken - confirm if sizes above UINT_MAX can ever be passed
++static inline size_t round_up_size(const size_t x)
++{
++    /* Admit no size < 256 */
++    const unsigned int n = x < 256 ? 8 : av_log2(x) - 1;
++    // Shift in size_t rather than int: 4 << n overflows a signed int
++    // (undefined behaviour) once n reaches 29, i.e. for x >= 1.5GiB
++    const size_t three_n = (size_t)3 << n;
++
++    return x >= three_n ? (size_t)4 << n : three_n;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Scaling factors
++
++// Expand one scaling list into a full-size matrix at dst0.
++//
++// Parameters:
++//  sizeID    0: copy 16 bytes, 1: copy 64 bytes, 2: upsample the 8x8
++//            source to 16x16, anything else: upsample it to 32x32
++//  matrixID  unused
++//  dst0      output; 16 / 64 / 256 / 1024 bytes are written
++//  src0      input list (16 bytes for sizeID 0, otherwise 64)
++//  dc        stored in dst0[0] after upsampling; ignored for sizeID 0 & 1
++static void expand_scaling_list(
++    const unsigned int sizeID,
++    const unsigned int matrixID,
++    uint8_t * const dst0,
++    const uint8_t * const src0,
++    uint8_t dc)
++{
++    unsigned int shift;
++
++    switch (sizeID) {
++        case 0:
++            memcpy(dst0, src0, 16);
++            return;
++        case 1:
++            memcpy(dst0, src0, 64);
++            return;
++        case 2:
++            shift = 1;  // each source entry fills a 2x2 block
++            break;
++        default:
++            shift = 2;  // each source entry fills a 4x4 block
++            break;
++    }
++
++    {
++        const unsigned int dim = 8U << shift;
++
++        for (unsigned int row = 0; row != dim; ++row) {
++            for (unsigned int col = 0; col != dim; ++col)
++                dst0[row * dim + col] = src0[(row >> shift) * 8 + (col >> shift)];
++        }
++    }
++
++    dst0[0] = dc;
++}
++
++// Fill de->scaling_factors with the expanded scaling matrices for the
++// current picture, taking the lists from the PPS when it carries scaling
++// list data and from the SPS otherwise.
++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
++    // Byte offset of each expanded matrix within de->scaling_factors,
++    // indexed [sizeID][matrixID]
++    static const uint32_t scaling_factor_offsets[4][6] = {
++        // MID0 MID1 MID2 MID3 MID4 MID5
++        {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4)
++        {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8)
++        {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16)
++        {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32)
++
++    const ScalingList * sl;
++    unsigned int sid;
++    unsigned int mid;
++
++    if (s->ps.pps->scaling_list_data_present_flag)
++        sl = &s->ps.pps->scaling_list;
++    else
++        sl = &s->ps.sps->scaling_list;
++
++    // 4x4, 8x8 and 16x16: six matrices each; only 16x16 has a DC value
++    for (sid = 0; sid != 3; ++sid) {
++        for (mid = 0; mid != 6; ++mid) {
++            expand_scaling_list(sid, mid,
++                de->scaling_factors + scaling_factor_offsets[sid][mid],
++                sl->sl[sid][mid],
++                sid == 2 ? sl->sl_dc[0][mid] : 0);
++        }
++    }
++
++    // 32x32: two matrices only. ffmpeg keeps the second one at matrixID 3
++    // rather than 1, hence the step of 3
++    for (mid = 0; mid < 6; mid += 3) {
++        expand_scaling_list(3, mid,
++            de->scaling_factors + scaling_factor_offsets[3][mid],
++            sl->sl[3][mid],
++            sl->sl_dc[1][mid]);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
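++// Probabilities
++// prob_init holds one row of initial values per init_type (see WriteProb).
++// Each row is RPI_PROB_VALS (154) values followed by two zero bytes that
++// pad it to RPI_PROB_ARRAY_SIZE (156).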
++
++static const uint8_t prob_init[3][156] = {
++ {
++ 153, 200, 139, 141, 157, 154, 154, 154,
++ 154, 154, 184, 154, 154, 154, 184, 63,
++ 154, 154, 154, 154, 154, 154, 154, 154,
++ 154, 154, 154, 154, 154, 153, 138, 138,
++ 111, 141, 94, 138, 182, 154, 154, 154,
++ 140, 92, 137, 138, 140, 152, 138, 139,
++ 153, 74, 149, 92, 139, 107, 122, 152,
++ 140, 179, 166, 182, 140, 227, 122, 197,
++ 110, 110, 124, 125, 140, 153, 125, 127,
++ 140, 109, 111, 143, 127, 111, 79, 108,
++ 123, 63, 110, 110, 124, 125, 140, 153,
++ 125, 127, 140, 109, 111, 143, 127, 111,
++ 79, 108, 123, 63, 91, 171, 134, 141,
++ 138, 153, 136, 167, 152, 152, 139, 139,
++ 111, 111, 125, 110, 110, 94, 124, 108,
++ 124, 107, 125, 141, 179, 153, 125, 107,
++ 125, 141, 179, 153, 125, 107, 125, 141,
++ 179, 153, 125, 140, 139, 182, 182, 152,
++ 136, 152, 136, 153, 136, 139, 111, 136,
++ 139, 111, 0, 0, },
++ {
++ 153, 185, 107, 139, 126, 197, 185, 201,
++ 154, 149, 154, 139, 154, 154, 154, 152,
++ 110, 122, 95, 79, 63, 31, 31, 153,
++ 153, 168, 140, 198, 79, 124, 138, 94,
++ 153, 111, 149, 107, 167, 154, 154, 154,
++ 154, 196, 196, 167, 154, 152, 167, 182,
++ 182, 134, 149, 136, 153, 121, 136, 137,
++ 169, 194, 166, 167, 154, 167, 137, 182,
++ 125, 110, 94, 110, 95, 79, 125, 111,
++ 110, 78, 110, 111, 111, 95, 94, 108,
++ 123, 108, 125, 110, 94, 110, 95, 79,
++ 125, 111, 110, 78, 110, 111, 111, 95,
++ 94, 108, 123, 108, 121, 140, 61, 154,
++ 107, 167, 91, 122, 107, 167, 139, 139,
++ 155, 154, 139, 153, 139, 123, 123, 63,
++ 153, 166, 183, 140, 136, 153, 154, 166,
++ 183, 140, 136, 153, 154, 166, 183, 140,
++ 136, 153, 154, 170, 153, 123, 123, 107,
++ 121, 107, 121, 167, 151, 183, 140, 151,
++ 183, 140, 0, 0, },
++ {
++ 153, 160, 107, 139, 126, 197, 185, 201,
++ 154, 134, 154, 139, 154, 154, 183, 152,
++ 154, 137, 95, 79, 63, 31, 31, 153,
++ 153, 168, 169, 198, 79, 224, 167, 122,
++ 153, 111, 149, 92, 167, 154, 154, 154,
++ 154, 196, 167, 167, 154, 152, 167, 182,
++ 182, 134, 149, 136, 153, 121, 136, 122,
++ 169, 208, 166, 167, 154, 152, 167, 182,
++ 125, 110, 124, 110, 95, 94, 125, 111,
++ 111, 79, 125, 126, 111, 111, 79, 108,
++ 123, 93, 125, 110, 124, 110, 95, 94,
++ 125, 111, 111, 79, 125, 126, 111, 111,
++ 79, 108, 123, 93, 121, 140, 61, 154,
++ 107, 167, 91, 107, 107, 167, 139, 139,
++ 170, 154, 139, 153, 139, 123, 123, 63,
++ 124, 166, 183, 140, 136, 153, 154, 166,
++ 183, 140, 136, 153, 154, 166, 183, 140,
++ 136, 153, 154, 170, 153, 138, 138, 122,
++ 121, 122, 121, 167, 151, 183, 140, 151,
++ 183, 140, 0, 0, },
++};
++
++
++//////////////////////////////////////////////////////////////////////////////
++// Phase 1 command and bit FIFOs
++
++// ???? uint16_t addr - put in uint32_t
++// Append a register write to de->cmd_fifo.
++// When the FIFO is full its capacity is doubled first; a failed realloc
++// aborts via av_assert0.
++//
++// Parameters:
++//  addr  register offset to record
++//  data  value to record
++// Returns:
++//  index of the new entry within de->cmd_fifo
++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
++    struct RPI_CMD * entry;
++
++    if (de->cmd_len==de->cmd_max)
++        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
++
++#if TRACE_DEV
++    printf("[%02x] %x %x\n", de->cmd_len, addr, data);
++#endif
++
++    entry = de->cmd_fifo + de->cmd_len;
++    entry->addr = addr;
++    entry->data = data;
++
++    return de->cmd_len++;
++}
++
++// Append a data block reference to de->bit_fifo.
++// When the FIFO is full its capacity is doubled first; a failed realloc
++// aborts via av_assert0. The data at ptr is not copied.
++//
++// Parameters:
++//  len      length to record
++//  ptr      start of the data
++//  cmd_idx  command index to record alongside the data
++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
++    struct RPI_BIT * entry;
++
++    if (de->bit_len==de->bit_max)
++        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
++
++    entry = de->bit_fifo + de->bit_len;
++    entry->cmd = cmd_idx;
++    entry->ptr = ptr;
++    entry->len = len;
++
++    de->bit_len++;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write probability and scaling factor memories
++
++#if 0
++static void WriteProb(dec_env_t * const de) {
++ int i;
++ const uint8_t *p = (uint8_t *) &de->probabilities;
++ for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
++ p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
++}
++#endif
++
++// Build the initial probability table for the current slice and queue it
++// as register writes.
++// Each of the first RPI_PROB_VALS entries of the prob_init row selected by
++// init_type is combined with the slice QP (clipped to 0..51) to give one
++// byte. The table is zero padded to RPI_PROB_ARRAY_SIZE and queued four
++// bytes per 32-bit word (lowest index in bits 0-7) at offsets 0x1000 + i.
++static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
++ uint8_t dst[RPI_PROB_ARRAY_SIZE];
++
++ // Row is slice_type + 1 when cabac_init_flag is set on a non-I slice,
++ // otherwise 2 - slice_type.
++ // NOTE(review): assumes the HEVC_SLICE_* values keep both results
++ // within 0..2 - confirm against hevc.h
++ const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
++ s->sh.slice_type + 1 : 2 - s->sh.slice_type;
++ const uint8_t * p = prob_init[init_type];
++ const int q = av_clip(s->sh.slice_qp, 0, 51);
++ unsigned int i;
++
++ for (i = 0; i < RPI_PROB_VALS; i++) {
++ int init_value = p[i];
++ // High nibble selects the slope m, low nibble the offset n
++ int m = (init_value >> 4) * 5 - 45;
++ int n = ((init_value & 15) << 3) - 16;
++ int pre = 2 * (((m * q) >> 4) + n) - 127;
++
++ // Negative values become ~pre; this relies on >> of a negative int
++ // being an arithmetic shift
++ pre ^= pre >> 31;
++ // Cap at 124 while keeping the low bit (so 124 or 125)
++ if (pre > 124)
++ pre = 124 + (pre & 1);
++ dst[i] = pre;
++ }
++ // Zero the padding up to a multiple of 4 bytes
++ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
++ dst[i] = 0;
++ }
++
++ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
++ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
++
++}
++
++
++// Queue the contents of de->scaling_factors as register writes.
++// Each group of four consecutive bytes is packed into one 32-bit word
++// (first byte in bits 0-7) and queued at offset 0x2000 + byte index.
++static void WriteScalingFactors(dec_env_t * const de) {
++    const uint8_t * const p = de->scaling_factors;
++    unsigned int i;
++
++    for (i = 0; i < NUM_SCALING_FACTORS; i += 4) {
++        // Widen before shifting: a uint8_t promotes to a signed int, so
++        // shifting a value >= 128 left by 24 is undefined behaviour
++        const uint32_t word =
++            (uint32_t)p[i] |
++            ((uint32_t)p[i + 1] << 8) |
++            ((uint32_t)p[i + 2] << 16) |
++            ((uint32_t)p[i + 3] << 24);
++
++        p1_apb_write(de, 0x2000 + i, word);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Find the index of the tile that contains CTB coordinate ctb.
++//
++// Parameters:
++//  ctb  coordinate to look up
++//  bd   tile boundaries: num+1 elements with bd[0]=0 (see hevc_ps.c);
++//       tile i starts at bd[i]
++//  num  number of tiles described by bd
++// Returns:
++//  index i of the last tile with bd[i] <= ctb, never more than num-1: a
++//  ctb at or beyond bd[num] is reported as being in the last tile
++static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
++    int i;
++
++    // Bounded by num so that an out of range ctb cannot walk off the end
++    // of bd[]
++    for (i = 1; i < num && ctb >= bd[i]; i++)
++        /* Loop */;
++
++    return i-1;
++}
++
++// Width or height in pixels of the CTB at coordinate ctb.
++// CTBs before bd[num-1] are always full size; from bd[num-1] onwards the
++// result is the remainder of width / ctb_size, or ctb_size when width is
++// an exact multiple.
++//
++// Parameters:
++//  ctb       coordinate of the CTB
++//  ctb_size  full CTB size
++//  width     picture dimension along the same axis
++//  bd, num   tile boundaries and tile count
++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
++    int partial;
++
++    if (ctb < bd[num-1])
++        return ctb_size;
++
++    partial = width % ctb_size;
++    return partial != 0 ? partial : ctb_size;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Handle PU and COEFF stream overflow
++
++
++// Returns:
++// -2 Other error
++// -1 Out of coeff space
++// 0 OK
++// 1 Out of PU space
++
++// Classify the outcome of phase 1 from the h/w status registers.
++// de is unused.
++// Returns:
++//  0   OK (RPI_CFSTATUS equals RPI_CFNUM)
++//  -1  RPI_STATUS bit 3 set: out of coeff space
++//  1   RPI_STATUS bit 4 set: out of PU space
++//  -2  any other error
++static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
++    enum {
++        STATUS_COEFF_FULL = 8,
++        STATUS_PU_FULL    = 0x10
++    };
++    uint32_t status;
++
++    // this is the definition of successful completion of phase 1
++    // it assures that status register is zero and all blocks in each tile have completed
++    if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
++        return 0;
++
++    status = apb_read(rpi, RPI_STATUS);
++
++    // Coeff overflow takes priority if both bits are set
++    if (status & STATUS_COEFF_FULL)
++        return -1;
++
++    return (status & STATUS_PU_FULL) ? 1 : -2;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write STATUS register with expected end CTU address of previous slice
++
++// Queue an RPI_STATUS write giving the position of the CTB that precedes
++// ctb_addr_ts in tile-scan order: bit 0 set, column from bit 5, row from
++// bit 18.
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
++    // Raster-scan address of the previous CTB
++    const int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
++    const int last_x = prev_rs % de->PicWidthInCtbsY;
++    const int last_y = prev_rs / de->PicWidthInCtbsY;
++
++    p1_apb_write(de, RPI_STATUS, 1 + (last_x << 5) + (last_y << 18));
++}
++
++// Queue the four register writes that pause decode at CTB row ctb_row:
++// RPI_STATUS, RPI_TRANSFER (PROB_BACKUP), RPI_MODE and RPI_CONTROL.
++// RPI_MODE gets 0x70000 for the last CTB row of the picture and 0x30000
++// for every other row.
++static void wpp_pause(dec_env_t * const de, int ctb_row) {
++    const int is_last_row = (ctb_row == de->PicHeightInCtbsY - 1);
++    const uint32_t mode = is_last_row ? 0x70000 : 0x30000;
++
++    p1_apb_write(de, RPI_STATUS, (ctb_row << 18) + 0x25);
++    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++    p1_apb_write(de, RPI_MODE, mode);
++    p1_apb_write(de, RPI_CONTROL, (ctb_row << 16) + 2);
++}
++
++// Wavefront variant of end_previous_slice: queues the RPI_STATUS write for
++// the CTB preceding ctb_addr_ts, optionally preceded by a wpp_pause() on
++// that CTB's row and optionally followed by a PROB_BACKUP transfer.
++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++ const HEVCPPS *pps = s->ps.pps;
++ // Position of the first CTB of the new slice
++ int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++ int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++ // Position of the last CTB before ctb_addr_ts in tile-scan order
++ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
++ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++ // Pause only for pictures wider than 2 CTBs whose last entry point was
++ // in column 0 or 1, and only if the new slice starts on a later row
++ // than that entry point or beyond column 2
++ if (de->wpp_entry_x<2 && (de->wpp_entry_y<new_y || new_x>2) && de->PicWidthInCtbsY>2)
++ wpp_pause(de, last_y);
++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++ // && binds tighter than ||, so this reads:
++ // new_x==2 || (PicWidthInCtbsY==2 && wpp_entry_y<new_y)
++ if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y)
++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Queue the parameter registers for a new slice segment: RPI_SPS0,
++// RPI_SPS1 and RPI_PPS (packed from the active SPS / PPS / slice header),
++// the scaling factor table when scaling lists are enabled, and finally
++// RPI_SLICESTART.
++static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s)
++{
++ const HEVCSPS *sps = s->ps.sps;
++ const HEVCPPS *pps = s->ps.pps;
++
++ // Eight fields at 4-bit intervals. bit_depth is deliberately written
++ // twice (bits 16 and 20).
++ // NOTE(review): presumably luma and chroma depth - confirm
++ p1_apb_write(de, RPI_SPS0,
++ (sps->log2_min_cb_size << 0) +
++ (sps->log2_ctb_size << 4) +
++ (sps->log2_min_tb_size << 8) +
++ (sps->log2_max_trafo_size << 12) +
++ (sps->bit_depth << 16) +
++ (sps->bit_depth << 20) +
++ (sps->max_transform_hierarchy_depth_intra << 24) +
++ (sps->max_transform_hierarchy_depth_inter << 28));
++
++ // The chroma format term parses as
++ // separate_colour_plane_flag ? 0 : (chroma_format_idc << 16)
++ // because << binds tighter than ?:
++ p1_apb_write(de, RPI_SPS1,
++ (sps->pcm.bit_depth << 0) +
++ (sps->pcm.bit_depth_chroma << 4) +
++ (sps->pcm.log2_min_pcm_cb_size << 8) +
++ (sps->pcm.log2_max_pcm_cb_size << 12) +
++ (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
++ (sps->amp_enabled_flag << 18) +
++ (sps->pcm_enabled_flag << 19) +
++ (sps->scaling_list_enable_flag << 20) +
++ (sps->sps_strong_intra_smoothing_enable_flag << 21));
++
++ // Bits 0-3 hold log2_ctb_size - diff_cu_qp_delta_depth (the subtraction
++ // binds tighter than <<). The two chroma QP offsets are PPS + slice
++ // offsets, each reduced to 8 bits.
++ p1_apb_write(de, RPI_PPS,
++ (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) +
++ (pps->cu_qp_delta_enabled_flag << 4) +
++ (pps->transquant_bypass_enable_flag << 5) +
++ (pps->transform_skip_enabled_flag << 6) +
++ (pps->sign_data_hiding_flag << 7) +
++ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) +
++ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
++ (pps->constrained_intra_pred_flag << 24));
++
++ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
++
++ // A dependent slice segment reuses the slice start recorded by the
++ // previous non-dependent segment: column in bits 0-15, row from bit 16
++ if (!s->sh.dependent_slice_segment_flag) {
++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
++ }
++
++ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void write_slice(dec_env_t * const de, const HEVCContext * const s,
++ const unsigned int slice_w, const unsigned int slice_h) {
++ const int is_b = (s->sh.slice_type==HEVC_SLICE_B);
++ const int is_inter = is_b || (s->sh.slice_type==HEVC_SLICE_P);
++ // Fields written for every slice type: type, SAO enables and slice size
++ uint32_t word = (s->sh.slice_type << 12)
++ + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
++ + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
++ + (slice_w << 17)
++ + (slice_h << 24);
++ // P and B slices additionally carry the merge candidate count and both ref counts
++ if (is_inter)
++ word |= (s->sh.max_num_merge_cand << 0) + (s->sh.nb_refs[L0] << 4) + (s->sh.nb_refs[L1] << 8);
++ // mvd_l1_zero_flag is only added for B slices
++ if (is_b)
++ word |= s->sh.mvd_l1_zero_flag<<16;
++ p1_apb_write(de, RPI_SLICE, word);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { // Write the registers for one wavefront entry point at ctb_addr_ts
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++
++ int ctb_size = 1<<sps->log2_ctb_size;
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ // The entry position is also stored in de (wpp_entry_x / wpp_entry_y) for later reads
++ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
++ // End point written below is the last CTB column of the entry row
++ int endx = de->PicWidthInCtbsY-1;
++ int endy = ctb_row;
++
++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++ p1_apb_write(de, RPI_TILESTART, 0);
++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++ // RPI_BEGINTILEEND is only written when the caller passes do_bte != 0
++ if (do_bte)
++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++ // Height passed on is slice_h for the last CTB row, otherwise ctb_size
++ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
++
++ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++ // Mode word is 0x60001 on the last CTB row and 0x20001 on every other row
++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { // Write the registers for one tile-mode entry point at ctb_addr_ts
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++ // Raster-scan CTB position of the entry point
++ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
++ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
++ // Tile indices for that position, looked up via the column/row boundary tables
++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++ // Last CTB column/row of the tile: one before the next boundary entry
++ int endx = pps->col_bd[tile_x+1] - 1;
++ int endy = pps->row_bd[tile_y+1] - 1;
++
++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++ // RPI_BEGINTILEEND is only written when the caller passes do_bte != 0
++ if (do_bte)
++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++ write_slice(de, s, slice_w, slice_h);
++
++ if (resetQPY)
++ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++ // Mode word: low 16 bits all set, bit 17 = last tile column, bit 18 = last tile row
++ p1_apb_write(de, RPI_MODE, (0xFFFF << 0)
++ + (0x0 << 16)
++ + ((tile_x==pps->num_tile_columns-1) << 17)
++ + ((tile_y==pps->num_tile_rows-1) << 18));
++
++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Doesn't attempt to remove from context as we should only do this at the end
++// of time or on create error
++static void
++dec_env_delete(dec_env_t * const de)
++{
++ // The FIFOs come from malloc() in dec_env_new(), so release them with
++ // free() rather than av_freep(); free(NULL) is a no-op on the error path
++ free(de->cmd_fifo);
++ free(de->bit_fifo);
++
++ sem_destroy(&de->phase_wait);
++ av_free(de); // de itself was allocated with av_mallocz()
++}
++
++static dec_env_t *
++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++ dec_env_t * const de = av_mallocz(sizeof(*de));
++ int slot;
++
++ if (!de)
++ return NULL;
++
++ de->avctx = avctx;
++ de->phase_no = RPIVID_PHASE_NEW;
++
++ sem_init(&de->phase_wait, 0, 0);
++
++ // Both FIFOs start with room for 1024 entries
++ de->cmd_max = 1024;
++ de->cmd_fifo = malloc(de->cmd_max * sizeof(struct RPI_CMD));
++ if (!de->cmd_fifo)
++ goto fail;
++
++ de->bit_max = 1024;
++ de->bit_fifo = malloc(de->bit_max * sizeof(struct RPI_BIT));
++ if (!de->bit_fifo)
++ goto fail;
++
++ // Claim the first empty slot in rpi->dec_envs; phase_lock is borrowed to guard the table
++ pthread_mutex_lock(&rpi->phase_lock);
++ slot = 0;
++ while (slot != avctx->thread_count && rpi->dec_envs[slot] != NULL)
++ ++slot;
++ if (slot != avctx->thread_count)
++ rpi->dec_envs[slot] = de;
++ pthread_mutex_unlock(&rpi->phase_lock);
++
++ if (slot != avctx->thread_count)
++ return de;
++ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
++fail:
++ dec_env_delete(de);
++ return NULL;
++}
++
++
++static dec_env_t *
++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
++{ // Take a reference on rpi and return the env for avctx (created on first use); NULL on failure
++ dec_env_t * de = NULL;
++ const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
++
++ if (ref_count <= 0) {
++ atomic_fetch_sub(&rpi->ref_count, 1); // Already dead - undo our increment so later calls still see it dead
++ av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
++ return NULL;
++ }
++ // An empty slot before any match means no env exists for this avctx yet
++ for (int i = 0; i != avctx->thread_count; ++i) {
++ if (rpi->dec_envs[i] == NULL) {
++ de = dec_env_new(avctx, rpi);
++ break;
++ }
++ if (rpi->dec_envs[i]->avctx == avctx) {
++ de = rpi->dec_envs[i];
++ break;
++ }
++ }
++ if (de == NULL && atomic_fetch_sub(&rpi->ref_count, 1) == 1) // Callers do not release on NULL, so drop the ref here
++ sem_post(&rpi->ref_zero);
++ return de;
++}
++
++// Call at end of fn
++// Used to ensure we aren't in a worker thread when killed
++static void
++dec_env_release(RPI_T * const rpi, dec_env_t * const de)
++{
++ // fetch_sub returns the count before the decrement: 1 => the last reference just went
++ if (atomic_fetch_sub(&rpi->ref_count, 1) != 1)
++ return;
++ sem_post(&rpi->ref_zero); // de itself is not used by this function
++}
++
++//----------------------------------------------------------------------------
++
++// Wait for a slot in the given phase
++// Any error return is probably fatal
++static int
++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{ // Block until de is next in decode order for phase_no; returns 0, or AVERROR(errno) if sem_wait fails
++ int needs_wait = 0;
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ // Not our turn unless the previous decode_order has already posted this phase
++ pthread_mutex_lock(&rpi->phase_lock);
++ if (p->last_order + 1 != de->decode_order) {
++ de->phase_wait_q_next = p->q; // push de onto the head of this phase's wait list
++ p->q = de;
++ needs_wait = 1;
++ }
++ pthread_mutex_unlock(&rpi->phase_lock);
++ // Sleep outside the lock; post_phase() posts de->phase_wait when our turn comes
++ if (needs_wait) {
++ while (sem_wait(&de->phase_wait) == -1)
++ {
++ int err;
++ if ((err = errno) != EINTR) // EINTR: just retry the wait
++ return AVERROR(err); // NOTE(review): de is still linked on p->q on this path
++ }
++ }
++
++ de->phase_no = phase_no;
++ return 0;
++}
++
++static void
++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{ // Mark de as done with phase_no and wake the queued env that is next in decode order, if any
++ dec_env_t * next_de = NULL;
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ dec_env_t ** q = &p->q;
++
++ pthread_mutex_lock(&rpi->phase_lock);
++
++ p->last_order = de->decode_order;
++ while (*q != NULL) {
++ dec_env_t * const t_de = *q;
++
++ if (t_de->decode_order == p->last_order + 1) {
++ // This is the env that follows us in decode order - remove from Q
++ *q = t_de->phase_wait_q_next;
++ t_de->phase_wait_q_next = NULL; // Tidy
++ next_de = t_de;
++ break;
++ }
++ q = &t_de->phase_wait_q_next;
++ }
++
++ pthread_mutex_unlock(&rpi->phase_lock);
++ // The semaphore is posted only after the lock has been dropped
++ if (next_de != NULL)
++ sem_post(&next_de->phase_wait);
++}
++
++// Wait & signal stuff s.t. threads in other phases can continue
++static void
++abort_phases(RPI_T * const rpi, dec_env_t * const de)
++{
++ int phase = de->phase_no; // phase currently recorded in de; everything after it is stepped through
++ while (++phase < RPIVID_PHASE_NEW) {
++ wait_phase(rpi, de, phase); // take our turn in this phase...
++ post_phase(rpi, de, phase); // ...and hand it straight on without doing any work
++ }
++ de->phase_no = RPIVID_PHASE_NEW;
++}
++
++// Start timing for phase
++// Stats only - no actual effect
++static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
++{ // With OPT_PHASE_TIMING: add the time since the last stamp to time_out_phase and restamp; otherwise empty
++#if OPT_PHASE_TIMING
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ const int64_t now = tus64();
++ if (p->phase_time != 0) // presumably 0 means "never stamped" - TODO confirm
++ p->time_out_phase += now - p->phase_time;
++ p->phase_time = now;
++#endif
++}
++
++#if OPT_PHASE_TIMING
++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
++{ // Sum the avg_n most recent time_stash entries and return the index of the first threshold the sum is below
++ uint64_t tsum = 0;
++ unsigned int i;
++ for (i = 0; i != avg_n; ++i)
++ tsum += p->time_stash[(p->i3 - i) & 15]; // walk backwards from slot i3, wrapping within 16 slots
++ for (i = 0; i != 9; ++i) {
++ if (time_thresholds[i] * 1000 * avg_n > tsum) // threshold scaled by 1000 and by the sample count
++ break;
++ }
++ return i; // NOTE(review): can be 9 if no threshold matched - confirm bin arrays and time_thresholds are large enough
++}
++#endif
++
++// End timing for phase
++// Stats only - no actual effect
++static inline void tend_phase(RPI_T * const rpi, const int phase_no)
++{ // With OPT_PHASE_TIMING: account the time since the last stamp as in-phase time and update the histograms
++#if OPT_PHASE_TIMING
++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++ const uint64_t now = tus64();
++ const uint64_t in_time = now - p->phase_time;
++ // Accumulate, restamp, and store the sample in slot i3 of the 16-slot stash
++ p->time_in_phase += in_time;
++ p->phase_time = now;
++ p->time_stash[p->i3] = in_time;
++ if (in_time > p->max_phase_time) {
++ p->max_phase_time = in_time;
++ p->max_time_decode_order = p->last_order; // records last_order as it stands when the new maximum is seen
++ }
++ ++p->time_bins[tavg_bin_phase(p, 1)]; // histogram of this sample alone
++ ++p->time_bins3[tavg_bin_phase(p, 3)]; // histogram over the latest 3 samples
++ ++p->time_bins5[tavg_bin_phase(p, 5)]; // histogram over the latest 5 samples
++
++ p->i3 = (p->i3 + 1) & 15;
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Start frame
++
++static int rpi_hevc_start_frame(
++ AVCodecContext * avctx,
++ const uint8_t *buffer,
++ uint32_t size) {
++ // Begin a frame: assign its decode order, check the state machine and reset the cmd/bit FIFOs
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++ const HEVCContext * const s = avctx->priv_data;
++ const HEVCSPS * const sps = s->ps.sps;
++ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return -1;
++ }
++
++ de->phase_no = RPIVID_PHASE_START;
++ de->decode_order = ++rpi->decode_order; // *** atomic?
++
++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++ dec_env_release(rpi, de); // Balance the reference taken by dec_env_get() before bailing out
++ return -1;
++ }
++ de->state = RPIVID_DECODE_START;
++ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15
++ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17
++ de->bit_len = 0;
++ de->cmd_len = 0;
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p]\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Slice messages
++
++static void msg_slice(dec_env_t * const de, const uint16_t msg) { // Append msg to de->slice_msgs
++ de->slice_msgs[de->num_slice_msgs++] = msg; // NOTE(review): no bounds check here - confirm slice_msgs capacity
++}
++
++static void program_slicecmds(dec_env_t * const de, const int sliceid) {
++ int n = 0;
++ // Header word: message count in the low bits, slice id from bit 8 upwards
++ p1_apb_write(de, RPI_SLICECMDS, (sliceid<<8)+de->num_slice_msgs);
++ for (; n < de->num_slice_msgs; ++n) // one write per message, addresses 4 apart starting at 0x4000
++ p1_apb_write(de, 4*n+0x4000, 0xffff & de->slice_msgs[n]);
++}
++
++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { // Rebuild de->slice_msgs for this slice and set de->dpbno_col
++ const HEVCSPS * const sps = s->ps.sps;
++ const HEVCPPS * const pps = s->ps.pps;
++ const SliceHeader *sh = &s->sh;
++
++ int weightedPredFlag, i, rIdx;
++ uint16_t cmd_slice;
++ unsigned int collocated_from_l0_flag;
++ // Start from an empty message list; low bits of cmd_slice encode the slice type as I=1, P=2, B=3
++ de->num_slice_msgs=0;
++ de->dpbno_col = 0;
++ cmd_slice = 0;
++ if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
++ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
++ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
++ // Non-intra slices add the L0 / L1 reference counts at bits 2 and 6
++ if (sh->slice_type!=HEVC_SLICE_I) {
++ cmd_slice += sh->nb_refs[L0]<<2;
++ cmd_slice += sh->nb_refs[L1]<<6;
++ }
++
++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B)
++ cmd_slice |= sh->max_num_merge_cand<<11;
++ // 0 when slice temporal MVP is off; for B follows collocated_list; otherwise 1 only for P
++ collocated_from_l0_flag =
++ !sh->slice_temporal_mvp_enabled_flag ?
++ 0 :
++ sh->slice_type == HEVC_SLICE_B ?
++ (sh->collocated_list == L0) :
++ (sh->slice_type==HEVC_SLICE_P);
++ cmd_slice |= collocated_from_l0_flag<<14;
++
++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
++
++ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
++ for(i=L0; i<=L1; i++) {
++ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++ HEVCFrame *c = s->ref; // CurrentPicture
++ if (c->poc < f->poc) NoBackwardPredFlag = 0; // a reference with a larger POC than the current picture clears it
++ }
++ }
++
++ if (sps->sps_temporal_mvp_enabled_flag)
++ {
++ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
++ s->ref->refPicList + 0 :
++ s->ref->refPicList + 1;
++ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; // index of the collocated picture within s->DPB
++ }
++
++ cmd_slice += NoBackwardPredFlag<<10;
++ msg_slice(de, cmd_slice);
++
++ // Write reference picture descriptions
++ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
++
++ for(i=L0; i<=L1; i++)
++ for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++ HEVCFrame *c = s->ref; // CurrentPicture
++ int pic = f - s->DPB;
++ // Make sure pictures are in range 0 to 15
++ int adjusted_pic = f<c? pic : pic-1; // entries after the current picture in the DPB shift down by one
++ int lt = s->ref->refPicList[i].isLongTerm[rIdx];
++ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); // same flag goes into bits 5 and 6
++ msg_slice(de, f->poc);
++ if (weightedPredFlag) { // six extra messages: weight and offset for luma, then chroma [0] and chroma [1]
++ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3));
++ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff);
++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
++ }
++ }
++ }
++ else
++ msg_slice(de, cmd_slice); // I slice: the command word alone, no reference descriptions
++
++ msg_slice(de, ((sh->beta_offset/2)&15)
++ + (((sh->tc_offset/2)&15) << 4)
++ + (sh->disable_deblocking_filter_flag << 8)
++ + (sh->slice_loop_filter_across_slices_enabled_flag << 9)
++ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK
++
++ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
++}
++
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { // Abandon the current frame: step through any remaining phases and reset the state to NEW
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return;
++ }
++ // The state only selects what gets logged; the abort below runs in every case
++ switch (de->state) {
++ case RPIVID_DECODE_NEW:
++ case RPIVID_DECODE_END:
++ // Expected transition
++ break;
++
++ case RPIVID_DECODE_SLICE:
++ // Error transition
++ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
++ break;
++
++ case RPIVID_DECODE_START:
++ default:
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++ break;
++ }
++
++ abort_phases(rpi, de);
++ de->state = RPIVID_DECODE_NEW;
++
++ dec_env_release(rpi, de);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// End frame
++
++static int rpi_hevc_end_frame(AVCodecContext * const avctx) { // Run the queued commands for this frame through phases 0-2; returns 0 or a negative AVERROR
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ const HEVCContext * const s = avctx->priv_data;
++ const HEVCPPS * const pps = s->ps.pps;
++ const HEVCSPS * const sps = s->ps.sps;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++ AVFrame * const f = s->ref->frame;
++ const unsigned int dpbno_cur = s->ref - s->DPB;
++ vid_vc_addr_t cmds_vc;
++ vid_vc_addr_t pu_base_vc;
++ unsigned int pu_stride;
++ vid_vc_addr_t coeff_base_vc;
++ unsigned int coeff_stride;
++ unsigned int i;
++ int rv = 0;
++ int status = 0;
++ int coeffbuf_sem_claimed = 0;
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return AVERROR_BUG; // Should never happen
++ }
++
++ if (de->state != RPIVID_DECODE_SLICE) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++ rv = AVERROR_UNKNOWN;
++ goto fail;
++ }
++ de->state = RPIVID_DECODE_END;
++
++ // End of command compilation
++ {
++ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
++ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
++ if (pps->entropy_coding_sync_enabled_flag) {
++ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
++ wpp_pause(de, last_y);
++ }
++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++ }
++
++ // Phase 0 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 0);
++ rpi_sem_wait(&rpi->bitbuf_sem);
++ tstart_phase(rpi, 0);
++
++ // Copy cmds & bits into gpu side buffer
++ // Layout: CMDS, BITS
++ {
++ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
++ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
++ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
++
++ uint8_t * p = armbase + rnd64(cmd_bytes);
++ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
++
++ cmds_vc = vcbase;
++ if (p > eobits) status = -1; // The command block alone must already fit in the buffer
++ // Copy all the bits & update bitstream cmds to point at the right bits
++ for (i = 0; status == 0 && i < de->bit_len; ++i)
++ {
++ const unsigned int seg_len = de->bit_fifo[i].len;
++
++ if (p + seg_len > eobits) {
++ status = -1;
++ break;
++ }
++
++ memcpy(p, de->bit_fifo[i].ptr, seg_len);
++ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
++
++ p += rnd64(seg_len);
++ }
++ if (status == 0) // On overflow the buffer is never submitted, so skip the command copy
++ memcpy(armbase, de->cmd_fifo, cmd_bytes);
++ }
++
++ if (status == 0)
++ {
++ if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
++ rpi->bitbuf_no = 0;
++ }
++ else
++ {
++ sem_post(&rpi->bitbuf_sem);
++ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
++ rv = AVERROR_BUFFER_TOO_SMALL;
++ }
++
++ tend_phase(rpi, 0);
++ post_phase(rpi, de, 0);
++
++ if (status < 0)
++ goto fail;
++
++ // Phase 1 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 1);
++ rpi_sem_wait(&rpi->coeffbuf_sem);
++ coeffbuf_sem_claimed = 1;
++ tstart_phase(rpi, 1);
++
++ status = 0;
++ for (;;)
++ {
++ // (Re-)allocate PU/COEFF stream space
++ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
++ unsigned int pu_size;
++
++ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
++ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
++ pu_size = pu_stride * de->PicHeightInCtbsY;
++
++ if (pu_size >= total_size || status == -1) {
++ GPU_MEM_PTR_T newbuf;
++
++ if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
++ status = -1;
++ break;
++ }
++ gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
++ rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
++ status = 0;
++ continue;
++ }
++
++ // Allocate all remaining space to coeff
++ coeff_base_vc = pu_base_vc + pu_size;
++ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64
++
++ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
++ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
++ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
++ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
++
++ // Trigger command FIFO
++ apb_write(rpi, RPI_CFNUM, de->cmd_len);
++#if TRACE_DEV && 0
++ apb_dump_regs(rpi, 0x0, 32);
++ apb_dump_regs(rpi, 0x8000, 24);
++ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
++#endif
++ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
++
++ int_wait(rpi, 1);
++
++ status = check_status(rpi, de);
++
++ if (status == -1)
++ continue;
++ else if (status != 1)
++ break;
++
++ // Status 1 means out of PU space so try again with more
++ // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
++ rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
++ }
++
++ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we
++ // may reuse a live buffer when we kick the coeff sem
++ if (status == 0)
++ {
++ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
++ rpi->coeffbuf_no = 0;
++ }
++ else
++ {
++ if (status == -1)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
++ rv = AVERROR_BUFFER_TOO_SMALL;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
++ rv = AVERROR_INVALIDDATA;
++ }
++ }
++
++ tend_phase(rpi, 1);
++ sem_post(&rpi->bitbuf_sem);
++ post_phase(rpi, de, 1);
++
++ if (status != 0)
++ goto fail;
++
++ // Phase 2 ---------------------------------------------------------------
++
++ wait_phase(rpi, de, 2);
++
++ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
++ {
++ // As we are in phase 2 already here we don't need to worry about
++ // coeffbuf_no despite the early exit
++ post_phase(rpi, de, 2);
++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
++ goto fail;
++ }
++
++ tstart_phase(rpi, 2);
++
++ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
++ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
++ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
++ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
++
++ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
++ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
++ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
++ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
++
++ // Keep the last thing we resolved as fallback for any ref we fail to
++ // resolve. As a final fallback use our current frame. The pels might
++ // not be there yet but at least the memory is valid.
++ //
++ // Attempt to resolve the entire DPB - we could note what we have used
++ // in ref lists but probably simpler and more reliable to set the whole thing
++ {
++ AVFrame * fallback_frame = f;
++ for (i = 0; i != 16; ++i) {
++ // Avoid current frame
++ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
++ AVFrame * fr = hevc_fr->frame;
++
++ if (fr != NULL &&
++ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
++ {
++ fallback_frame = fr;
++ }
++ else
++ {
++ fr = fallback_frame;
++ }
++
++ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
++ apb_write(rpi, 0x9004+16*i, 0);
++ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
++ apb_write(rpi, 0x900C+16*i, 0);
++ }
++ }
++
++ apb_write(rpi, RPI_CONFIG2,
++ (sps->bit_depth << 0) // BitDepthY
++ + (sps->bit_depth << 4) // BitDepthC
++ + ((sps->bit_depth>8) << 8) // BitDepthY
++ + ((sps->bit_depth>8) << 9) // BitDepthC
++ + (sps->log2_ctb_size <<10)
++ + (pps->constrained_intra_pred_flag <<13)
++ + (sps->sps_strong_intra_smoothing_enable_flag<<14)
++ + (sps->sps_temporal_mvp_enabled_flag <<15)
++ + (pps->log2_parallel_merge_level <<16)
++ + (s->sh.slice_temporal_mvp_enabled_flag <<19)
++ + (sps->pcm.loop_filter_disable_flag <<20)
++ + ((pps->cb_qp_offset&31) <<21)
++ + ((pps->cr_qp_offset&31) <<26));
++
++ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
++ apb_write(rpi, RPI_CURRPOC, s->poc);
++
++ // collocated reads/writes
++ if (sps->sps_temporal_mvp_enabled_flag) {
++ av_assert0(de->dpbno_col < RPIVID_COL_PICS);
++ av_assert0(dpbno_cur < RPIVID_COL_PICS);
++
++ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
++ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride);
++ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
++ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
++ }
++
++#if TRACE_DEV && 0
++ apb_dump_regs(rpi, 0x0, 32);
++ apb_dump_regs(rpi, 0x8000, 24);
++#endif
++
++ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
++ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
++
++ int_wait(rpi, 2);
++
++ tend_phase(rpi, 2);
++ coeffbuf_sem_claimed = 0;
++ sem_post(&rpi->coeffbuf_sem);
++ // Set valid here to avoid race in resolving in any pending phase 2
++ av_rpi_zc_set_valid_frame(f);
++
++ post_phase(rpi, de, 2);
++
++ // Flush frame for CPU access
++ // Arguably the best place would be at the start of phase 2 but here
++ // will overlap with the wait
++ //
++ // * Even better would be to have better lock/unlock control in ZC for external access
++ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached
++ {
++ rpi_cache_buf_t cbuf;
++ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
++ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++ rpi_cache_flush_finish(fe);
++ }
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p] OK\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return 0;
++
++fail:
++ av_rpi_zc_set_broken_frame(f);
++ if (coeffbuf_sem_claimed)
++ sem_post(&rpi->coeffbuf_sem);
++ abort_phases(rpi, de); // Dummy any unresolved phases
++
++#if TRACE_ENTRY
++ printf(">>> %s[%p] FAIL\n", __func__, de);
++#endif
++
++ dec_env_release(rpi, de);
++ return rv;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++
++#if TRACE_DEV
++static void dump_data(const uint8_t * p, size_t len) // Hex dump of len bytes at p, 16 per row, each row prefixed with its offset
++{
++ size_t i;
++ for (i = 0; i < len; i += 16) {
++ size_t j;
++ printf("%04zx", i); // %zx: i is a size_t
++ for (j = 0; j != 16 && i + j < len; ++j) { // stop at len so a short final row never reads past the buffer
++ printf("%c%02x", j == 8 ? '-' : ' ', p[i+j]); // '-' marks the middle of the row (column 8)
++ }
++ printf("\n");
++ }
++}
++#endif
++
++#if OPT_EMU
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{
++ // Step b over idx bytes, also skipping any 0x03 that follows two or more zero bytes
++ unsigned int zero_run = 0;
++ for (; idx != 0; --idx) {
++ const uint8_t cur = *b++;
++ if (cur != 0) {
++ zero_run = 0;
++ continue;
++ }
++ if (++zero_run >= 2 && *b == 3) {
++ ++b; // the skipped byte does not count towards idx
++ zero_run = 0;
++ }
++ }
++ return b;
++}
++#endif
++
++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { // Queue the slice's remaining bitstream bytes plus the RPI_BF* commands that describe them
++ const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
++ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
++ const GetBitContext *gb = &s->HEVClc->gb;
++ // OPT_EMU: take the bytes from the NAL buffer saved in de; otherwise from the bit reader's own buffer
++#if OPT_EMU
++ const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
++ const int len = de->nal_size - (ptr - de->nal_buffer);
++#else
++ const int len = 1 + gb->size_in_bits/8 - gb->index/8;
++ const void *ptr = &gb->buffer[gb->index/8];
++#endif
++
++#if TRACE_DEV
++ printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
++ dump_data(de->nal_buffer, 128);
++#endif
++ // The RPI_BFBASE write's return value is handed to p1_axi_write - presumably the command slot to patch later; TODO confirm
++ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
++ p1_apb_write(de, RPI_BFNUM, len);
++ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
++ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
++{ // Queue all commands for one slice segment in wavefront mode, including one entry point per entry-point offset
++ const HEVCPPS * const pps = s->ps.pps;
++
++ int i, resetQPY=1;
++ int indep = !s->sh.dependent_slice_segment_flag;
++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++ // Any slice other than the first in the picture must first close the previous one
++ if (ctb_addr_ts)
++ wpp_end_previous_slice(de, s, ctb_addr_ts);
++ pre_slice_decode(de, s);
++ WriteBitstream(de, s);
++ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
++ WriteProb(de, s);
++ else if (ctb_col==0)
++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++ else
++ resetQPY=0; // dependent segment starting mid-row: no probability write and QP is not reset
++ program_slicecmds(de, s->slice_idx);
++ new_slice_segment(de, s);
++ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
++ for (i=0; i<s->sh.num_entry_point_offsets; i++) { // one iteration per further entry point in this segment
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++ int last_x = de->PicWidthInCtbsY-1;
++ if (de->PicWidthInCtbsY>2)
++ wpp_pause(de, ctb_row);
++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
++ if (de->PicWidthInCtbsY==2)
++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++ if (de->PicWidthInCtbsY==1)
++ WriteProb(de, s);
++ else
++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++ ctb_addr_ts += pps->column_width[0]; // advance by column_width[0] CTBs to reach the next entry point
++ wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
++ }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { // Queue all commands for one slice segment in tiles mode
++ const HEVCPPS * const pps = s->ps.pps;
++ int i, resetQPY;
++ // Any slice other than the first in the picture must first close the previous one
++ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
++ pre_slice_decode(de, s);
++ WriteBitstream(de, s);
++ resetQPY = ctb_addr_ts==0 // set at picture start, on a tile change, or for an independent segment
++ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
++ || !s->sh.dependent_slice_segment_flag;
++ if (resetQPY) WriteProb(de, s);
++ program_slicecmds(de, s->slice_idx);
++ new_slice_segment(de, s);
++ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
++ for (i=0; i<s->sh.num_entry_point_offsets; i++) { // one iteration per further entry point in this segment
++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++ int last_x = pps->col_bd[tile_x+1]-1;
++ int last_y = pps->row_bd[tile_y+1]-1;
++ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
++ WriteProb(de, s);
++ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y]; // advance by the CTB count of the current tile
++ new_entry_point(de, s, 0, 1, ctb_addr_ts);
++ }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int cabac_start_align(HEVCContext *s)
++{
++ GetBitContext * const bits = &s->HEVClc->gb;
++ const uint8_t * payload;
++ skip_bits(bits, 1); // drop one bit, then move on to the next byte boundary
++ align_get_bits(bits);
++ payload = bits->buffer + get_bits_count(bits) / 8;
++ // Should look at getting rid of this
++ return ff_init_cabac_decoder(&s->HEVClc->cc, payload, (get_bits_left(bits) + 7) / 8);
++}
++
++static int rpi_hevc_decode_slice(
++ AVCodecContext *avctx,
++ const uint8_t *buffer,
++ uint32_t size)
++{ // Queue the commands for one slice NAL (buffer/size) into this thread's env; returns 0 or -1
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ HEVCContext * const s = avctx->priv_data;
++ dec_env_t * const de = dec_env_get(avctx, rpi);
++ const HEVCPPS *pps = s->ps.pps;
++ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++
++#if TRACE_ENTRY
++ printf("<<< %s[%p]\n", __func__, de);
++#endif
++ if (de == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++ return -1;
++ }
++
++ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++ dec_env_release(rpi, de); // Balance the reference taken by dec_env_get() before bailing out
++ return -1;
++ }
++ de->state = RPIVID_DECODE_SLICE;
++ de->nal_buffer = buffer;
++ de->nal_size = size;
++
++#if !OPT_EMU
++// ff_hevc_cabac_init(s, ctb_addr_ts);
++ cabac_start_align(s);
++#endif
++ if (s->ps.sps->scaling_list_enable_flag)
++ populate_scaling_factors(de, s);
++ pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts) // wavefront vs tiles command generation
++ : decode_slice(de, s, ctb_addr_ts);
++#if TRACE_ENTRY
++ printf(">>> %s[%p]\n", __func__, de);
++#endif
++ dec_env_release(rpi, de);
++ return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
++{
++ // Resolve with ZC_RESOLVE_WAIT_VALID; a non-zero result is logged and handed back unchanged
++ const int rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID);
++ if (rv) av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
++ return rv;
++}
++
++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) // hwaccel alloc_frame: get a zero-copy buffer for frame and attach the resolve hook; returns 0 or an AVERROR
++{
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++ HEVCContext * const s = avctx->priv_data;
++ // Frame buffering + 1 output. Would need thread_count extra but we now
++ // alloc at the start of phase 2 so that is the only thread we need the
++ // extra buffer for.
++ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
++ int rv;
++
++ if (av_rpi_zc_in_use(avctx))
++ {
++ const AVZcEnvPtr zc = avctx->opaque; // user-supplied zero-copy environment
++ av_rpi_zc_set_decoder_pool_size(zc, pool_req);
++ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc
++ }
++ else
++ {
++ if (rpi->zc == NULL) {
++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++ // Alloc inside lock to make sure we only ever alloc one
++ if (rpi->zc == NULL) {
++ rpi->zc = av_rpi_zc_int_env_alloc(s);
++ }
++ pthread_mutex_unlock(&rpi->phase_lock);
++ }
++ if (rpi->zc != NULL) // the env allocation above can fail: never hand NULL to the zc helpers
++ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
++ rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) : av_rpi_zc_get_buffer(rpi->zc, frame);
++ }
++
++ if (rv == 0 &&
++ (rv = ff_attach_decode_data(frame)) < 0)
++ {
++ av_frame_unref(frame); // drop the buffer we just obtained if decode data cannot be attached
++ }
++
++ if (rv == 0)
++ {
++ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
++ fdd->post_process = rpivid_retrieve_data; // resolve the frame when it is handed to the user
++ }
++
++ return rv;
++}
++
++#if OPT_PHASE_TIMING
++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) // Log one row of nine timing-histogram bins
++{
++ av_log(avctx, AV_LOG_INFO, "%7u %7u %7u %7u %7u %7u %7u %7u %7u\n", // bins are unsigned, so print with %u
++ bins[0], bins[1], bins[2], bins[3],
++ bins[4], bins[5], bins[6], bins[7], bins[8]);
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int rpi_hevc_free(AVCodecContext *avctx) { // hwaccel uninit (also init's failure path): wait for workers, then free everything; 0 on success, -1 if workers never stopped
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++
++#if TRACE_ENTRY
++ printf("<<< %s\n", __func__);
++#endif
++
++ dec_env_release(rpi, NULL);
++
++ // Wait for everything else to stop
++ {
++ struct timespec tt;
++ clock_gettime(CLOCK_REALTIME, &tt);
++ tt.tv_sec += 2; // absolute deadline two seconds from now
++ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
++ const int err = errno;
++ if (err == ETIMEDOUT) {
++ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
++ return -1; // nothing is freed on this path
++ }
++ if (err != EINTR) { // EINTR simply retries the wait
++ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
++ break;
++ }
++ }
++ }
++
++#if OPT_PHASE_TIMING
++ {
++ unsigned int i;
++ for (i = 0; i != RPIVID_PHASES; ++i) {
++ const phase_wait_env_t * const p = rpi->phase_reqs + i;
++ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
++ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
++ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n",
++ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
++ time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
++ log_bin_phase(avctx, p->time_bins);
++ log_bin_phase(avctx, p->time_bins3);
++ log_bin_phase(avctx, p->time_bins5);
++ av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
++ (unsigned int)(p->max_phase_time / 1000),
++ p->max_time_decode_order);
++ }
++ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
++ }
++#endif
++
++ if (rpi->dec_envs != NULL)
++ {
++ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { // index must start at 0; slots fill in order so stop at the first NULL
++ dec_env_delete(rpi->dec_envs[i]);
++ }
++ av_freep(&rpi->dec_envs);
++ }
++
++ av_rpi_zc_int_env_freep(&rpi->zc);
++
++ gpu_free(&rpi->gcolbuf);
++
++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++ gpu_free(rpi->gbitbufs + i);
++ }
++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++ gpu_free(rpi->gcoeffbufs + i);
++ }
++
++ unmap_devp(&rpi->regs, REGS_SIZE);
++ unmap_devp(&rpi->ints, INTS_SIZE);
++
++ if (rpi->gpu_init_type > 0) // only undo the GPU init when init reported a positive type
++ rpi_mem_gpu_uninit();
++
++ if (rpi->mbox_fd >= 0) { // init leaves mbox_fd at -1 until the mailbox is open
++ mbox_release_clock(rpi->mbox_fd);
++ mbox_close(rpi->mbox_fd);
++ }
++
++ sem_destroy(&rpi->ref_zero);
++ sem_destroy(&rpi->coeffbuf_sem);
++ sem_destroy(&rpi->bitbuf_sem);
++
++#if TRACE_ENTRY
++ printf(">>> %s\n", __func__);
++#endif
++ return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int rpi_hevc_init(AVCodecContext *avctx) { // hwaccel init: open mailbox and devices, allocate GPU buffers; 0 on success, AVERROR on failure
++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++// const char *err;
++
++#if TRACE_ENTRY
++ printf("<<< %s\n", __func__);
++#endif
++
++ if (avctx->width>4096 || avctx->height>4096) {
++ av_log(avctx, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
++ return AVERROR(ENOTSUP);
++ }
++
++ memset(rpi, 0, sizeof(*rpi));
++
++ rpi->mbox_fd = -1; // "not open" marker tested by rpi_hevc_free
++ rpi->decode_order = 0;
++
++ // Initial PU/COEFF stream buffer split chosen as worst case seen so far
++ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
++
++ // One reference is held from init; rpi_hevc_free waits on ref_zero
++ atomic_store(&rpi->ref_count, 1);
++ sem_init(&rpi->ref_zero, 0, 0);
++ // Counting semaphores start with every bit/coeff buffer available
++ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS);
++ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
++
++ pthread_mutex_init(&rpi->phase_lock, NULL);
++
++ if ((rpi->mbox_fd = mbox_open()) < 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
++ goto fail;
++ }
++ mbox_request_clock(rpi->mbox_fd);
++
++ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
++ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
++ goto fail;
++ }
++
++ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
++ goto fail;
++ }
++ // One (initially NULL) dec_env slot per decode thread
++ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
++ goto fail;
++ }
++
++ rpi->col_stride = rnd64(avctx->width);
++ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); // height rounded up to a multiple of 64, then divided by 16
++ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
++ goto fail;
++ }
++
++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %u\n", i);
++ goto fail;
++ }
++ }
++
++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
++ {
++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %u\n", i);
++ goto fail;
++ }
++ }
++
++ av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
++
++ return 0;
++
++fail:
++ rpi_hevc_free(avctx); // releases whatever was set up before the failure
++ return AVERROR_EXTERNAL;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++const AVHWAccel ff_hevc_rpi4_8_hwaccel = { // HEVC hwaccel descriptor for the AV_PIX_FMT_RPI4_8 output format
++ .name = "hevc_rpi4_8",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_RPI4_8,
++ .alloc_frame = rpivid_hevc_alloc_frame, // zero-copy buffer allocation
++ .start_frame = rpi_hevc_start_frame,
++ .end_frame = rpi_hevc_end_frame,
++ .abort_frame = rpi_hevc_abort_frame,
++ .decode_slice = rpi_hevc_decode_slice, // called once per slice NAL
++ .init = rpi_hevc_init,
++ .uninit = rpi_hevc_free,
++ .priv_data_size = sizeof(RPI_T), // size of the hwaccel_priv_data the callbacks read back as RPI_T
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
++const AVHWAccel ff_hevc_rpi4_10_hwaccel = { // HEVC hwaccel descriptor for the AV_PIX_FMT_RPI4_10 output format
++ .name = "hevc_rpi4_10",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_RPI4_10,
++ .alloc_frame = rpivid_hevc_alloc_frame, // zero-copy buffer allocation
++ .start_frame = rpi_hevc_start_frame,
++ .end_frame = rpi_hevc_end_frame,
++ .abort_frame = rpi_hevc_abort_frame,
++ .decode_slice = rpi_hevc_decode_slice, // called once per slice NAL
++ .init = rpi_hevc_init,
++ .uninit = rpi_hevc_free,
++ .priv_data_size = sizeof(RPI_T), // size of the hwaccel_priv_data the callbacks read back as RPI_T
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
+--- a/libavcodec/v4l2_buffers.c
++++ b/libavcodec/v4l2_buffers.c
+@@ -21,6 +21,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include <drm_fourcc.h>
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <sys/mman.h>
+@@ -29,57 +30,82 @@
+ #include <poll.h>
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
++#include "libavutil/avassert.h"
+ #include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext.h"
+ #include "v4l2_context.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+
+ #define USEC_PER_SEC 1000000
+-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+
+-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
+ {
+ return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+ container_of(buf->context, V4L2m2mContext, output) :
+ container_of(buf->context, V4L2m2mContext, capture);
+ }
+
+-static inline AVCodecContext *logger(V4L2Buffer *buf)
++static inline AVCodecContext *logger(const V4L2Buffer * const buf)
+ {
+ return buf_to_m2mctx(buf)->avctx;
+ }
+
+-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+-
+- if (s->avctx->pkt_timebase.num)
+- return s->avctx->pkt_timebase;
+- return s->avctx->time_base;
++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++ const AVRational tb = s->avctx->pkt_timebase.num ?
++ s->avctx->pkt_timebase :
++ s->avctx->time_base;
++ return tb.num && tb.den ? tb : v4l2_timebase;
+ }
+
+-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
++static inline struct timeval tv_from_int(const int64_t t)
+ {
+- int64_t v4l2_pts;
++ return (struct timeval){
++ .tv_usec = t % USEC_PER_SEC,
++ .tv_sec = t / USEC_PER_SEC
++ };
++}
+
+- if (pts == AV_NOPTS_VALUE)
+- pts = 0;
++static inline int64_t int_from_tv(const struct timeval t) // Flatten a timeval into one 64-bit count of microseconds
++{
++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec; // cast first so the multiply is done in 64 bits
++}
+
++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
++{
+ /* convert pts to v4l2 timebase */
+- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
++ const int64_t v4l2_pts =
++ pts == AV_NOPTS_VALUE ? 0 :
++ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
++ out->buf.timestamp = tv_from_int(v4l2_pts);
+ }
+
+-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
+ {
+- int64_t v4l2_pts;
+-
++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
++#if 0
+ /* convert pts back to encoder timebase */
+- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+- avbuf->buf.timestamp.tv_usec;
++ return
++ avbuf->context->no_pts_rescale ? v4l2_pts :
++ v4l2_pts == 0 ? AV_NOPTS_VALUE :
++ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++#endif
++}
+
+- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) // Store bytesused/length where the buffer type keeps them
++{
++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++ out->planes[plane].bytesused = bytesused; // multiplanar: per-plane fields
++ out->planes[plane].length = length;
++ } else {
++ out->buf.bytesused = bytesused; // single-planar: fields on the buffer itself, plane is not used
++ out->buf.length = length;
++ }
+ }
+
+ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+@@ -116,49 +142,176 @@ static enum AVColorPrimaries v4l2_get_co
+ return AVCOL_PRI_UNSPECIFIED;
+ }
+
+-static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+-{
+- enum v4l2_quantization qt;
++static void v4l2_set_color(V4L2Buffer *buf, // Map FFmpeg colour metadata onto colorspace/ycbcr_enc/xfer_func of the buffer's context format
++ const enum AVColorPrimaries avcp,
++ const enum AVColorSpace avcs,
++ const enum AVColorTransferCharacteristic avxc)
++{
++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++ // Primaries give a first guess at the colourspace (and sometimes the encoding)
++ switch (avcp) {
++ case AVCOL_PRI_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ ycbcr = V4L2_YCBCR_ENC_709;
++ break;
++ case AVCOL_PRI_BT470M:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ ycbcr = V4L2_YCBCR_ENC_601;
++ break;
++ case AVCOL_PRI_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_PRI_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_PRI_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_PRI_BT2020:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ case AVCOL_PRI_SMPTE428:
++ case AVCOL_PRI_SMPTE431:
++ case AVCOL_PRI_SMPTE432:
++ case AVCOL_PRI_EBU3213:
++ case AVCOL_PRI_RESERVED:
++ case AVCOL_PRI_FILM:
++ case AVCOL_PRI_UNSPECIFIED:
++ default:
++ break;
++ }
+
+- qt = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.quantization :
+- buf->context->format.fmt.pix.quantization;
++ switch (avcs) { // a recognised colour space overrides the guess made from the primaries
++ case AVCOL_SPC_RGB:
++ cs = V4L2_COLORSPACE_SRGB;
++ break;
++ case AVCOL_SPC_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ break;
++ case AVCOL_SPC_FCC:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ break;
++ case AVCOL_SPC_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_SPC_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_SPC_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_SPC_BT2020_CL:
++ cs = V4L2_COLORSPACE_BT2020;
++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++ break;
++ case AVCOL_SPC_BT2020_NCL:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ default:
++ break;
++ }
+
+- switch (qt) {
+- case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
+- case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
++ switch (avxc) { // must test the FFmpeg argument: xfer is only the V4L2 default being replaced
++ case AVCOL_TRC_BT709:
++ xfer = V4L2_XFER_FUNC_709;
++ break;
++ case AVCOL_TRC_IEC61966_2_1:
++ xfer = V4L2_XFER_FUNC_SRGB;
++ break;
++ case AVCOL_TRC_SMPTE240M:
++ xfer = V4L2_XFER_FUNC_SMPTE240M;
++ break;
++ case AVCOL_TRC_SMPTE2084:
++ xfer = V4L2_XFER_FUNC_SMPTE2084;
++ break;
+ default:
+ break;
+ }
+
+- return AVCOL_RANGE_UNSPECIFIED;
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) { // colour info lives in the context's format, not in the buffer
++ buf->context->format.fmt.pix_mp.colorspace = cs;
++ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix_mp.xfer_func = xfer;
++ } else {
++ buf->context->format.fmt.pix.colorspace = cs;
++ buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix.xfer_func = xfer;
++ }
+ }
+
+-static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
++static inline enum v4l2_quantization
++buf_quantization(const V4L2Buffer * const buf)
+ {
+- enum v4l2_ycbcr_encoding ycbcr;
+- enum v4l2_colorspace cs;
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++ buf->context->format.fmt.pix_mp.quantization :
++ buf->context->format.fmt.pix.quantization;
++}
+
+- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++static inline enum v4l2_colorspace
++buf_colorspace(const V4L2Buffer * const buf) // Read the colorspace from the context format, honouring single/multi-planar layout
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? // pix_mp for multiplanar buffer types, pix otherwise
+ buf->context->format.fmt.pix_mp.colorspace :
+ buf->context->format.fmt.pix.colorspace;
++}
+
+- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
++static inline enum v4l2_ycbcr_encoding
++buf_ycbcr_enc(const V4L2Buffer * const buf) // Read the YCbCr encoding from the context format, honouring single/multi-planar layout
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ? // pix_mp for multiplanar buffer types, pix otherwise
+ buf->context->format.fmt.pix_mp.ycbcr_enc:
+ buf->context->format.fmt.pix.ycbcr_enc;
++}
+
+- switch(cs) {
+- case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
++static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf) // Convert the format's V4L2 quantization into an AVColorRange
++{
++ switch (buf_quantization(buf)) {
++ case V4L2_QUANTIZATION_LIM_RANGE:
++ return AVCOL_RANGE_MPEG;
++ case V4L2_QUANTIZATION_FULL_RANGE:
++ return AVCOL_RANGE_JPEG;
++ case V4L2_QUANTIZATION_DEFAULT:
++ // If YUV (which we assume for all video decode) then, from the header
++ // comments, range is limited unless CS is JPEG
++ return buf_colorspace(buf) == V4L2_COLORSPACE_JPEG ?
++ AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
++ default:
++ break;
++ }
++ // Any other quantization value is reported as unspecified
++ return AVCOL_RANGE_UNSPECIFIED;
++}
++
++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr) // Store an AVColorRange as V4L2 quantization in the context format
++{
++ const enum v4l2_quantization q =
++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
++ V4L2_QUANTIZATION_DEFAULT; // every other range value maps to the default
++ // Write to whichever format member matches the buffer type
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
++ buf->context->format.fmt.pix_mp.quantization = q;
++ } else {
++ buf->context->format.fmt.pix.quantization = q;
++ }
++}
++
++static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
++{
++ switch (buf_colorspace(buf)) {
++ case V4L2_COLORSPACE_JPEG: // JPEG -> SRGB
++ case V4L2_COLORSPACE_SRGB:
++ return AVCOL_SPC_RGB;
+ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
+ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
+ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
+ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
+ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
+ case V4L2_COLORSPACE_BT2020:
+- if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
+- return AVCOL_SPC_BT2020_CL;
+- else
+- return AVCOL_SPC_BT2020_NCL;
++ return buf_ycbcr_enc(buf) == V4L2_YCBCR_ENC_BT2020_CONST_LUM ?
++ AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL;
+ default:
+ break;
+ }
+@@ -168,17 +321,9 @@ static enum AVColorSpace v4l2_get_color_
+
+ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
+ {
+- enum v4l2_ycbcr_encoding ycbcr;
++ const enum v4l2_ycbcr_encoding ycbcr = buf_ycbcr_enc(buf);
+ enum v4l2_xfer_func xfer;
+- enum v4l2_colorspace cs;
+-
+- cs = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.colorspace :
+- buf->context->format.fmt.pix.colorspace;
+-
+- ycbcr = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+- buf->context->format.fmt.pix_mp.ycbcr_enc:
+- buf->context->format.fmt.pix.ycbcr_enc;
++ const enum v4l2_colorspace cs = buf_colorspace(buf);
+
+ xfer = V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type) ?
+ buf->context->format.fmt.pix_mp.xfer_func:
+@@ -210,73 +355,165 @@ static enum AVColorTransferCharacteristi
+ return AVCOL_TRC_UNSPECIFIED;
+ }
+
+-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
+ {
+- V4L2Buffer* avbuf = opaque;
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+-
+- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
+- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
++ return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
++}
+
+- if (s->reinit) {
+- if (!atomic_load(&s->refcount))
+- sem_post(&s->refsync);
+- } else {
+- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
+- /* no need to queue more buffers to the driver */
+- avbuf->status = V4L2BUF_AVAILABLE;
+- }
+- else if (avbuf->context->streamon)
+- ff_v4l2_buffer_enqueue(avbuf);
+- }
++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf) // Non-zero only when the buffer's field is exactly V4L2_FIELD_INTERLACED_TB
++{
++ return buf->buf.field == V4L2_FIELD_INTERLACED_TB; // every other field value yields 0
++}
+
+- av_buffer_unref(&avbuf->context_ref);
+- }
++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff) // Set the buffer's field from interlace + top-field-first flags
++{
++ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE : // is_tff is ignored when not interlaced
++ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
+ }
+
+-static int v4l2_buf_increase_ref(V4L2Buffer *in)
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(in);
++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor *layer;
+
+- if (in->context_ref)
+- atomic_fetch_add(&in->context_refcount, 1);
+- else {
+- in->context_ref = av_buffer_ref(s->self_ref);
+- if (!in->context_ref)
+- return AVERROR(ENOMEM);
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_objects = avbuf->num_planes;
++ drm_desc->nb_layers = 1;
+
+- in->context_refcount = 1;
++ layer = &drm_desc->layers[0];
++ layer->nb_planes = avbuf->num_planes;
++
++ for (int i = 0; i < avbuf->num_planes; i++) {
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = 0;
++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+ }
+
+- in->status = V4L2BUF_RET_USER;
+- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
++ switch (avbuf->context->av_pix_fmt) {
++ case AV_PIX_FMT_YUYV422:
++
++ layer->format = DRM_FORMAT_YUYV;
++ layer->nb_planes = 1;
+
+- return 0;
++ break;
++
++ case AV_PIX_FMT_NV12:
++ case AV_PIX_FMT_NV21:
++
++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 2;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++ break;
++
++ case AV_PIX_FMT_YUV420P:
++
++ layer->format = DRM_FORMAT_YUV420;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 3;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
++
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset +
++ ((avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height) >> 2);
++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0;
++ break;
++ }
++
++ return (uint8_t *) drm_desc;
+ }
+
+-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
++static void v4l2_free_bufref(void *opaque, uint8_t *data)
+ {
+- int ret;
++ AVBufferRef * bufref = (AVBufferRef *)data;
++ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
++ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
+
+- if (plane >= in->num_planes)
+- return AVERROR(EINVAL);
++ if (ctx != NULL) {
++ // Buffer still attached to context
++ V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+
+- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
+- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
+- in->plane_info[plane].length, v4l2_free_buffer, in, 0);
+- if (!*buf)
+- return AVERROR(ENOMEM);
++ ff_mutex_lock(&ctx->lock);
+
+- ret = v4l2_buf_increase_ref(in);
+- if (ret)
+- av_buffer_unref(buf);
++ ff_v4l2_buffer_set_avail(avbuf);
+
+- return ret;
++ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
++ /* no need to queue more buffers to the driver */
++ }
++ else if (ctx->streamon) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
++ avbuf->buf.timestamp.tv_sec = 0;
++ avbuf->buf.timestamp.tv_usec = 0;
++ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER
++ }
++ else {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
++ }
++
++ ff_mutex_unlock(&ctx->lock);
++ }
++
++ ff_weak_link_unlock(avbuf->context_wl);
++ av_buffer_unref(&bufref);
+ }
+
+-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref)
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) // Export each plane via VIDIOC_EXPBUF and record fd/size in avbuf->drm_frame; 0 or AVERROR(errno)
++{
++ struct v4l2_exportbuffer expbuf;
++ int i, ret;
++
++ for (i = 0; i < avbuf->num_planes; i++) {
++ memset(&expbuf, 0, sizeof(expbuf));
++ // Identify the buffer and plane to export
++ expbuf.index = avbuf->buf.index;
++ expbuf.type = avbuf->buf.type;
++ expbuf.plane = i;
++
++ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
++ if (ret < 0)
++ return AVERROR(errno); // NOTE(review): fds exported by earlier iterations stay in drm_frame - confirm they are closed elsewhere
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
++ /* drm frame */
++ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
++ avbuf->drm_frame.objects[i].fd = expbuf.fd;
++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ } else {
++ /* drm frame */
++ avbuf->drm_frame.objects[0].size = avbuf->buf.length; // single-planar: always object 0
++ avbuf->drm_frame.objects[0].fd = expbuf.fd;
++ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ }
++ }
++
++ return 0;
++}
++
++static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
+ {
+ unsigned int bytesused, length;
++ int rv = 0;
+
+ if (plane >= out->num_planes)
+ return AVERROR(EINVAL);
+@@ -284,32 +521,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer
+ length = out->plane_info[plane].length;
+ bytesused = FFMIN(size+offset, length);
+
+- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
+-
+- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+- out->planes[plane].bytesused = bytesused;
+- out->planes[plane].length = length;
+- } else {
+- out->buf.bytesused = bytesused;
+- out->buf.length = length;
++ if (size > length - offset) {
++ size = length - offset;
++ rv = AVERROR(ENOMEM);
+ }
+
+- return 0;
++ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
++
++ set_buf_length(out, plane, bytesused, length);
++
++ return rv;
++}
++
++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf) // Wrap a new ref to the context's bufref for this buffer index in an AVBufferRef freed by v4l2_free_bufref
++{
++ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
++ AVBufferRef * newbuf;
++
++ if (!bufref)
++ return NULL;
++ // The wrapper's data pointer is the inner AVBufferRef itself
++ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
++ if (newbuf == NULL)
++ av_buffer_unref(&bufref); // wrapper failed: drop the reference taken above
++ // NOTE(review): status becomes RET_USER even when newbuf is NULL - confirm callers cope
++ avbuf->status = V4L2BUF_RET_USER;
++ return newbuf;
+ }
+
+ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+- int i, ret;
++ int i;
+
+ frame->format = avbuf->context->av_pix_fmt;
+
+- for (i = 0; i < avbuf->num_planes; i++) {
+- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
+- if (ret)
+- return ret;
++ frame->buf[0] = wrap_avbuf(avbuf);
++ if (frame->buf[0] == NULL)
++ return AVERROR(ENOMEM);
++
++ if (buf_to_m2mctx(avbuf)->output_drm) {
++ /* 1. get references to the actual data */
++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
++ return 0;
++ }
+
++
++ /* 1. get references to the actual data */
++ for (i = 0; i < avbuf->num_planes; i++) {
++ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
+ frame->linesize[i] = avbuf->plane_info[i].bytesperline;
+- frame->data[i] = frame->buf[i]->data;
+ }
+
+ /* fixup special cases */
+@@ -318,17 +580,17 @@ static int v4l2_buffer_buf_to_swframe(AV
+ case AV_PIX_FMT_NV21:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
++ frame->linesize[1] = frame->linesize[0];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
++ frame->linesize[1] = frame->linesize[0] / 2;
++ frame->linesize[2] = frame->linesize[1];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
++ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
+ break;
+
+ default:
+@@ -338,68 +600,127 @@ static int v4l2_buffer_buf_to_swframe(AV
+ return 0;
+ }
+
++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) // Copy h rows of w bytes between two strided buffers
++{
++ if (dst_stride == src_stride && w + 32 >= dst_stride) { // equal strides and at most 32 bytes of padding per row: one bulk copy, padding included
++ memcpy(dst, src, dst_stride * h);
++ }
++ else {
++ while (--h >= 0) { // otherwise copy row by row, only the w payload bytes
++ memcpy(dst, src, w);
++ dst += dst_stride;
++ src += src_stride;
++ }
++ }
++}
++
++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) // Non-zero when plane i is treated as chroma
++{
++ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); // plane 0 is never chroma; nor is the last plane when the format has alpha
++}
++
++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out) // Attach a DRM_PRIME frame's single dmabuf fd to out; returns 0 or an AVERROR
++{
++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
++ return AVERROR(EINVAL);
++
++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++ // Only currently cope with single buffer types
++ if (out->buf.length != 1)
++ return AVERROR_PATCHWELCOME;
++ if (src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ out->planes[0].m.fd = src->objects[0].fd;
++ }
++ else {
++ if (src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ out->buf.m.fd = src->objects[0].fd;
++ }
++
++ // No need to copy src AVDescriptor and if we did then we may confuse
++ // fd close on free
++ out->ref_buf = av_buffer_ref(frame->buf[0]); // hold a ref so the source frame's buffer stays alive
++ // A failed ref must be reported: without it nothing keeps the fd's owner alive
++ return out->ref_buf != NULL ? 0 : AVERROR(ENOMEM);
++}
++
+ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+ {
+- int i, ret;
+- struct v4l2_format fmt = out->context->format;
+- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
+- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
+- int is_planar_format = 0;
+-
+- switch (pixel_format) {
+- case V4L2_PIX_FMT_YUV420M:
+- case V4L2_PIX_FMT_YVU420M:
+-#ifdef V4L2_PIX_FMT_YUV422M
+- case V4L2_PIX_FMT_YUV422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU422M
+- case V4L2_PIX_FMT_YVU422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YUV444M
+- case V4L2_PIX_FMT_YUV444M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU444M
+- case V4L2_PIX_FMT_YVU444M:
+-#endif
+- case V4L2_PIX_FMT_NV12M:
+- case V4L2_PIX_FMT_NV21M:
+- case V4L2_PIX_FMT_NV12MT_16X16:
+- case V4L2_PIX_FMT_NV12MT:
+- case V4L2_PIX_FMT_NV16M:
+- case V4L2_PIX_FMT_NV61M:
+- is_planar_format = 1;
+- }
+-
+- if (!is_planar_format) {
+- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+- int planes_nb = 0;
+- int offset = 0;
+-
+- for (i = 0; i < desc->nb_components; i++)
+- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+-
+- for (i = 0; i < planes_nb; i++) {
+- int size, h = height;
+- if (i == 1 || i == 2) {
++ int i;
++ int num_planes = 0;
++ int pel_strides[4] = {0};
++
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++
++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
++ return -1;
++ }
++
++ for (i = 0; i != desc->nb_components; ++i) {
++ if (desc->comp[i].plane >= num_planes)
++ num_planes = desc->comp[i].plane + 1;
++ pel_strides[desc->comp[i].plane] = desc->comp[i].step;
++ }
++
++ if (out->num_planes > 1) {
++ if (num_planes != out->num_planes) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
++ return -1;
++ }
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ if (is_chroma(desc, i, num_planes)) {
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+ }
+- size = frame->linesize[i] * h;
+- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset, frame->buf[i]);
+- if (ret)
+- return ret;
+- offset += size;
++
++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
+ }
+- return 0;
+ }
++ else
++ {
++ unsigned int offset = 0;
++
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ int dst_stride = out->plane_info[0].bytesperline;
++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
++
++ if (is_chroma(desc, i, num_planes)) {
++ // Is chroma
++ dst_stride >>= desc->log2_chroma_w;
++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
++ }
++ else {
++ // Is luma or alpha
++ offset += dst_stride * out->context->height;
++ }
++ if (offset > out->plane_info[0].length) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
++ return -1;
++ }
+
+- for (i = 0; i < out->num_planes; i++) {
+- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]);
+- if (ret)
+- return ret;
++ cpy_2d(dst, dst_stride,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ }
++ set_buf_length(out, 0, offset, out->plane_info[0].length);
+ }
+-
+ return 0;
+ }
+
+@@ -409,16 +730,31 @@ static int v4l2_buffer_swframe_to_buf(co
+ *
+ ******************************************************************************/
+
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
+ {
+- v4l2_set_pts(out, frame->pts);
+-
+- return v4l2_buffer_swframe_to_buf(frame, out);
++ out->buf.flags = frame->key_frame ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
++ // Beware that colour info is held in format rather than the actual
++ // v4l2 buffer struct so this may not be as useful as you might hope
++ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
++ v4l2_set_color_range(out, frame->color_range);
++ // PTS & interlace are buffer vars
++ if (track_ts)
++ out->buf.timestamp = tv_from_int(track_ts);
++ else
++ v4l2_set_pts(out, frame->pts);
++ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
++
++ return frame->format == AV_PIX_FMT_DRM_PRIME ?
++ v4l2_buffer_primeframe_to_buf(frame, out) :
++ v4l2_buffer_swframe_to_buf(frame, out);
+ }
+
+ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+ int ret;
++ V4L2Context * const ctx = avbuf->context;
+
+ av_frame_unref(frame);
+
+@@ -429,17 +765,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+
+ /* 2. get frame information */
+ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
++ AV_PICTURE_TYPE_NONE;
+ frame->color_primaries = v4l2_get_color_primaries(avbuf);
+ frame->colorspace = v4l2_get_color_space(avbuf);
+ frame->color_range = v4l2_get_color_range(avbuf);
+ frame->color_trc = v4l2_get_color_trc(avbuf);
+ frame->pts = v4l2_get_pts(avbuf);
+ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
++ frame->top_field_first = v4l2_buf_is_top_first(avbuf);
+
+ /* these values are updated also during re-init in v4l2_process_driver_event */
+- frame->height = avbuf->context->height;
+- frame->width = avbuf->context->width;
+- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
++ frame->height = ctx->height;
++ frame->width = ctx->width;
++ frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
++
++ if (ctx->selection.height && ctx->selection.width) {
++ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
++ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0;
++ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
++ frame->width - (ctx->selection.left + ctx->selection.width) : 0;
++ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
++ frame->height - (ctx->selection.top + ctx->selection.height) : 0;
++ }
+
+ /* 3. report errors upstream */
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+@@ -452,15 +803,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFram
+
+ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+ {
+- int ret;
+-
+ av_packet_unref(pkt);
+- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
+- if (ret)
+- return ret;
++
++ pkt->buf = wrap_avbuf(avbuf);
++ if (pkt->buf == NULL)
++ return AVERROR(ENOMEM);
+
+ pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
+- pkt->data = pkt->buf->data;
++ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
++ pkt->flags = 0;
+
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
+ pkt->flags |= AV_PKT_FLAG_KEY;
+@@ -475,31 +826,91 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
+ return 0;
+ }
+
+-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp)
+ {
+ int ret;
+
+- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf);
+- if (ret)
++ if (extlen) {
++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
++ if (ret)
++ return ret;
++ }
++
++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
++ if (ret && ret != AVERROR(ENOMEM))
+ return ret;
+
+- v4l2_set_pts(out, pkt->pts);
++ if (timestamp)
++ out->buf.timestamp = tv_from_int(timestamp);
++ else
++ v4l2_set_pts(out, pkt->pts);
++
++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+
+- if (pkt->flags & AV_PKT_FLAG_KEY)
+- out->flags = V4L2_BUF_FLAG_KEYFRAME;
++ return ret;
++}
+
+- return 0;
++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) // convenience wrapper around the _ext variant
++{
++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0); // extlen 0: no extradata copied; timestamp 0: buffer timestamp is set from pkt->pts
++}
++
++
++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data) // free callback given to av_buffer_create() in ff_v4l2_buffer_initialize(); opaque is unused
++{
++ V4L2Buffer * const avbuf = (V4L2Buffer *)data; // the AVBuffer data pointer is the V4L2Buffer itself
++ int i;
++
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
++ struct V4L2Plane_info *p = avbuf->plane_info + i;
++ if (p->mm_addr != NULL) // NULL means never mapped (zeroed alloc) or a failed mmap reset to NULL
++ munmap(p->mm_addr, p->length);
++ }
++
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++ if (avbuf->drm_frame.objects[i].fd != -1) // -1 is the unused marker written at initialize time
++ close(avbuf->drm_frame.objects[i].fd);
++ }
++
++ av_buffer_unref(&avbuf->ref_buf); // drop any source buffer ref still attached
++
++ ff_weak_link_unref(&avbuf->context_wl); // release the weak link to the context taken at initialize time
++
++ av_free(avbuf); // struct itself came from av_mallocz()
+ }
+
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
++
++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
+ {
+- V4L2Context *ctx = avbuf->context;
+ int ret, i;
++ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
++ AVBufferRef * bufref;
+
+- avbuf->buf.memory = V4L2_MEMORY_MMAP;
++ *pbufref = NULL;
++ if (avbuf == NULL)
++ return AVERROR(ENOMEM);
++
++ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
++ if (bufref == NULL) {
++ av_free(avbuf);
++ return AVERROR(ENOMEM);
++ }
++
++ avbuf->context = ctx;
++ avbuf->buf.memory = mem;
+ avbuf->buf.type = ctx->type;
+ avbuf->buf.index = index;
+
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++ avbuf->drm_frame.objects[i].fd = -1;
++ }
++
++ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
++
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.length = VIDEO_MAX_PLANES;
+ avbuf->buf.m.planes = avbuf->planes;
+@@ -507,7 +918,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+ if (ret < 0)
+- return AVERROR(errno);
++ goto fail;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->num_planes = 0;
+@@ -520,6 +931,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ avbuf->num_planes = 1;
+
+ for (i = 0; i < avbuf->num_planes; i++) {
++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
+
+ avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+ ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+@@ -527,25 +940,29 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+ } else {
+ avbuf->plane_info[i].length = avbuf->buf.length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+ }
+
+- if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
+- return AVERROR(ENOMEM);
++ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
++ avbuf->plane_info[i].mm_addr = NULL;
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
+ }
+
+ avbuf->status = V4L2BUF_AVAILABLE;
+
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- return 0;
+-
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.m.planes = avbuf->planes;
+ avbuf->buf.length = avbuf->num_planes;
+@@ -555,20 +972,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ avbuf->buf.length = avbuf->planes[0].length;
+ }
+
+- return ff_v4l2_buffer_enqueue(avbuf);
++ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ if (buf_to_m2mctx(avbuf)->output_drm) {
++ ret = v4l2_buffer_export_drm(avbuf);
++ if (ret)
++ goto fail;
++ }
++ }
++
++ *pbufref = bufref;
++ return 0;
++
++fail:
++ av_buffer_unref(&bufref);
++ return ret;
+ }
+
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
+ {
+ int ret;
++ int qc;
+
+- avbuf->buf.flags = avbuf->flags;
++ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
++ avbuf->context->q_count);
++ }
+
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
+- if (ret < 0)
+- return AVERROR(errno);
++ if (ret < 0) {
++ int err = errno;
++ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
++ avbuf->context->name, avbuf->buf.index,
++ err, strerror(err));
++ return AVERROR(err);
++ }
+
++ // Lock not wanted - if called from buffer free then lock already obtained
++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+ avbuf->status = V4L2BUF_IN_DRIVER;
++ pthread_cond_broadcast(&avbuf->context->cond);
++
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
+
+ return 0;
+ }
+--- a/libavcodec/v4l2_buffers.h
++++ b/libavcodec/v4l2_buffers.h
+@@ -27,25 +27,38 @@
+ #include <stdatomic.h>
+ #include <linux/videodev2.h>
+
++#include "libavutil/hwcontext_drm.h"
+ #include "avcodec.h"
+
+ enum V4L2Buffer_status {
+ V4L2BUF_AVAILABLE,
+ V4L2BUF_IN_DRIVER,
++ V4L2BUF_IN_USE,
+ V4L2BUF_RET_USER,
+ };
+
+ /**
+ * V4L2Buffer (wrapper for v4l2_buffer management)
+ */
++struct V4L2Context;
++struct ff_weak_link_client;
++
+ typedef struct V4L2Buffer {
+- /* each buffer needs to have a reference to its context */
++ /* each buffer needs to have a reference to its context
++ * The pointer is good enough for most operations but once the buffer has
++ * been passed to the user the buffer may become orphaned so for free ops
++ * the weak link must be used to ensure that the context is actually
++ * there
++ */
+ struct V4L2Context *context;
++ struct ff_weak_link_client *context_wl;
+
+- /* This object is refcounted per-plane, so we need to keep track
+- * of how many context-refs we are holding. */
+- AVBufferRef *context_ref;
+- atomic_uint context_refcount;
++ /* DRM descriptor */
++ AVDRMFrameDescriptor drm_frame;
++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
++ * are done
++ */
++ AVBufferRef * ref_buf;
+
+ /* keep track of the mmap address and mmap length */
+ struct V4L2Plane_info {
+@@ -60,7 +73,6 @@ typedef struct V4L2Buffer {
+ struct v4l2_buffer buf;
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+
+- int flags;
+ enum V4L2Buffer_status status;
+
+ } V4L2Buffer;
+@@ -98,6 +110,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket
+ */
+ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp);
++
+ /**
+ * Extracts the data from an AVFrame to a V4L2Buffer
+ *
+@@ -106,7 +122,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AV
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
+
+ /**
+ * Initializes a V4L2Buffer
+@@ -116,7 +132,7 @@ int ff_v4l2_buffer_avframe_to_buf(const
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
+
+ /**
+ * Enqueues a V4L2Buffer
+@@ -127,5 +143,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer
+ */
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+
++static inline void
++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf) // mark a buffer as free for reuse
++{
++ avbuf->status = V4L2BUF_AVAILABLE;
++ av_buffer_unref(&avbuf->ref_buf); // release the source buffer ref held for this buffer, if any
++}
++
+
+ #endif // AVCODEC_V4L2_BUFFERS_H
+--- a/libavcodec/v4l2_context.c
++++ b/libavcodec/v4l2_context.c
+@@ -27,11 +27,13 @@
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <poll.h>
++#include "libavutil/avassert.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+
+ struct v4l2_format_update {
+ uint32_t v4l2_fmt;
+@@ -41,26 +43,168 @@ struct v4l2_format_update {
+ int update_avfmt;
+ };
+
+-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
++
++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+ {
+- return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+- container_of(ctx, V4L2m2mContext, output) :
+- container_of(ctx, V4L2m2mContext, capture);
++ return (int64_t)n;
+ }
+
+-static inline AVCodecContext *logger(V4L2Context *ctx)
++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+ {
+- return ctx_to_m2mctx(ctx)->avctx;
++ return (unsigned int)pts;
+ }
+
+-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
++// FFmpeg requires us to propagate a number of vars from the coded pkt into
++// the decoded frame. The only thing that tracks like that in V4L2 stateful
++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
++// guarantees about PTS being unique or specified for every frame so replace
++// the supplied PTS with a simple incrementing number and keep a circular
++// buffer of all the things we want preserved (including the original PTS)
++// indexed by the tracking no.
++static int64_t
++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = avpkt->size,
++ .pts = avpkt->pts,
++ .dts = avpkt->dts,
++ .reordered_opaque = avctx->reordered_opaque,
++ .pkt_pos = avpkt->pos,
++ .pkt_duration = avpkt->duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
+ }
+
+-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
++static int64_t
++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = 0,
++ .pts = frame->pts,
++ .dts = AV_NOPTS_VALUE,
++ .reordered_opaque = frame->reordered_opaque,
++ .pkt_pos = frame->pkt_pos,
++ .pkt_duration = frame->pkt_duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
++}
++
++
++// Restore the metadata saved for this frame's tracking number; returns 0 to keep the frame (even if tracking failed), -1 if we should discard the frame
++static int
++xlat_pts_frame_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVFrame *const frame)
++{
++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; // slot index in the tracking table
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) // no timestamp, or slot n holds a different tracking value
++ {
++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ frame->pts = AV_NOPTS_VALUE; // nothing to restore from: report unknown values
++ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->reordered_opaque = x->last_opaque; // reuse the opaque from the last successfully tracked output
++ frame->pkt_pos = -1;
++ frame->pkt_duration = 0;
++ frame->pkt_size = -1;
++ }
++ else if (!t->discard)
++ {
++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE; // saved PTS is handed out once; pending is cleared below
++ frame->pkt_dts = t->dts;
++ frame->reordered_opaque = t->reordered_opaque;
++ frame->pkt_pos = t->pkt_pos;
++ frame->pkt_duration = t->pkt_duration;
++ frame->pkt_size = t->pkt_size;
++
++ x->last_opaque = x->track_els[n].reordered_opaque; // same element as t
++ if (frame->pts != AV_NOPTS_VALUE)
++ x->last_pts = frame->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ return -1; // slot is marked discard: caller should drop this frame
++ }
++
++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
++ return 0;
++}
++
++// Restore the PTS saved for this packet's tracking number and set DTS to match; returns 0 to keep the packet, -1 if we should discard it
++static int
++xlat_pts_pkt_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVPacket *const pkt)
++{
++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE; // slot index in the tracking table
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts) // no timestamp, or slot n holds a different tracking value
++ {
++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ pkt->pts = AV_NOPTS_VALUE;
++ }
++ else if (!t->discard)
++ {
++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE; // saved PTS is handed out once; pending is cleared below
++
++ x->last_opaque = x->track_els[n].reordered_opaque; // same element as t
++ if (pkt->pts != AV_NOPTS_VALUE)
++ x->last_pts = pkt->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ return -1; // slot is marked discard: caller should drop this packet
++ }
++
++ // * Would like something much better than this...xlat(offset + out_count)?
++ pkt->dts = pkt->pts; // DTS simply mirrors the restored PTS (NOPTS if tracking failed)
++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ pkt->pts, t->track_pts, n);
++ return 0;
++}
++
++
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx) // map a queue context back to the V4L2m2mContext that embeds it
++{
++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++ container_of(ctx, V4L2m2mContext, output) : // ctx is the output member
++ container_of(ctx, V4L2m2mContext, capture); // otherwise ctx is the capture member
++}
++
++static inline AVCodecContext *logger(const V4L2Context *ctx)
++{
++ return ctx_to_m2mctx(ctx)->avctx;
+ }
+
+ static AVRational v4l2_get_sar(V4L2Context *ctx)
+@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Conte
+ return sar;
+ }
+
+-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
++static inline int ctx_buffers_alloced(const V4L2Context * const ctx) // non-zero once the context has a bufrefs array
++{
++ return ctx->bufrefs != NULL;
++}
++
++// Width/Height changed or we don't have an alloc in the first place?
++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+ {
+- struct v4l2_format *fmt1 = &ctx->format;
+- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+- :
+- fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+- fmt1->fmt.pix.height != fmt2->fmt.pix.height;
++ const struct v4l2_format *fmt1 = &ctx->format;
++ int ret = !ctx_buffers_alloced(ctx) ||
++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
++ :
++ fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
++ fmt1->fmt.pix.height != fmt2->fmt.pix.height);
+
+ if (ret)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
+ ctx->name,
+- v4l2_get_width(fmt1), v4l2_get_height(fmt1),
+- v4l2_get_width(fmt2), v4l2_get_height(fmt2));
++ ctx_buffers_alloced(ctx),
++ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
++ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
+
+ return ret;
+ }
+@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(
+ }
+ }
+
+-/**
+- * handle resolution change event and end of stream event
+- * returns 1 if reinit was successful, negative if it failed
+- * returns 0 if reinit was not executed
+- */
+-static int v4l2_handle_event(V4L2Context *ctx)
++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+- struct v4l2_format cap_fmt = s->capture.format;
+- struct v4l2_format out_fmt = s->output.format;
+- struct v4l2_event evt = { 0 };
+- int full_reinit, reinit, ret;
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ struct v4l2_selection selection = {
++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
++ .target = V4L2_SEL_TGT_COMPOSE
++ };
+
+- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
+- return 0;
+- }
++ memset(r, 0, sizeof(*r));
++ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
++ return AVERROR(errno);
+
+- if (evt.type == V4L2_EVENT_EOS) {
+- ctx->done = 1;
+- return 0;
+- }
++ *r = selection.r;
++ return 0;
++}
+
+- if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
+- return 0;
++static int do_source_change(V4L2m2mContext * const s)
++{
++ AVCodecContext *const avctx = s->avctx;
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
+- return 0;
+- }
++ int ret;
++ int reinit;
++ struct v4l2_format cap_fmt = s->capture.format;
++
++ s->capture.done = 0;
+
+ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+ if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
++ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
+ return 0;
+ }
+
+- full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
+- if (full_reinit) {
+- s->output.height = v4l2_get_height(&out_fmt);
+- s->output.width = v4l2_get_width(&out_fmt);
+- s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+- }
++ get_default_selection(&s->capture, &s->capture.selection);
++
++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
++ reinit = 1;
+
+- reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
++ s->capture.format = cap_fmt;
+ if (reinit) {
+- s->capture.height = v4l2_get_height(&cap_fmt);
+- s->capture.width = v4l2_get_width(&cap_fmt);
+- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++ s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
++ s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
+ }
+
+- if (full_reinit || reinit)
+- s->reinit = 1;
+-
+- if (full_reinit) {
+- ret = ff_v4l2_m2m_codec_full_reinit(s);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
+- return AVERROR(EINVAL);
+- }
+- goto reinit_run;
++ // If we don't support selection (or it is bust) and we obviously have HD then kludge
++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
++ (s->capture.height == 1088 && s->capture.width == 1920)) {
++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
+ }
+
++ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++
++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
++ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
++ s->capture.width, s->capture.height,
++ s->capture.selection.width, s->capture.selection.height,
++ s->capture.selection.left, s->capture.selection.top, reinit);
++
+ if (reinit) {
+- if (s->avctx)
+- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
++ if (avctx)
++ ret = ff_set_dimensions(s->avctx,
++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
++ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
+ if (ret < 0)
+- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
++ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
+
+ ret = ff_v4l2_m2m_codec_reinit(s);
+ if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
++ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
+ return AVERROR(EINVAL);
+ }
++
++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
++ s->capture.width, s->capture.height,
++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
++ return AVERROR(EINVAL);
++ }
++
++ // Update pixel format - should only actually do something on initial change
++ s->capture.av_pix_fmt =
++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
++ if (s->output_drm) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ avctx->sw_pix_fmt = s->capture.av_pix_fmt;
++ }
++ else
++ avctx->pix_fmt = s->capture.av_pix_fmt;
++
+ goto reinit_run;
+ }
+
+- /* dummy event received */
+- return 0;
++ /* Buffers are OK so just stream off to ack */
++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
++
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
++ s->draining = 0;
+
+ /* reinit executed */
+ reinit_run:
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
+ return 1;
+ }
+
+@@ -280,171 +452,277 @@ static int v4l2_stop_encode(V4L2Context
+ return 0;
+ }
+
+-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
+-{
+- struct v4l2_plane planes[VIDEO_MAX_PLANES];
+- struct v4l2_buffer buf = { 0 };
+- V4L2Buffer *avbuf;
+- struct pollfd pfd = {
+- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+- .fd = ctx_to_m2mctx(ctx)->fd,
++// DQ a buffer
++// Amalgamates all the various ways there are of signalling EOS/Event to
++// generate a consistent EPIPE.
++//
++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
++//
++// Returns:
++// 0 Success
++// AVERROR(EPIPE) Nothing more to read
++// AVERROR(ENOSPC) No buffers in Q to put result in
++// * AVERROR(..)
++
++ static int
++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
++{
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ V4L2Buffer * avbuf;
++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
++
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ struct v4l2_buffer buf = {
++ .type = ctx->type,
++ .memory = V4L2_MEMORY_MMAP,
+ };
+- int i, ret;
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- break;
+- }
+- if (i == ctx->num_buffers)
+- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
+- "userspace. Increase num_capture_buffers "
+- "to prevent device deadlock or dropped "
+- "packets/frames.\n");
+- }
+-
+- /* if we are draining and there are no more capture buffers queued in the driver we are done */
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- /* capture buffer initialization happens during decode hence
+- * detection happens at runtime
+- */
+- if (!ctx->buffers)
+- break;
+-
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- goto start;
+- }
+- ctx->done = 1;
+- return NULL;
+- }
+-
+-start:
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- pfd.events = POLLOUT | POLLWRNORM;
+- else {
+- /* no need to listen to requests for more input while draining */
+- if (ctx_to_m2mctx(ctx)->draining)
+- pfd.events = POLLIN | POLLRDNORM | POLLPRI;
++ *ppavbuf = NULL;
++
++ if (ctx->flag_last)
++ return AVERROR(EPIPE);
++
++ if (is_mp) {
++ buf.length = VIDEO_MAX_PLANES;
++ buf.m.planes = planes;
+ }
+
+- for (;;) {
+- ret = poll(&pfd, 1, timeout);
+- if (ret > 0)
+- break;
+- if (errno == EINTR)
+- continue;
+- return NULL;
++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
++ const int err = errno;
++ av_assert0(AVERROR(err) < 0);
++ if (err != EINTR) {
++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
++ ctx->name, av_err2str(AVERROR(err)));
++
++ if (err == EPIPE)
++ ctx->flag_last = 1;
++
++ return AVERROR(err);
++ }
+ }
++ atomic_fetch_sub(&ctx->q_count, 1);
+
+- /* 0. handle errors */
+- if (pfd.revents & POLLERR) {
+- /* if we are trying to get free buffers but none have been queued yet
+- no need to raise a warning */
+- if (timeout == 0) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+- }
++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
++ ff_v4l2_buffer_set_avail(avbuf);
++ avbuf->buf = buf;
++ if (is_mp) {
++ memcpy(avbuf->planes, planes, sizeof(planes));
++ avbuf->buf.m.planes = avbuf->planes;
++ }
++ // Done with any attached buffer
++ av_buffer_unref(&avbuf->ref_buf);
++
++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
++ // Zero length cap buffer return == EOS
++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
++
++ // Must reQ so we don't leak
++ // May not matter if the next thing we do is release all the
++ // buffers but better to be tidy.
++ ff_v4l2_buffer_enqueue(avbuf);
++
++ ctx->flag_last = 1;
++ return AVERROR(EPIPE);
+ }
+- else
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+
+- return NULL;
++#ifdef V4L2_BUF_FLAG_LAST
++ // If flag_last set then this contains data but is the last frame
++ // so remember that but return OK
++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
++ ctx->flag_last = 1;
++#endif
+ }
+
+- /* 1. handle resolution changes */
+- if (pfd.revents & POLLPRI) {
+- ret = v4l2_handle_event(ctx);
+- if (ret < 0) {
+- /* if re-init failed, abort */
+- ctx->done = 1;
+- return NULL;
+- }
+- if (ret) {
+- /* if re-init was successful drop the buffer (if there was one)
+- * since we had to reconfigure capture (unmap all buffers)
+- */
+- return NULL;
++ *ppavbuf = avbuf;
++ return 0;
++}
++
++/**
++ * handle resolution change event and end of stream event
++ * Expects to be called after the stream has stopped
++ *
++ * returns 1 if reinit was successful, negative if it failed
++ * returns 0 if reinit was not executed
++ */
++static int
++get_event(V4L2m2mContext * const m)
++{
++ AVCodecContext * const avctx = m->avctx;
++ struct v4l2_event evt = { 0 };
++
++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
++ const int rv = AVERROR(errno);
++ if (rv == AVERROR(EINTR))
++ continue;
++ if (rv == AVERROR(EAGAIN)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
++ return AVERROR_EOF;
+ }
++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
++ return rv;
+ }
+
+- /* 2. dequeue the buffer */
+- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- /* there is a capture buffer ready */
+- if (pfd.revents & (POLLIN | POLLRDNORM))
+- goto dequeue;
++ if (evt.type == V4L2_EVENT_EOS) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
++ return AVERROR_EOF;
++ }
++
++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
++ return do_source_change(m);
++
++ return 0;
++}
++
++
++// Get a buffer
++// If output then just gets the buffer in the expected way
++// If capture then runs the capture state m/c to deal with res change etc.
++// If return value == 0 then *ppavbuf != NULL
++
++static int
++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
++{
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
++
++ const unsigned int poll_cap = (POLLIN | POLLRDNORM);
++ const unsigned int poll_out = (POLLOUT | POLLWRNORM);
++ const unsigned int poll_event = POLLPRI;
++
++ *ppavbuf = NULL;
+
+- /* the driver is ready to accept more input; instead of waiting for the capture
+- * buffer to complete we return NULL so input can proceed (we are single threaded)
+- */
+- if (pfd.revents & (POLLOUT | POLLWRNORM))
+- return NULL;
++ for (;;) {
++ struct pollfd pfd = {
++ .fd = m->fd,
++ // If capture && stream not started then assume we are waiting for the initial event
++ .events = !is_cap ? poll_out :
++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
++ poll_event,
++ };
++ int ret;
++
++ if (ctx->done) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
++ return AVERROR_EOF;
+ }
+
+-dequeue:
+- memset(&buf, 0, sizeof(buf));
+- buf.memory = V4L2_MEMORY_MMAP;
+- buf.type = ctx->type;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memset(planes, 0, sizeof(planes));
+- buf.length = VIDEO_MAX_PLANES;
+- buf.m.planes = planes;
++ // If capture && timeout == -1 then also wait for rx buffer free
++ if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
++ pfd.events |= poll_out;
++
++ // If nothing Qed all we will get is POLLERR - avoid that
++ if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
++ (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
++ (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
++ return AVERROR(ENOSPC);
+ }
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
+- if (ret) {
+- if (errno != EAGAIN) {
+- ctx->done = 1;
+- if (errno != EPIPE)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+- ctx->name, av_err2str(AVERROR(errno)));
++ // Timeout kludged s.t. "forever" eventually gives up & produces logging
++ // If waiting for an event when we have seen a last_frame then we expect
++ // it to be ready already so force a short timeout
++ ret = poll(&pfd, 1,
++ ff_v4l2_ctx_eos(ctx) ? 10 :
++ timeout == -1 ? 3000 : timeout);
++ if (ret < 0) {
++ ret = AVERROR(errno); // Remember errno before logging etc.
++ av_assert0(ret < 0);
++ }
++
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
++ ctx->name, ret, timeout, pfd.events, pfd.revents);
++
++ if (ret < 0) {
++ if (ret == AVERROR(EINTR))
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
++ return ret;
++ }
++
++ if (ret == 0) {
++ if (timeout == -1)
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
++ if (ff_v4l2_ctx_eos(ctx)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
++ ret = get_event(m);
++ if (ret < 0) {
++ ctx->done = 1;
++ return ret;
++ }
+ }
+- return NULL;
++ return AVERROR(EAGAIN);
+ }
+
+- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
+- buf.m.planes[0].bytesused : buf.bytesused;
+- if (bytesused == 0) {
++ if ((pfd.revents & POLLERR) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
++ return AVERROR_UNKNOWN;
++ }
++
++ if ((pfd.revents & poll_event) != 0) {
++ ret = get_event(m);
++ if (ret < 0) {
+ ctx->done = 1;
+- return NULL;
++ return ret;
+ }
+-#ifdef V4L2_BUF_FLAG_LAST
+- if (buf.flags & V4L2_BUF_FLAG_LAST)
+- ctx->done = 1;
+-#endif
++ continue;
++ }
++
++ if ((pfd.revents & poll_cap) != 0) {
++ ret = dq_buf(ctx, ppavbuf);
++ if (ret == AVERROR(EPIPE))
++ continue;
++ return ret;
+ }
+
+- avbuf = &ctx->buffers[buf.index];
+- avbuf->status = V4L2BUF_AVAILABLE;
+- avbuf->buf = buf;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memcpy(avbuf->planes, planes, sizeof(planes));
+- avbuf->buf.m.planes = avbuf->planes;
++ if ((pfd.revents & poll_out) != 0) {
++ if (is_cap)
++ return AVERROR(EAGAIN);
++ return dq_buf(ctx, ppavbuf);
+ }
+- return avbuf;
++
++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
++ return AVERROR_UNKNOWN;
+ }
++}
+
+- return NULL;
++// Clear out flags and timestamps that should be set by the user
++// Returns the passed avbuf
++static V4L2Buffer *
++clean_v4l2_buffer(V4L2Buffer * const avbuf)
++{
++ struct v4l2_buffer *const buf = &avbuf->buf;
++
++ buf->flags = 0;
++ buf->field = V4L2_FIELD_ANY;
++ buf->timestamp = (struct timeval){0};
++ buf->timecode = (struct v4l2_timecode){0};
++ buf->sequence = 0;
++
++ return avbuf;
+ }
+
+ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ {
+- int timeout = 0; /* return when no more buffers to dequeue */
+ int i;
+
+ /* get back as many output buffers as possible */
+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- do {
+- } while (v4l2_dequeue_v4l2buf(ctx, timeout));
++ V4L2Buffer * avbuf;
++ do {
++ get_qbuf(ctx, &avbuf, 0);
++ } while (avbuf);
+ }
+
+ for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
+- return &ctx->buffers[i];
++ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (avbuf->status == V4L2BUF_AVAILABLE)
++ return clean_v4l2_buffer(avbuf);
+ }
+
+ return NULL;
+@@ -452,25 +730,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(
+
+ static int v4l2_release_buffers(V4L2Context* ctx)
+ {
+- struct v4l2_requestbuffers req = {
+- .memory = V4L2_MEMORY_MMAP,
+- .type = ctx->type,
+- .count = 0, /* 0 -> unmaps buffers from the driver */
+- };
+- int i, j;
++ int i;
++ int ret = 0;
++ const int fd = ctx_to_m2mctx(ctx)->fd;
+
+- for (i = 0; i < ctx->num_buffers; i++) {
+- V4L2Buffer *buffer = &ctx->buffers[i];
++ // Orphan any buffers in the wild
++ ff_weak_link_break(&ctx->wl_master);
+
+- for (j = 0; j < buffer->num_planes; j++) {
+- struct V4L2Plane_info *p = &buffer->plane_info[j];
+- if (p->mm_addr && p->length)
+- if (munmap(p->mm_addr, p->length) < 0)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
++ if (ctx->bufrefs) {
++ for (i = 0; i < ctx->num_buffers; i++)
++ av_buffer_unref(ctx->bufrefs + i);
++ }
++
++ if (fd != -1) {
++ struct v4l2_requestbuffers req = {
++ .memory = V4L2_MEMORY_MMAP,
++ .type = ctx->type,
++ .count = 0, /* 0 -> unmap all buffers from the driver */
++ };
++
++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno == EINTR)
++ continue;
++
++ // Save errno as the return value before logging can clobber it
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
++ ctx->name, av_err2str(ret));
++ if (ctx_to_m2mctx(ctx)->output_drm)
++ av_log(logger(ctx), AV_LOG_ERROR,
++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
++ "for all buffers: \n"
++ " 1. drmModeRmFB(..)\n"
++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
++ break; // Only EINTR is worth retrying; anything else would loop forever
+ }
+ }
++ atomic_store(&ctx->q_count, 0);
+
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
++ return ret;
+ }
+
+ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
+@@ -499,6 +797,8 @@ static inline int v4l2_try_raw_format(V4
+
+ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+ {
++ V4L2m2mContext* s = ctx_to_m2mctx(ctx);
++ V4L2m2mPriv *priv = s->avctx->priv_data;
+ enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
+ struct v4l2_fmtdesc fdesc;
+ int ret;
+@@ -517,6 +817,13 @@ static int v4l2_get_raw_format(V4L2Conte
+ if (ret)
+ return AVERROR(EINVAL);
+
++ if (priv->pix_fmt != AV_PIX_FMT_NONE) {
++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
++ fdesc.index++;
++ continue;
++ }
++ }
++
+ pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
+ ret = v4l2_try_raw_format(ctx, pixfmt);
+ if (ret){
+@@ -569,30 +876,99 @@ static int v4l2_get_coded_format(V4L2Con
+ *
+ *****************************************************************************/
+
++
++static void flush_all_buffers_status(V4L2Context* const ctx)
++{
++ int i;
++
++ if (!ctx->bufrefs)
++ return;
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_IN_DRIVER)
++ ff_v4l2_buffer_set_avail(buf);
++ }
++ atomic_store(&ctx->q_count, 0);
++}
++
++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
++{
++ int i;
++ int rv;
++
++ if (!ctx->bufrefs) {
++ rv = ff_v4l2_context_init(ctx);
++ if (rv) {
++ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
++ return rv;
++ }
++ }
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_AVAILABLE) {
++ rv = ff_v4l2_buffer_enqueue(buf);
++ if (rv < 0)
++ return rv;
++ }
++ }
++ return 0;
++}
++
+ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
+ {
+ int type = ctx->type;
+- int ret;
++ int ret = 0;
++ AVCodecContext * const avctx = logger(ctx);
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+- if (ret < 0)
+- return AVERROR(errno);
++ // Avoid doing anything if there is nothing we can do
++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
++ return 0;
+
+- ctx->streamon = (cmd == VIDIOC_STREAMON);
++ ff_mutex_lock(&ctx->lock);
+
+- return 0;
++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
++ stuff_all_buffers(avctx, ctx);
++
++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
++ const int err = errno;
++ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
++ ret = AVERROR(err);
++ }
++ else
++ {
++ if (cmd == VIDIOC_STREAMOFF)
++ flush_all_buffers_status(ctx);
++ else
++ ctx->first_buf = 1;
++
++ ctx->streamon = (cmd == VIDIOC_STREAMON);
++ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
++ }
++
++ // Both stream off & on effectively clear flag_last
++ ctx->flag_last = 0;
++
++ ff_mutex_unlock(&ctx->lock);
++
++ return ret;
+ }
+
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
++ int64_t track_ts;
+ V4L2Buffer* avbuf;
+ int ret;
+
+ if (!frame) {
+ ret = v4l2_stop_encode(ctx);
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+ s->draining= 1;
+ return 0;
+ }
+@@ -601,23 +977,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Co
+ if (!avbuf)
+ return AVERROR(ENOMEM);
+
+- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
++
++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
+ if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+ }
+
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
++ const void * extdata, size_t extlen)
+ {
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer* avbuf;
+ int ret;
++ int64_t track_ts;
+
+ if (!pkt->size) {
+ ret = v4l2_stop_decode(ctx);
++ // Log but otherwise ignore stop failure
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
+ s->draining = 1;
+ return 0;
+ }
+@@ -626,8 +1008,13 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+ if (!avbuf)
+ return AVERROR(EAGAIN);
+
+- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+- if (ret)
++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
++
++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
++ if (ret == AVERROR(ENOMEM))
++ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
++ __func__, pkt->size, avbuf->planes[0].length);
++ else if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+@@ -635,42 +1022,36 @@ int ff_v4l2_context_enqueue_packet(V4L2C
+
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * timeout=-1 blocks until:
+- * 1. decoded frame available
+- * 2. an input buffer is ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
+-
+- return AVERROR(EAGAIN);
+- }
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++ return rv;
++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
+
+- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
++ return 0;
+ }
+
+ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * blocks until:
+- * 1. encoded packet available
+- * 2. an input buffer ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
+
+- return AVERROR(EAGAIN);
+- }
+-
+- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
++ return 0;
+ }
+
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+@@ -702,78 +1083,160 @@ int ff_v4l2_context_get_format(V4L2Conte
+
+ int ff_v4l2_context_set_format(V4L2Context* ctx)
+ {
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ int ret;
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ if (ret != 0)
++ return ret;
++
++ // Check returned size against min size and if smaller have another go
++ // Only worry about plane[0] as this is meant to enforce limits for
++ // encoded streams where we might know a bit more about the shape
++ // than the driver
++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
++ return 0;
++ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
++ }
++ else {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
++ return 0;
++ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
++ }
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ return ret;
+ }
+
+ void ff_v4l2_context_release(V4L2Context* ctx)
+ {
+ int ret;
+
+- if (!ctx->buffers)
++ if (!ctx->bufrefs)
+ return;
+
+ ret = v4l2_release_buffers(ctx);
+ if (ret)
+ av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
+
+- av_freep(&ctx->buffers);
++ av_freep(&ctx->bufrefs);
++ av_buffer_unref(&ctx->frames_ref);
++
++ ff_mutex_destroy(&ctx->lock);
++ pthread_cond_destroy(&ctx->cond);
+ }
+
+-int ff_v4l2_context_init(V4L2Context* ctx)
++
++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ struct v4l2_requestbuffers req;
+- int ret, i;
+-
+- if (!v4l2_type_supported(ctx)) {
+- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+- return AVERROR_PATCHWELCOME;
+- }
++ int ret;
++ int i;
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+- if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
++ av_assert0(ctx->bufrefs == NULL);
+
+ memset(&req, 0, sizeof(req));
+- req.count = ctx->num_buffers;
+- req.memory = V4L2_MEMORY_MMAP;
++ req.count = req_buffers;
++ req.memory = mem;
+ req.type = ctx->type;
+- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
+- return AVERROR(errno);
++ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno != EINTR) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
++ return ret;
++ }
+ }
+
+ ctx->num_buffers = req.count;
+- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
+- if (!ctx->buffers) {
++ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
++ if (!ctx->bufrefs) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
+- return AVERROR(ENOMEM);
++ ret = AVERROR(ENOMEM); // ret is still 0 from the successful REQBUFS above
++ goto fail_release;
+ }
+
+- for (i = 0; i < req.count; i++) {
+- ctx->buffers[i].context = ctx;
+- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
+- if (ret < 0) {
++ ctx->wl_master = ff_weak_link_new(ctx);
++ if (!ctx->wl_master) {
++ ret = AVERROR(ENOMEM);
++ goto fail_release;
++ }
++ for (i = 0; i < ctx->num_buffers; i++) {
++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
++ if (ret) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
+- goto error;
++ goto fail_release;
+ }
+ }
+
+ av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
+ req.count,
+- v4l2_get_width(&ctx->format),
+- v4l2_get_height(&ctx->format),
++ ff_v4l2_get_format_width(&ctx->format),
++ ff_v4l2_get_format_height(&ctx->format),
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
+
+ return 0;
+
+-error:
++fail_release:
+ v4l2_release_buffers(ctx);
++ av_freep(&ctx->bufrefs);
++ return ret;
++}
++
++int ff_v4l2_context_init(V4L2Context* ctx)
++{
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ int ret;
++
++ // It is not valid to reinit a context without a previous release
++ av_assert0(ctx->bufrefs == NULL);
+
+- av_freep(&ctx->buffers);
++ if (!v4l2_type_supported(ctx)) {
++ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ ff_mutex_init(&ctx->lock, NULL);
++ pthread_cond_init(&ctx->cond, NULL);
++ atomic_init(&ctx->q_count, 0);
++ if (s->output_drm) {
++ AVHWFramesContext *hwframes;
++
++ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
++ if (!ctx->frames_ref) {
++ ret = AVERROR(ENOMEM);
++ goto fail_unlock;
++ }
++
++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
++ hwframes->format = AV_PIX_FMT_DRM_PRIME;
++ hwframes->sw_format = ctx->av_pix_fmt;
++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
++ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
++ ret = av_hwframe_ctx_init(ctx->frames_ref);
++ if (ret < 0)
++ goto fail_unref_hwframes;
++ }
++
++ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
++ if (ret) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
++ goto fail_unref_hwframes;
++ }
++
++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
++ if (ret < 0)
++ goto fail_unref_hwframes;
++
++ return 0;
+
++fail_unref_hwframes:
++ av_buffer_unref(&ctx->frames_ref);
++fail_unlock:
++ pthread_cond_destroy(&ctx->cond); // release skips a ctx with no bufrefs, so undo the cond init here
++ ff_mutex_destroy(&ctx->lock);
+ return ret;
+ }
+--- a/libavcodec/v4l2_context.h
++++ b/libavcodec/v4l2_context.h
+@@ -31,6 +31,7 @@
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/frame.h"
+ #include "libavutil/buffer.h"
++#include "libavutil/thread.h"
+ #include "v4l2_buffers.h"
+
+ typedef struct V4L2Context {
+@@ -70,11 +71,18 @@ typedef struct V4L2Context {
+ */
+ int width, height;
+ AVRational sample_aspect_ratio;
++ struct v4l2_rect selection;
+
+ /**
+- * Indexed array of V4L2Buffers
++ * If the default size of buffer is less than this then try to
++ * set to this.
+ */
+- V4L2Buffer *buffers;
++ uint32_t min_buf_size;
++
++ /**
++ * Indexed array of pointers to V4L2Buffers
++ */
++ AVBufferRef **bufrefs;
+
+ /**
+ * Readonly after init.
+@@ -82,16 +90,38 @@ typedef struct V4L2Context {
+ int num_buffers;
+
+ /**
++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
++ */
++ enum v4l2_memory buf_mem;
++
++ /**
+ * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+ */
+ int streamon;
+
++ /* 1st buffer after stream on */
++ int first_buf;
++
+ /**
+ * Either no more buffers available or an unrecoverable error was notified
+ * by the V4L2 kernel driver: once set the context has to be exited.
+ */
+ int done;
+
++ int flag_last;
++
++ /**
++ * If NZ then when Qing frame/pkt use this rather than the
++ * "real" PTS
++ */
++ uint64_t track_ts;
++
++ AVBufferRef *frames_ref;
++ atomic_int q_count;
++ struct ff_weak_link_master *wl_master;
++
++ AVMutex lock;
++ pthread_cond_t cond;
+ } V4L2Context;
+
+ /**
+@@ -156,7 +186,10 @@ int ff_v4l2_context_dequeue_packet(V4L2C
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[inout] f The AVFrame to dequeue to.
+ * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
++ *
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ * AVERROR(ENOSPC) if no buffer available to put
++ * the frame in
+ */
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+
+@@ -170,7 +203,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Co
+ * @param[in] pkt A pointer to an AVPacket.
+ * @return 0 in case of success, a negative error otherwise.
+ */
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
+
+ /**
+ * Enqueues a buffer to a V4L2Context from an AVFrame
+--- a/libavcodec/v4l2_m2m.c
++++ b/libavcodec/v4l2_m2m.c
+@@ -36,6 +36,14 @@
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
+
++static void
++xlat_init(xlat_track_t * const x)
++{
++ memset(x, 0, sizeof(*x));
++ x->last_pts = AV_NOPTS_VALUE;
++}
++
++
+ static inline int v4l2_splane_video(struct v4l2_capability *cap)
+ {
+ if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
+@@ -68,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2m
+
+ s->capture.done = s->output.done = 0;
+ s->capture.name = "capture";
++ s->capture.buf_mem = V4L2_MEMORY_MMAP;
+ s->output.name = "output";
++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+ atomic_init(&s->refcount, 0);
+ sem_init(&s->refsync, 0, 0);
+
+@@ -215,13 +225,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+ av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+
+ /* 2. unmap the capture buffers (v4l2 and ffmpeg):
+- * we must wait for all references to be released before being allowed
+- * to queue new buffers.
+ */
+- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
+- if (atomic_load(&s->refcount))
+- while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+-
+ ff_v4l2_context_release(&s->capture);
+
+ /* 3. get the new capture format */
+@@ -240,7 +244,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+
+ /* 5. complete reinit */
+ s->draining = 0;
+- s->reinit = 0;
+
+ return 0;
+ }
+@@ -274,7 +277,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2
+
+ /* start again now that we know the stream dimensions */
+ s->draining = 0;
+- s->reinit = 0;
+
+ ret = ff_v4l2_context_get_format(&s->output, 0);
+ if (ret) {
+@@ -328,7 +330,13 @@ static void v4l2_m2m_destroy_context(voi
+ ff_v4l2_context_release(&s->capture);
+ sem_destroy(&s->refsync);
+
+- close(s->fd);
++ if (s->fd != -1)
++ close(s->fd);
++
++ av_packet_unref(&s->buf_pkt);
++ av_freep(&s->extdata_data);
++
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
+
+ av_free(s);
+ }
+@@ -338,17 +346,34 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *p
+ V4L2m2mContext *s = priv->context;
+ int ret;
+
+- ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+- if (ret)
+- av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
++ if (!s)
++ return 0;
+
+- ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
+- if (ret)
+- av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
++
++ if (av_codec_is_decoder(s->avctx->codec))
++ av_packet_unref(&s->buf_pkt);
++
++ if (s->fd >= 0) {
++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
++
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name);
++ }
+
+ ff_v4l2_context_release(&s->output);
+
++ close(s->fd);
++ s->fd = -1;
++
+ s->self_ref = NULL;
++ // This is only called on avctx close so after this point we don't have that
++ // Crash sooner if we find we are using it (can still log with avctx = NULL)
++ s->avctx = NULL;
++ priv->context = NULL;
+ av_buffer_unref(&priv->context_ref);
+
+ return 0;
+@@ -392,28 +417,33 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *
+ return v4l2_configure_contexts(s);
+ }
+
+-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
+ {
+- *s = av_mallocz(sizeof(V4L2m2mContext));
+- if (!*s)
++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
++
++ *pps = NULL;
++ if (!s)
+ return AVERROR(ENOMEM);
+
+- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
+ &v4l2_m2m_destroy_context, NULL, 0);
+ if (!priv->context_ref) {
+- av_freep(s);
++ av_free(s);
+ return AVERROR(ENOMEM);
+ }
+
+ /* assign the context */
+- priv->context = *s;
+- (*s)->priv = priv;
++ priv->context = s;
++ s->priv = priv;
+
+ /* populate it */
+- priv->context->capture.num_buffers = priv->num_capture_buffers;
+- priv->context->output.num_buffers = priv->num_output_buffers;
+- priv->context->self_ref = priv->context_ref;
+- priv->context->fd = -1;
++ s->capture.num_buffers = priv->num_capture_buffers;
++ s->output.num_buffers = priv->num_output_buffers;
++ s->self_ref = priv->context_ref;
++ s->fd = -1;
++
++ xlat_init(&s->xlat);
+
++ *pps = s;
+ return 0;
+ }
+--- a/libavcodec/v4l2_m2m.h
++++ b/libavcodec/v4l2_m2m.h
+@@ -30,6 +30,7 @@
+ #include <linux/videodev2.h>
+
+ #include "libavcodec/avcodec.h"
++#include "libavutil/pixfmt.h"
+ #include "v4l2_context.h"
+
+ #define container_of(ptr, type, member) ({ \
+@@ -38,7 +39,37 @@
+
+ #define V4L_M2M_DEFAULT_OPTS \
+ { "num_output_buffers", "Number of buffers in the output context",\
+- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
++
++#define FF_V4L2_M2M_TRACK_SIZE 128
++typedef struct V4L2m2mTrackEl {
++ int discard; // If we see this buffer its been flushed, so discard
++ int pending;
++ int pkt_size;
++ int64_t pts;
++ int64_t dts;
++ int64_t reordered_opaque;
++ int64_t pkt_pos;
++ int64_t pkt_duration;
++ int64_t track_pts;
++} V4L2m2mTrackEl;
++
++typedef struct pts_stats_s
++{
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count;
++ unsigned int last_interval;
++ int64_t last_pts;
++ int64_t guess;
++} pts_stats_t;
++
++typedef struct xlat_track_s {
++ unsigned int track_no;
++ int64_t last_pts;
++ int64_t last_opaque;
++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
++} xlat_track_t;
+
+ typedef struct V4L2m2mContext {
+ char devname[PATH_MAX];
+@@ -52,7 +83,6 @@ typedef struct V4L2m2mContext {
+ AVCodecContext *avctx;
+ sem_t refsync;
+ atomic_uint refcount;
+- int reinit;
+
+ /* null frame/packet received */
+ int draining;
+@@ -63,6 +93,36 @@ typedef struct V4L2m2mContext {
+
+ /* reference back to V4L2m2mPriv */
+ void *priv;
++
++ AVBufferRef *device_ref;
++
++ /* generate DRM frames */
++ int output_drm;
++
++ /* input frames are drmprime */
++ int input_drm;
++
++ /* Frame tracking */
++ xlat_track_t xlat;
++ int pending_hw;
++ int pending_n;
++
++ pts_stats_t pts_stat;
++
++ /* req pkt */
++ int req_pkt;
++
++ /* Ext data sent */
++ int extdata_sent;
++ /* Ext data sent in packet - overrides ctx */
++ uint8_t * extdata_data;
++ size_t extdata_size;
++
++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
++ /* Quirks */
++ unsigned int quirks;
++
+ } V4L2m2mContext;
+
+ typedef struct V4L2m2mPriv {
+@@ -73,6 +133,7 @@ typedef struct V4L2m2mPriv {
+
+ int num_output_buffers;
+ int num_capture_buffers;
++ enum AVPixelFormat pix_fmt;
+ } V4L2m2mPriv;
+
+ /**
+@@ -126,4 +187,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mCont
+ */
+ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
++
++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
++{ // Returns the width from pix_mp for multiplanar buffer types, from pix otherwise
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++}
++
++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
++{ // Returns the height from pix_mp for multiplanar buffer types, from pix otherwise
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++}
++
++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
++{ // Returns the V4L2 fourcc from pix_mp for multiplanar buffer types, from pix otherwise
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
++{ // Returns the context's flag_last value; v4l2_decode_flush treats non-zero as a pending EOS
++ return ctx->flag_last;
++}
++
++
+ #endif /* AVCODEC_V4L2_M2M_H */
+--- a/libavcodec/v4l2_m2m_dec.c
++++ b/libavcodec/v4l2_m2m_dec.c
+@@ -23,6 +23,10 @@
+
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/pixdesc.h"
+ #include "libavutil/opt.h"
+@@ -30,75 +34,111 @@
+ #include "libavcodec/decode.h"
+ #include "libavcodec/internal.h"
+
++#include "libavcodec/hwaccels.h"
++#include "libavcodec/internal.h"
++#include "libavcodec/hwconfig.h"
++
+ #include "v4l2_context.h"
+ #include "v4l2_m2m.h"
+ #include "v4l2_fmt.h"
+
+-static int v4l2_try_start(AVCodecContext *avctx)
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++
++#ifndef FF_API_BUFFER_SIZE_T
++#define FF_API_BUFFER_SIZE_T 1
++#endif
++
++static int64_t pts_stats_guess(const pts_stats_t * const stats) // Best-guess PTS for the current frame, or AV_NOPTS_VALUE
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+- struct v4l2_selection selection = { 0 };
+- int ret;
++ if (stats->last_pts == AV_NOPTS_VALUE || // No reference PTS seen yet
++ stats->last_interval == 0 || // No usable per-frame interval estimate
++ stats->last_count >= STATS_LAST_COUNT_MAX) // Too many frames since the reference PTS to trust it
++ return AV_NOPTS_VALUE;
++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval; // last_count is 1 for the frame that carried last_pts
++}
+
+- /* 1. start the output process */
+- if (!output->streamon) {
+- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+- if (ret < 0) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+- return ret;
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts) // Feed the PTS of one output frame into the statistics
++{
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { // No new timing info: just count the frame
++ if (stats->last_count < STATS_LAST_COUNT_MAX) // Saturate rather than wrap
++ ++stats->last_count;
++ return;
++ }
++
++ if (stats->last_pts != AV_NOPTS_VALUE) { // We have a previous PTS so an interval can be derived
++ const int64_t interval = pts - stats->last_pts;
++
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX || // Backwards or implausibly large jump
++ stats->last_count >= STATS_LAST_COUNT_MAX) { // or too many frames in between to be meaningful
++ if (stats->last_interval != 0) // Only log on the transition from a known interval to unknown
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0;
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count; // Spread the PTS delta over the frames counted since last_pts
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
+ }
+ }
+
+- if (capture->streamon)
++ stats->last_pts = pts; // This frame becomes the new reference
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name) // Reset stats to "nothing seen yet"
++{
++ *stats = (pts_stats_t){ // Members not named here (guess) are zero-initialised by the compound literal
++ .logctx = logctx,
++ .name = name,
++ .last_count = 1, // Kept >= 1 because pts_stats_add divides by it
++ .last_interval = 0, // 0 means interval unknown
++ .last_pts = AV_NOPTS_VALUE
++ };
++}
++
++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) // STREAMON the output context if needed; returns 0 or a set_status error
++{
++ int ret;
++ struct v4l2_decoder_cmd cmd = {
++ .cmd = V4L2_DEC_CMD_START,
++ .flags = 0,
++ };
++
++ if (s->output.streamon) // Already streaming - nothing to do
+ return 0;
+
+- /* 2. get the capture format */
+- capture->format.type = capture->type;
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
++ if (ret != 0) {
++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+ return ret;
+ }
+
+- /* 2.1 update the AVCodecContext */
+- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+- capture->av_pix_fmt = avctx->pix_fmt;
+-
+- /* 3. set the crop parameters */
+- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+- selection.r.height = avctx->coded_height;
+- selection.r.width = avctx->coded_width;
+- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+- if (!ret) {
+- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+- } else {
+- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+- /* update the size of the resulting frame */
+- capture->height = selection.r.height;
+- capture->width = selection.r.width;
+- }
++ // STREAMON should do implicit START so this just for those that don't.
++ // It is optional so don't worry if it fails
++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
++ ret = AVERROR(errno); // Only used for the log message; the function still returns 0 below
++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
+ }
+-
+- /* 4. init the capture context now that we have the capture format */
+- if (!capture->buffers) {
+- ret = ff_v4l2_context_init(capture);
+- if (ret) {
+- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+- return AVERROR(ENOMEM);
+- }
++ else {
++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
+ }
++ return 0;
++}
+
+- /* 5. start the capture process */
+- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+- if (ret) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
+- return ret;
+- }
++static int v4l2_try_start(AVCodecContext *avctx) // Ensure the output (coded data) queue is streaming; returns 0 or an error code
++{
++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int ret;
+
++ /* 1. start the output process */
++ if ((ret = check_output_streamon(avctx, s)) != 0) // Capture-side setup is no longer done in this function
++ return ret;
+ return 0;
+ }
+
+@@ -133,52 +173,525 @@ static int v4l2_prepare_decoder(V4L2m2mC
+ return 0;
+ }
+
+-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++static void // Fills in best_effort_timestamp and pkt_dts (and pkt_pts where it still exists) on a decoded frame
++set_best_effort_pts(AVCodecContext *const avctx,
++ pts_stats_t * const ps,
++ AVFrame *const frame)
++{
++ pts_stats_add(ps, frame->pts); // Update the statistics first so the guess below includes this frame
++
++#if FF_API_PKT_PTS
++FF_DISABLE_DEPRECATION_WARNINGS
++ frame->pkt_pts = frame->pts;
++FF_ENABLE_DEPRECATION_WARNINGS
++#endif
++ frame->best_effort_timestamp = pts_stats_guess(ps);
++ // If we can't guess from just PTS - try DTS
++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
++ frame->best_effort_timestamp = frame->pkt_dts;
++
++ // We can't emulate what s/w does in a useful manner and using the
++ // "correct" answer seems to just confuse things.
++ frame->pkt_dts = frame->pts; // Overwritten only after pkt_dts was used as the fallback above
++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
++}
++
++static void // Marks every tracking slot as not pending and to-be-discarded, and forgets last_pts
++xlat_flush(xlat_track_t * const x)
++{
++ unsigned int i;
++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
++ x->track_els[i].pending = 0; // No longer counted by xlat_pending
++ x->track_els[i].discard = 1; // A frame that still turns up for this slot should be dropped
++ }
++ x->last_pts = AV_NOPTS_VALUE;
++}
++
++static int // Estimates how many tracked frames are pending and displayable; the result may be negative
++xlat_pending(const xlat_track_t * const x)
++{
++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE; // Start at the current track_no slot
++ unsigned int i;
++ int r = 0;
++ int64_t now = AV_NOPTS_VALUE; // DTS of the first pending slot met on the scan (if it has one)
++
++ for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) { // Walk back over 32 slots; unsigned wrap of n - 1 at 0 gives the last index as the size divides 2^32
++ const V4L2m2mTrackEl * const t = x->track_els + n;
++
++ if (!t->pending)
++ continue;
++
++ if (now == AV_NOPTS_VALUE)
++ now = t->dts;
++
++ if (t->pts == AV_NOPTS_VALUE || // No PTS: always count it
++ ((now == AV_NOPTS_VALUE || t->pts <= now) && // Due by "now" (or "now" unknown) ...
++ (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts))) // ... and later than last_pts (or last_pts unknown)
++ ++r;
++ }
++
++ // If we never get any ideas about PTS vs DTS allow a lot more buffer
++ if (now == AV_NOPTS_VALUE)
++ r -= 16;
++
++ return r;
++}
++
++static inline int stream_started(const V4L2m2mContext * const s) { // Non-zero once the output (coded data) context is streaming
++ return s->output.streamon;
++}
++
++#define NQ_OK 0
++#define NQ_Q_FULL 1
++#define NQ_SRC_EMPTY 2
++#define NQ_NONE 3
++#define NQ_DRAINING 4
++#define NQ_DEAD 5
++
++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
++
++// do_not_get If true then no new packet will be got but status will
++// be set appropriately
++
++// AVERROR_EOF Flushing an already flushed stream
++// -ve Error (all errors except EOF are unexpected)
++// NQ_OK (0) OK
++// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
++// NQ_SRC_EMPTY Src empty (do not retry)
++// NQ_NONE Enqueue not attempted
++// NQ_DRAINING At EOS, dQ dest until EOS there too
++// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
++
++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get) // Get (unless do_not_get) and queue one coded packet; returns an NQ_* status or a negative error
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+- AVPacket avpkt = {0};
+ int ret;
+
+- if (s->buf_pkt.size) {
+- avpkt = s->buf_pkt;
+- memset(&s->buf_pkt, 0, sizeof(AVPacket));
+- } else {
+- ret = ff_decode_get_packet(avctx, &avpkt);
+- if (ret < 0 && ret != AVERROR_EOF)
++ // If we don't already have a coded packet - get a new one
++ // We will already have a coded pkt if the output Q was full last time we
++ // tried to Q it
++ if (!s->buf_pkt.size && !do_not_get) {
++ unsigned int i;
++
++ for (i = 0; i < 256; ++i) { // Bounded retry over packets that carry side data but no payload
++ uint8_t * side_data;
++#if FF_API_BUFFER_SIZE_T
++ int side_size;
++#else
++ size_t side_size;
++#endif
++ ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++ if (ret != 0)
++ break;
++
++ // New extradata is the only side-data we understand
++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
++ if (side_data) {
++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
++ av_freep(&s->extdata_data);
++ if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) { // Allocate at least 1 byte so a zero-size request never reaches av_malloc
++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d bytes of extra data\n", (int)side_size);
++ return AVERROR(ENOMEM);
++ }
++ memcpy(s->extdata_data, side_data, side_size);
++ s->extdata_size = side_size;
++ s->extdata_sent = 0; // Forces the new extradata to accompany the next queued packet
++ }
++
++ if (s->buf_pkt.size != 0) // Got a packet with payload - done fetching
++ break;
++
++ if (s->buf_pkt.side_data_elems == 0) {
++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
++ ret = AVERROR_EOF;
++ break;
++ }
++
++ // Retry a side-data only pkt
++ }
++ // If i >= 256 something has gone wrong
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
++ return AVERROR(EIO);
++ }
++
++ if (ret == AVERROR(EAGAIN)) { // Caller has no packet for us right now
++ if (!stream_started(s)) {
++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
++ return NQ_DEAD;
++ }
++ return NQ_SRC_EMPTY;
++ }
++
++ if (ret == AVERROR_EOF) {
++ // EOF - enter drain mode
++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
++ ret, s->buf_pkt.size, stream_started(s), s->draining);
++ if (!stream_started(s)) { // Nothing was ever queued (or we were flushed) so there is nothing to drain
++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
++ s->draining = 1;
++ s->capture.done = 1;
++ return AVERROR_EOF;
++ }
++
++ if (!s->draining) {
++ // Calling enqueue with an empty pkt starts drain
++ av_assert0(s->buf_pkt.size == 0);
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ if (ret) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
++ return ret;
++ }
++ }
++ return NQ_DRAINING;
++ }
++
++ if (ret < 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
+ return ret;
++ }
+ }
+
+- if (s->draining)
+- goto dequeue;
++ if (s->draining) {
++ if (s->buf_pkt.size) {
++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
++ av_packet_unref(&s->buf_pkt);
++ }
++ return NQ_DRAINING;
++ }
+
+- ret = ff_v4l2_context_enqueue_packet(output, &avpkt);
+- if (ret < 0) {
+- if (ret != AVERROR(EAGAIN))
+- return ret;
++ if (!s->buf_pkt.size) // Reached when do_not_get was set and no packet was held over
++ return NQ_NONE;
+
+- s->buf_pkt = avpkt;
+- /* no input buffers available, continue dequeing */
+- }
++ if ((ret = check_output_streamon(avctx, s)) != 0)
++ return ret;
++
++ if (s->extdata_sent) // Extradata already delivered: send the packet alone
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ else if (s->extdata_data) // Extradata received as packet side data overrides the codec context's
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
++ else
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
++
++ if (ret == AVERROR(EAGAIN)) {
++ // Out of input buffers - keep packet
++ ret = NQ_Q_FULL;
++ }
++ else {
++ // In all other cases we are done with this packet
++ av_packet_unref(&s->buf_pkt);
++ s->extdata_sent = 1;
+
+- if (avpkt.size) {
+- ret = v4l2_try_start(avctx);
+ if (ret) {
+- av_packet_unref(&avpkt);
++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
++ return ret;
++ }
++ }
+
+- /* cant recover */
+- if (ret == AVERROR(ENOMEM))
+- return ret;
++ // Start if we haven't
++ {
++ const int ret2 = v4l2_try_start(avctx);
++ if (ret2) {
++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
++ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD; // Only ENOMEM is passed up as an error; anything else is reported as NQ_DEAD
++ }
++ }
++
++ return ret;
++}
++
++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) // Wait until ctx->q_count is non-zero or ctx stops streaming; returns 0 or an AVERROR code
++{
++ int rv = 0;
+
+- return 0;
++ ff_mutex_lock(&ctx->lock);
++
++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { // Re-check the condition after every wakeup
++ if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) {
++ rv = AVERROR(rv); // pthread_cond_wait returns the error number itself; it does not set errno
++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
++ break;
+ }
+ }
+
+-dequeue:
+- if (!s->buf_pkt.size)
+- av_packet_unref(&avpkt);
+- return ff_v4l2_context_dequeue_frame(capture, frame, -1);
++ ff_mutex_unlock(&ctx->lock);
++ return rv;
++}
++
++// Number of frames over what xlat_pending returns that we keep *16
++// This is a min value - if it appears to be too small the threshold should
++// adjust dynamically.
++#define PENDING_HW_MIN (3 * 16)
++// Offset to use when setting dynamically
++// Set to %16 == 15 to avoid the threshold changing immediately as we relax
++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1)
++// Number of consecutive times we've failed to get a frame when we prefer it
++// before we increase the prefer threshold (5ms * N = max expected decode
++// time)
++#define PENDING_N_THRESHOLD 6
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int src_rv = NQ_OK;
++ int dst_rv = 1; // Non-zero (done), non-negative (error) number
++ unsigned int i = 0;
++
++ do {
++ const int pending = xlat_pending(&s->xlat);
++ const int prefer_dq = (pending > s->pending_hw / 16);
++ const int last_src_rv = src_rv;
++
++ // Enqueue another pkt for decode if
++ // (a) We don't have a lot of stuff in the buffer already OR
++ // (b) ... we (think we) do but we've failed to get a frame already OR
++ // (c) We've dequeued a lot of frames without asking for input
++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
++
++ // If we got a frame last time or we've already tried to get a frame and
++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
++ // indicating that we want more input.
++ // This should mean that once decode starts we enter a stable state where
++ // we alternately ask for input and produce output
++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
++ break;
++
++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
++ break;
++ }
++
++ // Try to get a new frame if
++ // (a) we haven't already got one AND
++ // (b) enqueue returned a status indicating that decode should be attempted
++ if (dst_rv != 0 && TRY_DQ(src_rv)) {
++ // Pick a timeout depending on state
++ const int t =
++ src_rv == NQ_DRAINING ? 300 :
++ prefer_dq ? 5 :
++ src_rv == NQ_Q_FULL ? -1 : 0;
++
++ // Dequeue frame will unref any previous contents of frame
++ // if it returns success so we don't need an explicit unref
++ // when discarding
++ // This returns AVERROR(EAGAIN) on timeout or if
++ // there is room in the input Q and timeout == -1
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++
++ // Failure due to no buffer in Q?
++ if (dst_rv == AVERROR(ENOSPC)) {
++ // Wait & retry
++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++ }
++ }
++
++ // Adjust dynamic pending threshold
++ if (dst_rv == 0) {
++ if (--s->pending_hw < PENDING_HW_MIN)
++ s->pending_hw = PENDING_HW_MIN;
++ s->pending_n = 0;
++
++ set_best_effort_pts(avctx, &s->pts_stat, frame);
++ }
++ else if (dst_rv == AVERROR(EAGAIN)) {
++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
++ s->pending_n = 0;
++ }
++ }
++
++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
++ dst_rv = AVERROR_EOF;
++ s->capture.done = 1;
++ }
++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
++ s->draining, s->capture.done);
++ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
++ s->draining, s->capture.done, dst_rv);
++ }
++
++ ++i;
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
++ src_rv = AVERROR(EIO);
++ }
++
++ // Continue trying to enqueue packets if either
++ // (a) we succeeded last time OR
++ // (b) we didn't ret a frame and we can retry the input
++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
++
++ // Ensure that the frame contains nothing if we aren't returning a frame
++ // (might happen when discarding)
++ if (dst_rv)
++ av_frame_unref(frame);
++
++ // If we got a frame this time ask for a pkt next time
++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
++
++#if 0
++ if (dst_rv == 0)
++ {
++ static int z = 0;
++ if (++z > 50) {
++ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
++ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ return -1;
++ }
++ }
++#endif
++
++ return dst_rv == 0 ? 0 :
++ src_rv < 0 ? src_rv :
++ dst_rv < 0 ? dst_rv :
++ AVERROR(EAGAIN);
++}
++
++#if 0
++#include <time.h>
++static int64_t us_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ int ret;
++ const int64_t now = us_time();
++ int64_t done;
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ ret = v4l2_receive_frame2(avctx, frame);
++ done = us_time();
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
++ return ret;
++}
++#endif
++
++static int // Returns 0 if the coded size is in the driver's frame size enumeration (or cannot be checked), else an AVERROR code
++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ unsigned int i;
++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
++ const uint32_t w = avctx->coded_width;
++ const uint32_t h = avctx->coded_height;
++
++ if (w == 0 || h == 0 || fcc == 0) { // Nothing known to check yet: accept
++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) { // Driver flagged in get_quirks: skip the check
++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++
++ for (i = 0;; ++i) { // Walk the enumeration until a match or the driver reports the end (EINVAL)
++ struct v4l2_frmsizeenum fs = {
++ .index = i,
++ .pixel_format = fcc,
++ };
++
++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
++ const int err = AVERROR(errno);
++ if (err == AVERROR(EINTR)) // Interrupted: retry the ioctl
++ continue;
++ if (i == 0 && err == AVERROR(ENOTTY)) { // ioctl not implemented: cannot check, so accept
++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
++ return 0;
++ }
++ if (err != AVERROR(EINVAL)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err));
++ return err;
++ }
++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
++ w, h, av_fourcc2str(fcc), i);
++ return err;
++ }
++
++ switch (fs.type) {
++ case V4L2_FRMSIZE_TYPE_DISCRETE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
++ fs.discrete.width,fs.discrete.height);
++ if (w == fs.discrete.width && h == fs.discrete.height)
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_STEPWISE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width && // A zero step from the driver is treated as "any size in range" to avoid dividing by zero
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
++ (fs.stepwise.step_width == 0 || (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0) &&
++ (fs.stepwise.step_height == 0 || (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0))
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_CONTINUOUS:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
++ return 0;
++ break;
++ default:
++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type);
++ return AVERROR(EINVAL);
++ }
++ }
++}
++
++static int // Queries the driver capabilities and sets s->quirks for known drivers; returns 0 or an AVERROR code
++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ struct v4l2_capability cap;
++
++ memset(&cap, 0, sizeof(cap));
++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
++ int err = errno; // Capture errno before any further calls can overwrite it
++ if (err == EINTR) // Interrupted: retry the ioctl
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
++ return AVERROR(err);
++ }
++
++ // Could be made table driven if we have a few more but right now there
++ // seems no point
++
++ // Meson (amlogic) always gives a resolution changed event after output
++ // streamon and userspace must (re)allocate capture buffers and streamon
++ // capture to clear the event even if the capture buffers were the right
++ // size in the first place.
++ if (strcmp(cap.driver, "meson-vdec") == 0)
++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN; // The second flag makes check_size skip the frame size enumeration
++
++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
++ return 0;
++}
++
++// This heuristic is for H264 but use for everything
++static uint32_t max_coded_size(const AVCodecContext * const avctx) // Returns (coded_width * coded_height * 3 / 2) / 2 + 64KiB
++{
++ uint32_t wxh = avctx->coded_width * avctx->coded_height; // NOTE(review): multiplied as int; assumes dimensions small enough not to overflow
++ uint32_t size;
++
++ size = wxh * 3 / 2; // 1.5 bytes per pixel
++ // H.264 Annex A table A-1 gives minCR which is either 2 or 4
++ // unfortunately that doesn't yield an actually useful limit
++ // and it should be noted that frame 0 is special cased to allow
++ // a bigger number which really isn't helpful for us. So just pick
++ // frame_size / 2
++ size /= 2;
++ // Add 64k to allow for any overheads and/or encoder hopefulness
++ // with small WxH
++ return size + (1 << 16);
++}
+
+ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+@@ -186,12 +699,29 @@ static av_cold int v4l2_decode_init(AVCo
+ V4L2Context *capture, *output;
+ V4L2m2mContext *s;
+ V4L2m2mPriv *priv = avctx->priv_data;
++ int gf_pix_fmt;
+ int ret;
+
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ if (avctx->codec_id == AV_CODEC_ID_H264) {
++ if (avctx->ticks_per_frame == 1) {
++ if(avctx->time_base.den < INT_MAX/2) {
++ avctx->time_base.den *= 2;
++ } else
++ avctx->time_base.num /= 2;
++ }
++ avctx->ticks_per_frame = 2;
++ }
++
++ av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+
++ pts_stats_init(&s->pts_stat, avctx, "decoder");
++ s->pending_hw = PENDING_HW_MIN;
++
+ capture = &s->capture;
+ output = &s->output;
+
+@@ -199,34 +729,127 @@ static av_cold int v4l2_decode_init(AVCo
+ * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+ * the proper values will be retrieved from the kernel driver.
+ */
+- output->height = capture->height = avctx->coded_height;
+- output->width = capture->width = avctx->coded_width;
++// output->height = capture->height = avctx->coded_height;
++// output->width = capture->width = avctx->coded_width;
++ output->height = capture->height = 0;
++ output->width = capture->width = 0;
+
+ output->av_codec_id = avctx->codec_id;
+ output->av_pix_fmt = AV_PIX_FMT_NONE;
++ output->min_buf_size = max_coded_size(avctx);
+
+ capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+ capture->av_pix_fmt = avctx->pix_fmt;
++ capture->min_buf_size = 0;
++
++ /* the client requests the codec to generate DRM frames:
++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor
++ * check the ff_v4l2_buffer_to_avframe conversion function.
++ * - the DRM frame format is passed in the DRM frame descriptor layer.
++ * check the v4l2_get_drm_frame function.
++ */
++
++ avctx->sw_pix_fmt = avctx->pix_fmt;
++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
++ avctx->coded_width, avctx->coded_height,
++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
++
++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ s->output_drm = 1;
++ }
++ else {
++ capture->av_pix_fmt = gf_pix_fmt;
++ s->output_drm = 0;
++ }
++
++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
++ if (!s->device_ref) {
++ ret = AVERROR(ENOMEM);
++ return ret;
++ }
++
++ ret = av_hwdevice_ctx_init(s->device_ref);
++ if (ret < 0)
++ return ret;
+
+ s->avctx = avctx;
+ ret = ff_v4l2_m2m_codec_init(priv);
+ if (ret) {
+ av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n");
+- s->self_ref = NULL;
+- av_buffer_unref(&priv->context_ref);
+-
+ return ret;
+ }
+
+- return v4l2_prepare_decoder(s);
++ if ((ret = v4l2_prepare_decoder(s)) < 0)
++ return ret;
++
++ if ((ret = get_quirks(avctx, s)) != 0)
++ return ret;
++
++ if ((ret = check_size(avctx, s)) != 0)
++ return ret;
++
++ return 0;
+ }
+
+ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+ {
+- V4L2m2mPriv *priv = avctx->priv_data;
+- V4L2m2mContext *s = priv->context;
++ int rv; // Result of ff_v4l2_m2m_codec_end, returned to the caller unchanged
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ rv = ff_v4l2_m2m_codec_end(avctx->priv_data); // buf_pkt is no longer unreferenced in this function
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
++ return rv;
++}
++
++static void v4l2_decode_flush(AVCodecContext *avctx) // Flush callback: stops the output queue, drops buffered input and resets tracking and EOS state
++{
++ // An alternative and more drastic form of flush is to simply do this:
++ // v4l2_decode_close(avctx);
++ // v4l2_decode_init(avctx);
++ // The downside is that this keeps a decoder open until all the frames
++ // associated with it have been returned. This is a bit wasteful on
++ // possibly limited h/w resources and fails on a Pi for this reason unless
++ // more GPU mem is allocated than is the default.
++
++ V4L2m2mPriv * const priv = avctx->priv_data;
++ V4L2m2mContext * const s = priv->context;
++ V4L2Context * const output = &s->output;
++ V4L2Context * const capture = &s->capture;
++
++ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
++
++ // Reflushing everything is benign, quick and avoids having to worry about
++ // states like EOS processing so don't try to optimize out (having got it
++ // wrong once)
++
++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); // Return value deliberately ignored (best effort)
++
++ // Clear any buffered input packet
+ av_packet_unref(&s->buf_pkt);
+- return ff_v4l2_m2m_codec_end(priv);
++
++ // Clear a pending EOS
++ if (ff_v4l2_ctx_eos(capture)) {
++ // Arguably we could delay this but this is easy and doesn't require
++ // thought or extra vars
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
++ }
++
++ // V4L2 makes no guarantees about whether decoded frames are flushed or not
++ // so mark all frames we are tracking to be discarded if they appear
++ xlat_flush(&s->xlat);
++
++ // resend extradata
++ s->extdata_sent = 0;
++ // clear EOS status vars
++ s->draining = 0;
++ output->done = 0;
++ capture->done = 0;
++
++ // Stream on will occur when we actually submit a new frame
++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
+ }
+
+ #define OFFSET(x) offsetof(V4L2m2mPriv, x)
+@@ -235,10 +858,16 @@ static av_cold int v4l2_decode_close(AVC
+ static const AVOption options[] = {
+ V4L_M2M_DEFAULT_OPTS,
+ { "num_capture_buffers", "Number of buffers in the capture context",
+- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, /* default 20, allowed range 2..INT_MAX */
++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, /* stored in V4L2m2mPriv.pix_fmt; defaults to AV_PIX_FMT_NONE */
+ { NULL},
+ };
+
++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
++ HW_CONFIG_INTERNAL(DRM_PRIME),
++ NULL
++};
++
+ #define M2MDEC_CLASS(NAME) \
+ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
+ .class_name = #NAME "_v4l2m2m_decoder", \
+@@ -259,9 +888,15 @@ static const AVOption options[] = {
+ .init = v4l2_decode_init, \
+ .receive_frame = v4l2_receive_frame, \
+ .close = v4l2_decode_close, \
++ .flush = v4l2_decode_flush, \
+ .bsfs = bsf_name, \
+ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
+- .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS, \
++ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
++ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
++ AV_PIX_FMT_NV12, \
++ AV_PIX_FMT_YUV420P, \
++ AV_PIX_FMT_NONE}, \
++ .hw_configs = v4l2_m2m_hw_configs, \
+ .wrapper_name = "v4l2m2m", \
+ }
+
+--- a/libavcodec/v4l2_m2m_enc.c
++++ b/libavcodec/v4l2_m2m_enc.c
+@@ -24,6 +24,8 @@
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <search.h>
++#include <drm_fourcc.h>
++
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "libavutil/pixdesc.h"
+@@ -37,6 +39,34 @@
+ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+ #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
+ static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+ {
+ struct v4l2_streamparm parm = { 0 };
+@@ -147,15 +177,14 @@ static inline int v4l2_mpeg4_profile_fro
+ static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+ {
+ if (s->avctx->max_b_frames)
+- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
+
+- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
+ v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
+ if (s->avctx->max_b_frames == 0)
+ return 0;
+
+ avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+-
+ return AVERROR_PATCHWELCOME;
+ }
+
+@@ -270,13 +299,184 @@ static int v4l2_prepare_encoder(V4L2m2mC
+ return 0;
+ }
+
++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame) // Fill *format (size, pixelformat, stride) from the DRM descriptor in frame->data[0]; returns 0 or AVERROR(EINVAL)
++{
++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++ const uint32_t drm_fmt = src->layers[0].format;
++ // Treat INVALID as LINEAR
++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++ uint32_t pix_fmt = 0; // stays 0 when no supported format/modifier combination matches
++ uint32_t w = 0;
++ uint32_t h = 0;
++ uint32_t bpl = src->layers[0].planes[0].pitch;
++
++ // We really don't expect multiple layers
++ // All formats that we currently cope with are single object
++ // A zero pitch is rejected as well: it is the divisor for the linear heights below
++ if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
++ return AVERROR(EINVAL);
++
++ switch (drm_fmt) {
++ case DRM_FORMAT_YUV420:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 3)
++ break;
++ pix_fmt = V4L2_PIX_FMT_YUV420;
++ h = src->layers[0].planes[1].offset / bpl; // presumably plane 1 starts right after plane 0, so offset/pitch is the row count - TODO confirm
++ w = bpl;
++ }
++ break;
++
++ case DRM_FORMAT_NV12:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12;
++ h = src->layers[0].planes[1].offset / bpl; // same derivation as the YUV420 case
++ w = bpl;
++ }
++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++ w = bpl;
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod); // for SAND128 the stride reported to V4L2 comes from the modifier parameter
++ }
++ break;
++
++ case DRM_FORMAT_P030:
++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
++ w = bpl / 2; // Matching lie to how we construct this
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod);
++ }
++ break;
++
++ default:
++ break;
++ }
++
++ if (!pix_fmt) // unsupported format, modifier or plane count
++ return AVERROR(EINVAL);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->plane_fmt[0].bytesperline = bpl;
++ pix->num_planes = 1; // always described to V4L2 as a single plane
++ }
++ else {
++ struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->bytesperline = bpl;
++ }
++
++ return 0;
++}
++
++// Do we have similar enough formats to be usable?
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b) // Returns 1 if type, pixelformat and strides match, else 0; width/height are not compared
++{
++ if (a->type != b->type)
++ return 0;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
++ unsigned int i;
++ if (pa->pixelformat != pb->pixelformat ||
++ pa->num_planes != pb->num_planes)
++ return 0;
++ for (i = 0; i != pa->num_planes; ++i) { // every plane must have the same stride
++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
++ return 0;
++ }
++ }
++ else {
++ const struct v4l2_pix_format *const pa = &a->fmt.pix;
++ const struct v4l2_pix_format *const pb = &b->fmt.pix;
++ if (pa->pixelformat != pb->pixelformat ||
++ pa->bytesperline != pb->bytesperline)
++ return 0;
++ }
++ return 1;
++}
++
++
+ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+ {
+ V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ V4L2Context *const output = &s->output;
+
++ // Signal EOF if needed
++ if (!frame) {
++ return ff_v4l2_context_enqueue_frame(output, frame);
++ }
++
++ if (s->input_drm && !output->streamon) {
++ int rv;
++ struct v4l2_format req_format = {.type = output->format.type};
++
++ // Set format when we first get a buffer
++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
++ return rv;
++ }
++
++ ff_v4l2_context_release(output);
++
++ output->format = req_format;
++
++ if ((rv = ff_v4l2_context_set_format(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
++ return rv;
++ }
++
++ if (!fmt_eq(&req_format, &output->format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
++ return AVERROR(EINVAL);
++ }
++
++ output->selection.top = frame->crop_top;
++ output->selection.left = frame->crop_left;
++ output->selection.width = av_frame_cropped_width(frame);
++ output->selection.height = av_frame_cropped_height(frame);
++
++ if ((rv = ff_v4l2_context_init(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
++ return rv;
++ }
++
++ { // Pass the crop rectangle stored in output->selection to the driver
++ struct v4l2_selection selection = {
++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
++ .target = V4L2_SEL_TGT_CROP,
++ .r = output->selection
++ };
++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) { // failure is only warned about; encoding carries on
++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
++ selection.r.width, selection.r.height, selection.r.left, selection.r.top,
++ av_err2str(AVERROR(errno)));
++ } else // trace "OK" only on success; it used to be logged after a failure too
++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
++ selection.r.width, selection.r.height, selection.r.left, selection.r.top);
++ }
++ }
++
+ #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
+- if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
++ if (frame->pict_type == AV_PICTURE_TYPE_I)
+ v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
+ #endif
+
+@@ -310,7 +510,70 @@ static int v4l2_receive_packet(AVCodecCo
+ }
+
+ dequeue:
+- return ff_v4l2_context_dequeue_packet(capture, avpkt);
++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++ return ret;
++
++ if (capture->first_buf == 1) {
++ uint8_t * data;
++ const int len = avpkt->size;
++
++ // 1st buffer after streamon should be SPS/PPS
++ capture->first_buf = 2;
++
++ // Clear both possible stores so there is no chance of confusion
++ av_freep(&s->extdata_data);
++ s->extdata_size = 0;
++ av_freep(&avctx->extradata);
++ avctx->extradata_size = 0;
++
++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
++ memcpy(data, avpkt->data, len);
++
++ av_packet_unref(avpkt);
++
++ if (data == NULL)
++ return AVERROR(ENOMEM);
++
++ // We need to copy the header, but keep local if not global
++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
++ avctx->extradata = data;
++ avctx->extradata_size = len;
++ }
++ else {
++ s->extdata_data = data;
++ s->extdata_size = len;
++ }
++
++ if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
++ return ret;
++ }
++
++ // First frame must be key so mark as such even if encoder forgot
++ if (capture->first_buf == 2)
++ avpkt->flags |= AV_PKT_FLAG_KEY;
++
++ // Add SPS/PPS to the start of every key frame if non-global headers
++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
++ const size_t newlen = s->extdata_size + avpkt->size;
++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
++
++ if (buf == NULL) {
++ av_packet_unref(avpkt);
++ return AVERROR(ENOMEM);
++ }
++
++ memcpy(buf->data, s->extdata_data, s->extdata_size);
++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
++
++ av_buffer_unref(&avpkt->buf);
++ avpkt->buf = buf;
++ avpkt->data = buf->data;
++ avpkt->size = newlen;
++ }
++
++// av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
++ capture->first_buf = 0;
++ return 0;
+ }
+
+ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+@@ -322,6 +585,8 @@ static av_cold int v4l2_encode_init(AVCo
+ uint32_t v4l2_fmt_output;
+ int ret;
+
++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
++
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+@@ -329,13 +594,17 @@ static av_cold int v4l2_encode_init(AVCo
+ capture = &s->capture;
+ output = &s->output;
+
++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
++
+ /* common settings output/capture */
+ output->height = capture->height = avctx->height;
+ output->width = capture->width = avctx->width;
+
+ /* output context */
+ output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+- output->av_pix_fmt = avctx->pix_fmt;
++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
++ AV_PIX_FMT_YUV420P;
+
+ /* capture context */
+ capture->av_codec_id = avctx->codec_id;
+@@ -354,7 +623,7 @@ static av_cold int v4l2_encode_init(AVCo
+ v4l2_fmt_output = output->format.fmt.pix.pixelformat;
+
+ pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
+- if (pix_fmt_output != avctx->pix_fmt) {
++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
+ av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
+ return AVERROR(EINVAL);
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.c
+@@ -0,0 +1,84 @@
++#include <memory.h>
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_decode_q.h"
++
++int decode_q_in_q(const req_decode_ent * const d) // Returns d's in_q flag (set by decode_q_add, cleared by decode_q_remove)
++{
++ return d->in_q; // NOTE(review): read without taking any lock
++}
++
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d) // Append d to the tail of q while holding q_lock
++{
++ pthread_mutex_lock(&q->q_lock);
++ if (!q->head) { // empty queue: d becomes both head and tail
++ q->head = d;
++ q->tail = d;
++ d->prev = NULL;
++ }
++ else { // link d after the current tail
++ q->tail->next = d;
++ d->prev = q->tail;
++ q->tail = d;
++ }
++ d->next = NULL;
++ d->in_q = 1;
++ pthread_mutex_unlock(&q->q_lock);
++}
++
++// Remove entry from Q - if head wake-up anything that was waiting
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d) // Unlink d from q; no-op if d is not marked as queued
++{
++ int try_signal = 0;
++
++ if (!d->in_q) // NOTE(review): this check happens before q_lock is taken
++ return;
++
++ pthread_mutex_lock(&q->q_lock);
++ if (d->prev)
++ d->prev->next = d->next;
++ else {
++ try_signal = 1; // Only need to signal if we were head
++ q->head = d->next;
++ }
++
++ if (d->next)
++ d->next->prev = d->prev;
++ else
++ q->tail = d->prev; // d was the tail
++
++ // Not strictly needed but makes debug easier
++ d->next = NULL;
++ d->prev = NULL;
++ d->in_q = 0;
++ pthread_mutex_unlock(&q->q_lock);
++
++ if (try_signal) // broadcast after unlocking so decode_q_wait callers re-check the new head
++ pthread_cond_broadcast(&q->q_cond);
++}
++
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d) // Block until d is the head of q
++{
++ pthread_mutex_lock(&q->q_lock);
++
++ while (q->head != d) // re-checked after every wakeup; q_cond is broadcast by decode_q_remove when a head is removed
++ pthread_cond_wait(&q->q_cond, &q->q_lock);
++
++ pthread_mutex_unlock(&q->q_lock);
++}
++
++void decode_q_uninit(req_decode_q * const q) // Destroy the mutex and condition variable created by decode_q_init
++{
++ pthread_mutex_destroy(&q->q_lock);
++ pthread_cond_destroy(&q->q_cond);
++}
++
++void decode_q_init(req_decode_q * const q) // Initialise q as an empty queue with default-attribute mutex and condvar
++{
++ memset(q, 0, sizeof(*q)); // head and tail start out NULL
++ pthread_mutex_init(&q->q_lock, NULL);
++ pthread_cond_init(&q->q_cond, NULL);
++}
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.h
+@@ -0,0 +1,25 @@
++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
++#define AVCODEC_V4L2_REQ_DECODE_Q_H
++
++typedef struct req_decode_ent { // Link node for one entry in a req_decode_q
++ struct req_decode_ent * next; // towards the tail; NULL at the tail
++ struct req_decode_ent * prev; // towards the head; NULL at the head
++ int in_q; // 1 while linked into a queue, 0 otherwise
++} req_decode_ent;
++
++typedef struct req_decode_q { // Doubly linked queue of req_decode_ent
++ pthread_mutex_t q_lock; // taken around list updates in decode_q_add/remove/wait
++ pthread_cond_t q_cond; // broadcast when the head entry is removed
++ req_decode_ent * head; // NULL when the queue is empty
++ req_decode_ent * tail;
++} req_decode_q;
++
++int decode_q_in_q(const req_decode_ent * const d);
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_uninit(req_decode_q * const q);
++void decode_q_init(req_decode_q * const q);
++
++#endif
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.c
+@@ -0,0 +1,449 @@
++#include <errno.h>
++#include <fcntl.h>
++#include <libudev.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++
++#include <sys/ioctl.h>
++#include <sys/sysmacros.h>
++
++#include <linux/media.h>
++#include <linux/videodev2.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_utils.h"
++
++struct decdev { // One usable (device, source format) combination found by the scan
++ enum v4l2_buf_type src_type; // buffer type the format was probed with
++ uint32_t src_fmt_v4l2; // V4L2 pixelformat fourcc of the source format
++ const char * vname; // video device node path
++ const char * mname; // media device node path
++};
++
++struct devscan { // Result of a device scan
++ struct decdev env; // override taken from environment variables in devscan_build
++ unsigned int dev_size; // allocated capacity of devs
++ unsigned int dev_count; // entries of devs in use
++ struct decdev *devs; // realloc-grown array
++};
++
++static int video_src_pixfmt_supported(uint32_t fmt) // Source-format filter hook; fmt is unused
++{
++ return 1; // currently accepts every format
++}
++
++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type, // Zero *format, then fill in type, size, pixelformat and sizeimage
++ unsigned int width, unsigned int height,
++ unsigned int pixelformat)
++{
++ unsigned int sizeimage;
++
++ memset(format, 0, sizeof(*format));
++ format->type = type;
++
++ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0; // OUTPUT queues request 4 MiB buffers, other types leave it 0
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
++ format->fmt.pix_mp.width = width;
++ format->fmt.pix_mp.height = height;
++ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
++ format->fmt.pix_mp.pixelformat = pixelformat;
++ } else {
++ format->fmt.pix.width = width;
++ format->fmt.pix.height = height;
++ format->fmt.pix.sizeimage = sizeimage;
++ format->fmt.pix.pixelformat = pixelformat;
++ }
++}
++
++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat, // Issue VIDIOC_S_FMT with the given parameters
++ unsigned int width, unsigned int height)
++{
++ struct v4l2_format format;
++
++ v4l2_setup_format(&format, type, width, height, pixelformat);
++
++ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0; // 0 on success, negative errno on failure
++}
++
++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities) // VIDIOC_QUERYCAP wrapper; returns 0 or negative errno
++{
++ struct v4l2_capability capability = { 0 };
++ int rc;
++
++ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
++ if (rc < 0)
++ return -errno;
++
++ if (capabilities != NULL) { // output pointer is optional
++ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
++ *capabilities = capability.device_caps; // use the per-device caps when the driver reports them
++ else
++ *capabilities = capability.capabilities;
++ }
++
++ return 0;
++}
++
++static int devscan_add(struct devscan *const scan, // Append an entry with copies of vname/mname; returns 0 or -ENOMEM
++ enum v4l2_buf_type src_type,
++ uint32_t src_fmt_v4l2,
++ const char * vname,
++ const char * mname)
++{
++ struct decdev *d;
++
++ if (scan->dev_size <= scan->dev_count) { // array full: grow it
++ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2; // start at 4 entries, then double
++ d = realloc(scan->devs, n * sizeof(*d));
++ if (!d)
++ return -ENOMEM;
++ scan->devs = d;
++ scan->dev_size = n;
++ }
++
++ d = scan->devs + scan->dev_count;
++ d->src_type = src_type;
++ d->src_fmt_v4l2 = src_fmt_v4l2;
++ d->vname = strdup(vname);
++ if (!d->vname)
++ return -ENOMEM;
++ d->mname = strdup(mname);
++ if (!d->mname) {
++ free((char *)d->vname);
++ return -ENOMEM;
++ }
++ ++scan->dev_count; // the entry only counts once both names were copied
++ return 0;
++}
++
++void devscan_delete(struct devscan **const pScan) // Free *pScan and every entry it holds, then set *pScan to NULL; NULL *pScan is a no-op
++{
++ unsigned int i;
++ struct devscan * const scan = *pScan;
++
++ if (!scan)
++ return;
++ *pScan = NULL;
++
++ for (i = 0; i < scan->dev_count; ++i) { // names were strdup()ed by devscan_add
++ free((char*)scan->devs[i].mname);
++ free((char*)scan->devs[i].vname);
++ }
++ free(scan->devs);
++ free(scan); // env names are not freed here (they come from getenv in devscan_build)
++}
++
++#define REQ_BUF_CAPS (\
++ V4L2_BUF_CAP_SUPPORTS_DMABUF |\
++ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
++ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
++
++static void probe_formats(void * const dc, // Add to scan every format of type_v4l2 on fd that can be set and whose buffers have all REQ_BUF_CAPS
++ struct devscan *const scan,
++ const int fd,
++ const unsigned int type_v4l2,
++ const char *const mpath,
++ const char *const vpath)
++{
++ unsigned int i;
++ for (i = 0;; ++i) { // left via the return in the VIDIOC_ENUM_FMT error path
++ struct v4l2_fmtdesc fmtdesc = {
++ .index = i,
++ .type = type_v4l2
++ };
++ struct v4l2_requestbuffers rbufs = { // count is 0; only rbufs.capabilities is used afterwards
++ .count = 0,
++ .type = type_v4l2,
++ .memory = V4L2_MEMORY_MMAP
++ };
++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++ if (errno == EINTR)
++ continue; // interrupted: retry the ioctl
++ if (errno != EINVAL) // EINVAL is treated as the normal end of the list and not logged
++ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
++ return;
++ }
++ if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
++ continue;
++
++ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) { // trial set at a fixed 720x480
++ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
++ continue;
++ }
++
++ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) { // retried only on EINTR
++ if (errno != EINTR) {
++ request_debug(dc, "%s: Reqbufs failed\n", vpath);
++ break; // was "continue", which re-ran this inner loop forever; capabilities stays 0 so the check below skips the format
++ }
++ }
++
++ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) { // every REQ_BUF_CAPS bit is required
++ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
++ continue;
++ }
++
++ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
++ mpath, vpath, fmtdesc.pixelformat, type_v4l2);
++ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath); // return value (possible -ENOMEM) is ignored
++ }
++}
++
++
++static int probe_video_device(void * const dc, // Open the video node of device, check its capabilities and probe its OUTPUT formats; 0 or negative errno
++ struct udev_device *const device,
++ struct devscan *const scan,
++ const char *const mpath)
++{
++ int ret;
++ unsigned int capabilities = 0;
++ int video_fd = -1;
++
++ const char *path = udev_device_get_devnode(device);
++ if (!path) {
++ request_err(dc, "%s: get video device devnode failed\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ video_fd = open(path, O_RDWR, 0);
++ if (video_fd == -1) {
++ ret = -errno; // saved before the logging call below can change errno
++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
++ goto fail;
++ }
++
++ ret = v4l2_query_capabilities(video_fd, &capabilities);
++ if (ret < 0) {
++ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
++
++ if (!(capabilities & V4L2_CAP_STREAMING)) {
++ request_debug(dc, "%s: missing required streaming capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) { // either mem2mem flavour is enough
++ request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ /* Should check capture formats too... */
++ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
++
++ close(video_fd); // the fd is only needed while probing; the scan stores paths
++ return 0;
++
++fail:
++ if (video_fd >= 0)
++ close(video_fd);
++ return ret;
++}
++
++static int probe_media_device(void * const dc, // Probe every V4L video interface in the media device's topology; 0 or negative errno
++ struct udev_device *const device,
++ struct devscan *const scan)
++{
++ int ret = 0; // was uninitialised, so a topology without V4L video interfaces returned garbage
++ int rv;
++ struct media_device_info device_info = { 0 };
++ struct media_v2_topology topology = { 0 };
++ struct media_v2_interface *interfaces = NULL;
++ struct udev *udev = udev_device_get_udev(device);
++ struct udev_device *video_device;
++ dev_t devnum;
++ int media_fd = -1;
++
++ const char *path = udev_device_get_devnode(device);
++ if (!path) {
++ request_err(dc, "%s: get media device devnode failed\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ media_fd = open(path, O_RDWR, 0);
++ if (media_fd < 0) {
++ ret = -errno;
++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); // first call has no arrays attached; num_interfaces is read from it
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ if (topology.num_interfaces <= 0) {
++ request_err(dc, "%s: media device has no interfaces\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
++ if (!interfaces) {
++ request_err(dc, "%s: allocating media interface struct failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); // second call fills the interfaces array
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ for (int i = 0; i < topology.num_interfaces; i++) {
++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) // only V4L video interfaces are probed
++ continue;
++
++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
++ video_device = udev_device_new_from_devnum(udev, 'c', devnum);
++ if (!video_device) {
++ ret = -errno;
++ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
++ continue; // keep going with the remaining interfaces
++ }
++
++ ret = probe_video_device(dc, video_device, scan, path);
++ udev_device_unref(video_device);
++
++ if (ret != 0) // a failing video device stops the scan of this media device
++ goto fail;
++ }
++
++fail: // shared exit: also reached on success
++ free(interfaces);
++ if (media_fd != -1)
++ close(media_fd);
++ return ret;
++}
++
++const char *decdev_media_path(const struct decdev *const dev) // Media device path of dev, or NULL if dev is NULL
++{
++ return !dev ? NULL : dev->mname;
++}
++
++const char *decdev_video_path(const struct decdev *const dev) // Video device path of dev, or NULL if dev is NULL
++{
++ return !dev ? NULL : dev->vname;
++}
++
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev) // Source buffer type of dev, or 0 if dev is NULL
++{
++ return !dev ? 0 : dev->src_type;
++}
++
++uint32_t decdev_src_pixelformat(const struct decdev *const dev) // Source V4L2 pixelformat of dev, or 0 if dev is NULL
++{
++ return !dev ? 0 : dev->src_fmt_v4l2;
++}
++
++
++const struct decdev *devscan_find(struct devscan *const scan, // Look up the device entry for src_fmt_v4l2; NULL if none matches
++ const uint32_t src_fmt_v4l2)
++{
++ unsigned int i;
++
++ if (scan->env.mname && scan->env.vname) // an environment override wins regardless of the requested format
++ return &scan->env;
++
++ if (!src_fmt_v4l2) // format 0 selects the first scanned device, if there is one
++ return scan->dev_count ? scan->devs + 0 : NULL;
++
++ for (i = 0; i != scan->dev_count; ++i) { // first entry with a matching format
++ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
++ return scan->devs + i;
++ }
++ return NULL;
++}
++
++int devscan_build(void * const dc, struct devscan **pscan) // Build a device scan into *pscan (NULL on failure); returns 0 or negative errno
++{
++ int ret;
++ struct udev *udev = NULL; // NULL so the fail path's udev_unref() is safe when calloc fails before udev_new()
++ struct udev_enumerate *enumerate;
++ struct udev_list_entry *devices;
++ struct udev_list_entry *entry;
++ struct udev_device *device;
++ struct devscan * scan;
++
++ *pscan = NULL;
++
++ scan = calloc(1, sizeof(*scan));
++ if (!scan) {
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
++ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
++ if (scan->env.mname && scan->env.vname) { // both overrides set: skip the udev scan entirely
++ request_info(dc, "Media/video device env overrides found: %s,%s\n",
++ scan->env.mname, scan->env.vname);
++ *pscan = scan;
++ return 0;
++ }
++
++ udev = udev_new();
++ if (!udev) {
++ request_err(dc, "%s: allocating udev context failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ enumerate = udev_enumerate_new(udev);
++ if (!enumerate) {
++ request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ udev_enumerate_add_match_subsystem(enumerate, "media");
++ udev_enumerate_scan_devices(enumerate);
++
++ devices = udev_enumerate_get_list_entry(enumerate);
++ udev_list_entry_foreach(entry, devices) {
++ const char *path = udev_list_entry_get_name(entry);
++ if (!path)
++ continue;
++
++ device = udev_device_new_from_syspath(udev, path);
++ if (!device)
++ continue;
++
++ probe_media_device(dc, device, scan); // result ignored: a failing media device does not abort the scan
++ udev_device_unref(device);
++ }
++
++ udev_enumerate_unref(enumerate);
++ udev_unref(udev); // previously leaked on success; scan only holds strdup()ed paths, no udev objects
++ *pscan = scan;
++ return 0;
++
++fail:
++ udev_unref(udev);
++ devscan_delete(&scan);
++ return ret;
++}
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -0,0 +1,21 @@
++#ifndef _DEVSCAN_H_
++#define _DEVSCAN_H_
++
++struct devscan;
++struct decdev;
++enum v4l2_buf_type;
++
++/* These return pointers to data in the devscan structure and so are valid
++ * for the lifetime of that
++ */
++const char *decdev_media_path(const struct decdev *const dev);
++const char *decdev_video_path(const struct decdev *const dev);
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
++uint32_t decdev_src_pixelformat(const struct decdev *const dev);
++
++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++
++int devscan_build(void * const dc, struct devscan **pscan);
++void devscan_delete(struct devscan **const pScan);
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.c
+@@ -0,0 +1,266 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_utils.h"
++
++#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
++#define DMABUF_NAME2 "/dev/dma_heap/reserved"
++
++#define TRACE_ALLOC 0
++
++struct dmabufs_ctl { // Allocator context created by dmabufs_ctl_new
++ int fd; // fd of the opened dma-heap device (DMABUF_NAME1 or DMABUF_NAME2)
++ size_t page_size; // from sysconf(_SC_PAGE_SIZE); allocation lengths are rounded up to it
++};
++
++struct dmabuf_h { // Handle for one dma-buf
++ int fd; // dma-buf file descriptor, closed by dmabuf_free
++ size_t size; // allocated (or imported) size in bytes
++ size_t len; // bytes in use, maintained through dmabuf_len_set
++ void * mapptr; // mmap address, or MAP_FAILED while unmapped
++};
++
++#if TRACE_ALLOC
++static unsigned int total_bufs = 0;
++static size_t total_size = 0;
++#endif
++
++struct dmabuf_h * dmabuf_import(int fd, size_t size) // Wrap a dup() of fd in a new handle of the given size; NULL on failure, caller keeps its own fd
++{
++ struct dmabuf_h *dh;
++
++ // Check size before dup(): the old order leaked the duplicate when size was 0
++ if (size == 0 || (fd = dup(fd)) < 0)
++ return NULL;
++
++ dh = malloc(sizeof(*dh));
++ if (!dh) {
++ close(fd); // drop our duplicate again
++ return NULL;
++ }
++
++ *dh = (struct dmabuf_h) { // fields not named here (len) are zero-initialised
++ .fd = fd,
++ .size = size,
++ .mapptr = MAP_FAILED
++ };
++
++#if TRACE_ALLOC
++ ++total_bufs;
++ total_size += dh->size;
++ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ return dh;
++}
++
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size) // Return a buffer of at least size bytes, reusing old if its size matches; old is freed otherwise, NULL on failure
++{
++ struct dmabuf_h * dh;
++ struct dma_heap_allocation_data data = {
++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1), // round up to a page multiple (assumes page_size is a power of two)
++ .fd = 0,
++ .fd_flags = O_RDWR,
++ .heap_flags = 0
++ };
++
++ if (old != NULL) {
++ if (old->size == data.len) { // same rounded size: reuse the old buffer as is
++ return old;
++ }
++ dmabuf_free(old); // contents are not preserved
++ }
++
++ if (size == 0 ||
++ (dh = malloc(sizeof(*dh))) == NULL)
++ return NULL;
++
++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
++ int err = errno;
++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n", // note: also logged for EINTR before the retry
++ (uint64_t)data.len,
++ dbsc->fd,
++ err,
++ strerror(err));
++ if (err == EINTR)
++ continue;
++ goto fail;
++ }
++
++ *dh = (struct dmabuf_h){
++ .fd = data.fd,
++ .size = (size_t)data.len,
++ .mapptr = MAP_FAILED
++ };
++
++#if TRACE_ALLOC
++ ++total_bufs;
++ total_size += dh->size;
++ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ return dh;
++
++fail:
++ free(dh);
++ return NULL;
++}
++
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags) // Issue DMA_BUF_IOCTL_SYNC with flags; returns 0 or negative errno
++{
++ struct dma_buf_sync sync = {
++ .flags = flags
++ };
++ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
++ const int err = errno; // captured before request_log can change errno
++ if (errno == EINTR)
++ continue; // interrupted: retry
++ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
++ return -err;
++ }
++ return 0;
++}
++
++int dmabuf_write_start(struct dmabuf_h * const dh) // dmabuf_sync with START|WRITE; unlike dmabuf_read_start it does not map the buffer
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_write_end(struct dmabuf_h * const dh) // dmabuf_sync with END|WRITE
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_read_start(struct dmabuf_h * const dh) // Map the buffer if needed, then dmabuf_sync with START|READ
++{
++ if (!dmabuf_map(dh)) // mapping failed (or dh is NULL)
++ return -1;
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
++}
++
++int dmabuf_read_end(struct dmabuf_h * const dh) // dmabuf_sync with END|READ
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
++}
++
++
++void * dmabuf_map(struct dmabuf_h * const dh) // Return the buffer's mapping, creating it on first use; NULL if dh is NULL or mmap fails
++{
++ if (!dh)
++ return NULL;
++ if (dh->mapptr != MAP_FAILED) // already mapped: reuse the cached address
++ return dh->mapptr;
++ dh->mapptr = mmap(NULL, dh->size, // maps the whole buffer from offset 0
++ PROT_READ | PROT_WRITE,
++ MAP_SHARED | MAP_POPULATE,
++ dh->fd, 0);
++ if (dh->mapptr == MAP_FAILED) {
++ request_log("%s: Map failed\n", __func__);
++ return NULL;
++ }
++ return dh->mapptr;
++}
++
++int dmabuf_fd(const struct dmabuf_h * const dh) // The handle's fd, or -1 if dh is NULL
++{
++ if (!dh)
++ return -1;
++ return dh->fd;
++}
++
++size_t dmabuf_size(const struct dmabuf_h * const dh) // Allocated size in bytes, or 0 if dh is NULL
++{
++ if (!dh)
++ return 0;
++ return dh->size;
++}
++
++size_t dmabuf_len(const struct dmabuf_h * const dh) // Bytes in use as last set by dmabuf_len_set, or 0 if dh is NULL
++{
++ if (!dh)
++ return 0;
++ return dh->len;
++}
++
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len) // Record the number of bytes in use
++{
++ dh->len = len; // NOTE(review): no NULL check here, unlike the getters; len is not checked against size
++}
++
++
++
++void dmabuf_free(struct dmabuf_h * dh) // Unmap (if mapped), close the fd and free the handle; NULL is a no-op
++{
++ if (!dh)
++ return;
++
++#if TRACE_ALLOC
++ --total_bufs;
++ total_size -= dh->size;
++ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ if (dh->mapptr != MAP_FAILED)
++ munmap(dh->mapptr, dh->size);
++ // No EINTR retry: Linux releases the fd even then, so a retry could close a reused fd
++ close(dh->fd);
++ free(dh);
++}
++
++struct dmabufs_ctl * dmabufs_ctl_new(void) // Open a dma-heap device and return a new allocator context; NULL on failure
++{
++ struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
++
++ if (!dbsc)
++ return NULL;
++
++ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 && // first choice heap; open is retried on EINTR
++ errno == EINTR)
++ /* Loop */;
++
++ if (dbsc->fd == -1) { // fall back to the second heap name
++ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++ if (dbsc->fd == -1) {
++ request_log("Unable to open either %s or %s\n",
++ DMABUF_NAME1, DMABUF_NAME2);
++ goto fail;
++ }
++ }
++
++ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE); // NOTE(review): a sysconf error return (-1) is not checked
++
++ return dbsc;
++
++fail:
++ free(dbsc);
++ return NULL;
++}
++
++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc) // Close the heap fd, free the context and set *pDbsc to NULL; NULL *pDbsc is a no-op
++{
++ struct dmabufs_ctl * const dbsc = *pDbsc;
++
++ if (!dbsc)
++ return;
++ *pDbsc = NULL;
++
++ // No EINTR retry: Linux releases the fd even then, so a retry could close a reused fd
++ close(dbsc->fd);
++
++ free(dbsc);
++}
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.h
+@@ -0,0 +1,38 @@
++#ifndef DMABUFS_H
++#define DMABUFS_H
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabufs_ctl * dmabufs_ctl_new(void);
++void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
++
++// Need not preserve old contents
++// On NULL return old buffer is freed
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
++
++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
++ return dmabuf_realloc(dbsc, NULL, size);
++}
++/* Create from existing fd - dups(fd) */
++struct dmabuf_h * dmabuf_import(int fd, size_t size);
++void * dmabuf_map(struct dmabuf_h * const dh);
++
++/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
++
++int dmabuf_write_start(struct dmabuf_h * const dh);
++int dmabuf_write_end(struct dmabuf_h * const dh);
++int dmabuf_read_start(struct dmabuf_h * const dh);
++int dmabuf_read_end(struct dmabuf_h * const dh);
++
++int dmabuf_fd(const struct dmabuf_h * const dh);
++/* Allocated size */
++size_t dmabuf_size(const struct dmabuf_h * const dh);
++/* Bytes in use */
++size_t dmabuf_len(const struct dmabuf_h * const dh);
++/* Set bytes in use */
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
++void dmabuf_free(struct dmabuf_h * dh);
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v1.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 1
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v2.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 2
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v3.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 3
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v4.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 4
++#include "v4l2_req_hevc_vx.c"
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_vx.c
+@@ -0,0 +1,1365 @@
++// File included by v4l2_req_hevc_v* - not compiled on its own
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#if HEVC_CTRLS_VERSION == 1
++#include "hevc-ctrls-v1.h"
++
++// Fixup renamed entries
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
++
++#elif HEVC_CTRLS_VERSION == 2
++#include "hevc-ctrls-v2.h"
++#elif HEVC_CTRLS_VERSION == 3
++#include "hevc-ctrls-v3.h"
++#elif HEVC_CTRLS_VERSION == 4
++#include <linux/v4l2-controls.h>
++#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
++#include "hevc-ctrls-v4.h"
++#endif
++#else
++#error Unknown HEVC_CTRLS_VERSION
++#endif
++
++#ifndef V4L2_CID_STATELESS_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
++
++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
++#endif
++
++// Should be in videodev2 but we might not have a good enough one
++#ifndef V4L2_PIX_FMT_HEVC_SLICE
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++#endif
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++// Attached to buf[0] in frame
++// Pooled in hwcontext so generally create once - 1/frame
++typedef struct V4L2MediaReqDescriptor {
++ AVDRMFrameDescriptor drm; // NOTE(review): presumably first so frame->data[0] also reads as an AVDRMFrameDescriptor - confirm
++
++ // Media
++ uint64_t timestamp; // Identifies this picture in DPB entries - see cvt_timestamp_to_dpb()
++ struct qent_dst * qe_dst; // Destination buffer entry; allocated in start_frame when NULL
++
++ // Decode only - should be NULL by the time we emit the frame
++ struct req_decode_ent decode_ent; // Added to ctx->decode_q in start_frame
++
++ struct media_request *req; // Passed to media_request_abort() by abort_frame
++ struct qent_src *qe_src; // Passed to mediabufs_src_qent_abort() by abort_frame
++
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params dec; // Filled by fill_decode_params() on the first slice
++#endif
++ // Per-slice state; both arrays are grown together by slice_add()
++ size_t num_slices; // Slices recorded so far; reset to 0 in start_frame
++ size_t alloced_slices; // Capacity (in elements) of slice_params & slices
++ struct v4l2_ctrl_hevc_slice_params * slice_params;
++ struct slice_info * slices;
++ // Entry point offsets for all recorded slices, appended by offsets_add()
++ size_t num_offsets; // Entries in use
++ size_t alloced_offsets; // Capacity (in elements) of offsets
++ uint32_t *offsets; // Each entry is stored as (offset - 1)
++
++} V4L2MediaReqDescriptor;
++
++struct slice_info {
++ const uint8_t * ptr; // Start of this slice's data inside the coded frame buffer
++ size_t len; // bytes; the first slice of a block is extended to cover the later slices of that block
++ size_t n_offsets; // Value of rd->num_offsets when the slice was added, i.e. index of its first offset
++};
++
++// Handy container for accumulating controls before setting
++struct req_controls {
++ int has_scaling; // Non-zero => set_req_ctls() also sends scaling_matrix
++ struct timeval tv; // Handed to qent_src_params_set() for every source buffer
++ struct v4l2_ctrl_hevc_sps sps; // Sent as V4L2_CID_STATELESS_HEVC_SPS
++ struct v4l2_ctrl_hevc_pps pps; // Sent as V4L2_CID_STATELESS_HEVC_PPS
++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; // Only sent when has_scaling is set
++};
++
++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
++
++// Map the V4L2 pixel format carried by `format` to the matching FFmpeg pixel format
++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
++{
++ // Single- and multi-planar formats keep the fourcc in different union members
++ const uint32_t fourcc = V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
++ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat;
++
++ if (fourcc == V4L2_PIX_FMT_YUV420)
++ return AV_PIX_FMT_YUV420P;
++ if (fourcc == V4L2_PIX_FMT_NV12)
++ return AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++ if (fourcc == V4L2_PIX_FMT_NV12_COL128)
++ return AV_PIX_FMT_RPI4_8;
++ if (fourcc == V4L2_PIX_FMT_NV12_10_COL128)
++ return AV_PIX_FMT_RPI4_10;
++#endif
++ // Anything else has no mapping here
++ return AV_PIX_FMT_NONE;
++}
++
++static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
++{
++ // data[0] of these frames is treated as the V4L2MediaReqDescriptor holding the stamp
++ return ((const V4L2MediaReqDescriptor *)frame->data[0])->timestamp;
++}
++
++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
++{
++ // data[0] of these frames is treated as the V4L2MediaReqDescriptor holding the stamp
++ ((V4L2MediaReqDescriptor *)frame->data[0])->timestamp = dpb_stamp;
++}
++
++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
++{
++ int32_t luma_weight_denom, chroma_weight_denom;
++ const SliceHeader *sh = &h->sh;
++ // Fill the weighted prediction table; left untouched when weighting does not apply to this slice
++ if (sh->slice_type == HEVC_SLICE_I ||
++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
++ return;
++
++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
++
++ if (h->ps.sps->chroma_format_idc)
++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
++ // Weights are passed as deltas from (1 << log2 denominator)
++ luma_weight_denom = (1 << sh->luma_log2_weight_denom);
++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
++ table->luma_offset_l0[i] = sh->luma_offset_l0[i];
++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
++ }
++ // The L1 entries are only filled for B slices
++ if (sh->slice_type != HEVC_SLICE_B)
++ return;
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
++ table->luma_offset_l1[i] = sh->luma_offset_l1[i];
++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
++ }
++}
++
++#if HEVC_CTRLS_VERSION <= 2
++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
++{
++ // Reference sets to search, in priority order, with the value returned for each
++ static const struct {
++ int set;
++ int v4l2_type;
++ } lookup[] = {
++ { ST_CURR_BEF, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE },
++ { ST_CURR_AFT, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER },
++ { LT_CURR, V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR },
++ };
++
++ for (size_t s = 0; s != FF_ARRAY_ELEMS(lookup); ++s) {
++ const int set = lookup[s].set;
++
++ for (int i = 0; i < h->rps[set].nb_refs; i++) {
++ const HEVCFrame * const ref = h->rps[set].ref[i];
++ if (ref && timestamp == frame_capture_dpb(ref->frame))
++ return lookup[s].v4l2_type;
++ }
++ }
++
++ // No set holds a frame with this timestamp
++ return 0;
++}
++#endif
++
++static unsigned int
++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
++ const struct v4l2_hevc_dpb_entry * const entries,
++ const unsigned int num_entries)
++{
++ // Index of the entry whose timestamp matches `frame`; 0 when frame is NULL
++ // or nothing matches (so "not found" cannot be told apart from slot 0)
++ unsigned int idx = 0;
++
++ if (frame) {
++ const uint64_t wanted = frame_capture_dpb(frame->frame);
++
++ while (idx < num_entries && entries[idx].timestamp != wanted)
++ ++idx;
++ if (idx >= num_entries)
++ idx = 0;
++ }
++ return idx;
++}
++
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{
++ // Step idx bytes on from b, plus one extra byte for every 0x03 skipped below
++ unsigned int zeros = 0;
++ for (; idx != 0; --idx) {
++ if (*b++ != 0) {
++ zeros = 0;
++ continue;
++ }
++ // A 0x03 straight after two or more zero bytes is stepped over uncounted
++ if (++zeros >= 2 && *b == 3) {
++ ++b;
++ zeros = 0;
++ }
++ }
++ return b;
++}
++
++static int slice_add(V4L2MediaReqDescriptor * const rd)
++{
++ // Claim one more slot in the parallel slice_params / slices arrays,
++ // doubling their capacity (starting at 8) whenever they are full
++ if (rd->num_slices >= rd->alloced_slices) {
++ const size_t new_cap = rd->alloced_slices != 0 ? rd->alloced_slices * 2 : 8;
++ void * grown;
++ grown = av_realloc_array(rd->slice_params, new_cap, sizeof(*rd->slice_params));
++ if (!grown)
++ return AVERROR(ENOMEM);
++ rd->slice_params = grown;
++
++ grown = av_realloc_array(rd->slices, new_cap, sizeof(*rd->slices));
++ if (!grown)
++ return AVERROR(ENOMEM);
++ rd->slices = grown;
++
++ rd->alloced_slices = new_cap;
++ }
++ rd->num_slices++;
++ return 0;
++}
++// Append n entry point offsets (stored as offset - 1), growing rd->offsets as needed; 0 or AVERROR(ENOMEM)
++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
++{
++ if (rd->num_offsets + n > rd->alloced_offsets) {
++ size_t n2 = rd->alloced_offsets == 0 ? 128 : rd->alloced_offsets * 2; // grow from the offsets capacity, not the slice one
++ void * p2;
++ while (rd->num_offsets + n > n2)
++ n2 *= 2;
++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->offsets = p2;
++ rd->alloced_offsets = n2;
++ }
++ for (size_t i = 0; i != n; ++i)
++ rd->offsets[rd->num_offsets++] = offsets[i] - 1;
++ return 0;
++}
++
++static unsigned int
++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
++{
++ unsigned int i;
++ unsigned int n = 0;
++ const HEVCFrame * const pic = h->ref;
++ // List every DPB frame, other than the current picture, still flagged as a reference; returns the count
++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
++ const HEVCFrame * const frame = &h->DPB[i];
++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
++ struct v4l2_hevc_dpb_entry * const entry = entries + n++;
++
++ entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
++ entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
++ entry->field_pic = frame->frame->interlaced_frame;
++
++#if HEVC_CTRLS_VERSION <= 3
++ /* TODO: Interleaved: Get the POC for each field. */
++ entry->pic_order_cnt[0] = frame->poc;
++ entry->pic_order_cnt[1] = frame->poc;
++#else
++ entry->pic_order_cnt_val = frame->poc;
++#endif
++ }
++ }
++ return n;
++}
++
++static void fill_slice_params(const HEVCContext * const h,
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params *slice_params,
++ uint32_t bit_size, uint32_t bit_offset)
++{
++ const SliceHeader * const sh = &h->sh;
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
++ const unsigned int dpb_n = dec->num_active_dpb_entries;
++#else
++ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
++ unsigned int dpb_n;
++#endif
++ unsigned int i;
++ RefPicList *rpl;
++ // Build the V4L2 slice params from the slice header in h->sh; fields not named below start as zero
++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
++ .bit_size = bit_size,
++#if HEVC_CTRLS_VERSION <= 3
++ .data_bit_offset = bit_offset,
++#else
++ .data_byte_offset = bit_offset / 8 + 1,
++#endif
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_segment_addr = sh->slice_segment_addr,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ .nal_unit_type = h->nal_unit_type,
++ .nuh_temporal_id_plus1 = h->temporal_id + 1,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_type = sh->slice_type,
++ .colour_plane_id = sh->colour_plane_id,
++ .slice_pic_order_cnt = h->ref->poc,
++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
++ .slice_qp_delta = sh->slice_qp_delta,
++ .slice_cb_qp_offset = sh->slice_cb_qp_offset,
++ .slice_cr_qp_offset = sh->slice_cr_qp_offset,
++ .slice_act_y_qp_offset = 0,
++ .slice_act_cb_qp_offset = 0,
++ .slice_act_cr_qp_offset = 0,
++ .slice_beta_offset_div2 = sh->beta_offset / 2,
++ .slice_tc_offset_div2 = sh->tc_offset / 2,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ .pic_struct = h->sei.picture_timing.picture_struct,
++
++#if HEVC_CTRLS_VERSION < 2
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++#endif
++ };
++
++ if (sh->slice_sample_adaptive_offset_flag[0])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
++
++ if (sh->slice_sample_adaptive_offset_flag[1])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
++
++ if (sh->slice_temporal_mvp_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
++
++ if (sh->mvd_l1_zero_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
++
++ if (sh->cabac_init_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
++
++ if (sh->collocated_list == L0)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
++
++ if (sh->disable_deblocking_filter_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
++
++ if (sh->slice_loop_filter_across_slices_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (sh->dependent_slice_segment_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
++
++#if HEVC_CTRLS_VERSION < 2
++ dpb_n = fill_dpb_entries(h, dpb);
++ slice_params->num_active_dpb_entries = dpb_n;
++#endif
++ // Reference lists are sent as indices into dpb[] (frames are matched by timestamp)
++ if (sh->slice_type != HEVC_SLICE_I) {
++ rpl = &h->ref->refPicList[0];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B) {
++ rpl = &h->ref->refPicList[1];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++ }
++
++ fill_pred_table(h, &slice_params->pred_weight_table);
++ // Up to v3 the entry point offsets are copied into the slice params (at most 256); v4 only sets the count here
++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
++#if HEVC_CTRLS_VERSION <= 3
++ if (slice_params->num_entry_point_offsets > 256) {
++ slice_params->num_entry_point_offsets = 256;
++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
++ }
++
++ for (i = 0; i < slice_params->num_entry_point_offsets; i++)
++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
++#endif
++}
++
++#if HEVC_CTRLS_VERSION >= 2
++static void
++fill_decode_params(const HEVCContext * const h,
++ struct v4l2_ctrl_hevc_decode_params * const dec)
++{
++ unsigned int i;
++ // Fill the per-frame decode params: POC, reference set sizes, DPB entries, reference POCs and flags
++ *dec = (struct v4l2_ctrl_hevc_decode_params){
++ .pic_order_cnt_val = h->poc,
++ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++ };
++
++ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
++
++ // The documentation does seem to ask that we fit our 32 bit signed POC into
++ // a U8 so... (To be fair 16 bits would be enough)
++ // Luckily we (Pi) don't use these fields
++ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
++ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
++ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
++ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
++ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
++ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
++
++ if (IS_IRAP(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
++ if (IS_IDR(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
++ if (h->sh.no_output_of_prior_pics_flag)
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
++
++}
++#endif
++
++// Translate FFmpeg's parsed SPS into the V4L2 SPS control; *ctrl is fully overwritten
++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
++{
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_sps) {
++ .chroma_format_idc = sps->chroma_format_idc,
++ .pic_width_in_luma_samples = sps->width,
++ .pic_height_in_luma_samples = sps->height,
++ .bit_depth_luma_minus8 = sps->bit_depth - 8,
++ .bit_depth_chroma_minus8 = sps->bit_depth - 8,
++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
++ .num_short_term_ref_pic_sets = sps->nb_st_rps,
++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
++ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
++ };
++
++ if (sps->separate_colour_plane_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
++
++ if (sps->scaling_list_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
++
++ if (sps->amp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
++
++ if (sps->sao_enabled)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
++
++ if (sps->pcm_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
++
++ if (sps->pcm.loop_filter_disable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
++
++ if (sps->long_term_ref_pics_present_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
++
++ if (sps->sps_temporal_mvp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
++
++ if (sps->sps_strong_intra_smoothing_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
++}
++
++static void fill_scaling_matrix(const ScalingList * const sl,
++ struct v4l2_ctrl_hevc_scaling_matrix * const sm)
++{
++ // Copy the 4x4, 8x8 and 16x16 lists (six matrices each) plus the 16x16 DC terms
++ for (unsigned int m = 0; m != 6; ++m) {
++ for (unsigned int k = 0; k != 16; ++k)
++ sm->scaling_list_4x4[m][k] = sl->sl[0][m][k];
++ for (unsigned int k = 0; k != 64; ++k) {
++ sm->scaling_list_8x8[m][k] = sl->sl[1][m][k];
++ sm->scaling_list_16x16[m][k] = sl->sl[2][m][k];
++ }
++ sm->scaling_list_dc_coef_16x16[m] = sl->sl_dc[0][m];
++ }
++
++ // Only two 32x32 lists are written; they come from source matrix ids 0 and 3
++ for (unsigned int m = 0; m != 2; ++m) {
++ for (unsigned int k = 0; k != 64; ++k)
++ sm->scaling_list_32x32[m][k] = sl->sl[3][m * 3][k];
++ sm->scaling_list_dc_coef_32x32[m] = sl->sl_dc[1][m * 3];
++ }
++}
++
++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
++{
++ uint64_t flags = 0;
++ // Translate FFmpeg's parsed PPS into the V4L2 PPS control: gather the flags first, then the fields
++ if (pps->dependent_slice_segments_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
++
++ if (pps->output_flag_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
++
++ if (pps->sign_data_hiding_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
++
++ if (pps->cabac_init_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
++
++ if (pps->constrained_intra_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
++
++ if (pps->transform_skip_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
++
++ if (pps->cu_qp_delta_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
++
++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
++
++ if (pps->weighted_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
++
++ if (pps->weighted_bipred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
++
++ if (pps->transquant_bypass_enable_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
++
++ if (pps->tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
++
++ if (pps->entropy_coding_sync_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
++
++ if (pps->loop_filter_across_tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
++
++ if (pps->seq_loop_filter_across_slices_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (pps->deblocking_filter_override_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
++
++ if (pps->disable_dbf)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
++
++ if (pps->lists_modification_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
++
++ if (pps->slice_header_extension_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_pps) {
++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
++ .init_qp_minus26 = pps->pic_init_qp_minus26,
++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
++ .pps_cb_qp_offset = pps->cb_qp_offset,
++ .pps_cr_qp_offset = pps->cr_qp_offset,
++ .pps_beta_offset_div2 = pps->beta_offset / 2,
++ .pps_tc_offset_div2 = pps->tc_offset / 2,
++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
++ .flags = flags
++ };
++
++ // Tile geometry is only filled in when tiles are enabled (the fields stay zero otherwise)
++ if (pps->tiles_enabled_flag) {
++ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
++ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
++
++ for (int i = 0; i < pps->num_tile_columns; i++)
++ ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
++
++ for (int i = 0; i < pps->num_tile_rows; i++)
++ ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
++ }
++}
++
++// Runs just before the frame is finally handed back to the user.
++// The corrupt flag is set here because this is the frame structure the
++// user actually receives (in MT land each thread has its own pool)
++static int frame_post_process(void *logctx, AVFrame *frame)
++{
++ const V4L2MediaReqDescriptor * const desc = (V4L2MediaReqDescriptor*)frame->data[0];
++
++ // Start from "not corrupt"; only a failed wait on the dst buffer sets the flag
++ frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
++
++ // With no dst buffer attached there is nothing to wait for
++ if (desc->qe_dst != NULL &&
++ qent_dst_wait(desc->qe_dst) != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
++ frame->flags |= AV_FRAME_FLAG_CORRUPT;
++ }
++
++ return 0;
++}
++
++static inline struct timeval cvt_dpb_to_tv(uint64_t t)
++{
++ // Drop the x1000 scaling applied by cvt_timestamp_to_dpb(), then split into s + us
++ const uint64_t usecs = t / 1000;
++ const struct timeval tv = { .tv_sec = usecs / 1000000, .tv_usec = usecs % 1000000 };
++
++ return tv;
++}
++
++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
++{
++ return UINT64_C(1000) * t; // 64-bit multiply, so large counter values cannot wrap
++}
++
++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{
++ const HEVCContext *h = avctx->priv_data;
++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ // Per-frame setup: queue the decode entry, stamp the frame and make sure it has a dst buffer
++ // Returns 0, or AVERROR(ENOMEM) if no dst buffer could be allocated
++ decode_q_add(&ctx->decode_q, &rd->decode_ent);
++ // NOTE(review): num_offsets is not reset here - presumably done elsewhere (end_frame?); confirm
++ rd->num_slices = 0;
++ ctx->timestamp++;
++ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
++ // Hook frame_post_process() in as this frame's post_process callback
++ {
++ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
++ fdd->post_process = frame_post_process;
++ }
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ ff_thread_finish_setup(avctx); // Allow next thread to enter v4l2_request_hevc_start_frame
++
++ return 0;
++}
++
++// Object fd & size will be zapped by this & need setting later
++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
++{
++ AVDRMLayerDescriptor *layer = &desc->layers[0];
++ unsigned int width;
++ unsigned int height;
++ unsigned int bpl;
++ uint32_t pixelformat;
++ // Describe `format` as a DRM frame; returns 0, or -1 if the pixel format is not handled
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++ width = format->fmt.pix_mp.width;
++ height = format->fmt.pix_mp.height;
++ pixelformat = format->fmt.pix_mp.pixelformat;
++ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline;
++ }
++ else {
++ width = format->fmt.pix.width;
++ height = format->fmt.pix.height;
++ pixelformat = format->fmt.pix.pixelformat;
++ bpl = format->fmt.pix.bytesperline;
++ }
++
++ switch (pixelformat) {
++ case V4L2_PIX_FMT_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++ break;
++ case V4L2_PIX_FMT_NV12_10_COL128:
++ layer->format = DRM_FORMAT_P030;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++ break;
++#endif
++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
++ case V4L2_PIX_FMT_SUNXI_TILED_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
++ break;
++#endif
++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
++ case V4L2_PIX_FMT_NV15:
++ layer->format = DRM_FORMAT_NV15;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ case V4L2_PIX_FMT_NV16:
++ layer->format = DRM_FORMAT_NV16;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
++ case V4L2_PIX_FMT_NV20:
++ layer->format = DRM_FORMAT_NV20;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ default:
++ return -1;
++ }
++ // Every handled format is described as 1 object, 1 layer and 2 planes
++ desc->nb_objects = 1;
++ desc->objects[0].fd = -1;
++ desc->objects[0].size = 0;
++
++ desc->nb_layers = 1;
++ layer->nb_planes = 2;
++
++ layer->planes[0].object_index = 0;
++ layer->planes[0].offset = 0;
++ layer->planes[0].pitch = bpl;
++#if CONFIG_SAND
++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width;
++ layer->planes[1].pitch = width;
++ }
++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
++ layer->planes[1].pitch = width * 2;
++ }
++ else
++#endif
++ {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = layer->planes[0].pitch * height;
++ layer->planes[1].pitch = layer->planes[0].pitch;
++ }
++
++ return 0;
++}
++
++static int
++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
++ struct req_controls *const controls,
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
++ void * const offsets, const size_t offset_count)
++{
++ int rv;
++#if HEVC_CTRLS_VERSION >= 2
++ unsigned int n = 3;
++#else
++ unsigned int n = 2;
++#endif
++ // Set SPS, PPS (+ decode params for v2+) and the optional controls below on mreq; returns the ext_ctrls result
++ struct v4l2_ext_control control[6] = { // room for 3 fixed + slice params + scaling matrix + entry point offsets
++ {
++ .id = V4L2_CID_STATELESS_HEVC_SPS,
++ .ptr = &controls->sps,
++ .size = sizeof(controls->sps),
++ },
++ {
++ .id = V4L2_CID_STATELESS_HEVC_PPS,
++ .ptr = &controls->pps,
++ .size = sizeof(controls->pps),
++ },
++#if HEVC_CTRLS_VERSION >= 2
++ {
++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
++ .ptr = dec,
++ .size = sizeof(*dec),
++ },
++#endif
++ };
++ // Optional controls are appended after the n fixed ones above
++ if (slices)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
++ .ptr = slices,
++ .size = sizeof(*slices) * slice_count,
++ };
++
++ if (controls->has_scaling)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
++ .ptr = &controls->scaling_matrix,
++ .size = sizeof(controls->scaling_matrix),
++ };
++
++#if HEVC_CTRLS_VERSION >= 4
++ if (offsets)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
++ .ptr = offsets,
++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
++ };
++#endif
++
++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
++
++ return rv;
++}
++
++// Record one slice (data pointer/length, slice params, entry point offsets) in the frame's descriptor.
++// Only works because we started from a single coded frame buffer that stays intact until after end_frame
++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ int bcount = get_bits_count(&h->HEVClc->gb);
++ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
++ // boff = bits consumed by the header parser + 8 for every extra byte ptr_from_index() stepped over
++ const unsigned int n = rd->num_slices;
++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
++
++ int rv;
++ struct slice_info * si;
++
++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer
++ // that contains the entire frame including the start code
++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
++ buffer -= 3;
++ size += 3;
++ boff += 24;
++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
++ buffer[0], buffer[1], buffer[2]);
++ }
++ }
++
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
++ if (rd->slices == NULL) {
++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->slices->ptr = buffer;
++ rd->num_slices = 1;
++ }
++ rd->slices->len = buffer - rd->slices->ptr + size;
++ return 0;
++ }
++ // Not frame based: each slice gets its own slice_info & slice_params entry
++ if ((rv = slice_add(rd)) != 0)
++ return rv;
++
++ si = rd->slices + n;
++ si->ptr = buffer;
++ si->len = size;
++ si->n_offsets = rd->num_offsets;
++ // Later slices of a block are made relative to the block's first slice, whose len grows to cover them
++ if (n != block_start) {
++ struct slice_info *const si0 = rd->slices + block_start;
++ const size_t offset = (buffer - si0->ptr);
++ boff += offset * 8;
++ size += offset;
++ si0->len = si->len + offset;
++ }
++
++#if HEVC_CTRLS_VERSION >= 2
++ if (n == 0)
++ fill_decode_params(h, &rd->dec);
++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
++#else
++ fill_slice_params(h, rd->slice_params + n, size * 8, boff);
++#endif
++ if (ctx->max_offsets != 0 &&
++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
++ return rv;
++
++ return 0;
++}
++
++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2RequestContextHEVC * ctx;
++ V4L2MediaReqDescriptor * rd;
++ if (h->ref == NULL)
++ return; // No current picture, so there is nothing to undo
++ ctx = avctx->internal->hwaccel_priv_data;
++ rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++ media_request_abort(&rd->req);
++ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++}
++
++static int send_slice(AVCodecContext * const avctx,
++ V4L2MediaReqDescriptor * const rd,
++ struct req_controls *const controls,
++ const unsigned int i, const unsigned int j)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ // Submit slices [i, j) of rd as one media request; returns 0 or a negative AVERROR
++ const int is_last = (j == rd->num_slices);
++ struct slice_info *const si = rd->slices + i;
++ struct media_request * req = NULL;
++ struct qent_src * src = NULL;
++ MediaBufsStatus stat;
++ void * offsets = rd->offsets + rd->slices[i].n_offsets;
++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
++
++ if ((req = media_request_get(ctx->mpool)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++
++ if (set_req_ctls(ctx, req,
++ controls,
++#if HEVC_CTRLS_VERSION >= 2
++ &rd->dec,
++#endif
++ rd->slice_params + i, j - i,
++ offsets, n_offsets)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
++ goto fail1;
++ }
++
++ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
++ goto fail1;
++ }
++
++ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
++ goto fail2;
++ }
++
++ if (qent_src_params_set(src, &controls->tv)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
++ goto fail2;
++ }
++ // NOTE(review): req & src are not aborted if this fails - presumably start_request consumes them; confirm
++ stat = mediabufs_start_request(ctx->mbufs, &req, &src,
++ i == 0 ? rd->qe_dst : NULL,
++ is_last);
++
++ if (stat != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
++ return AVERROR_UNKNOWN;
++ }
++ return 0;
++
++fail2:
++ mediabufs_src_qent_abort(ctx->mbufs, &src);
++fail1:
++ media_request_abort(&req);
++ return AVERROR_UNKNOWN;
++}
++
++// Finish a frame: build the SPS/PPS/scaling controls, wait for this frame's
++// turn in the decode queue, make sure a destination buffer exists, queue the
++// accumulated slices in batches of at most ctx->max_slices and fill in the
++// DRM PRIME descriptor for the output.
++//
++// Returns 0 on success or a negative AVERROR code. Once the decode queue has
++// been waited on, the frame's entry is removed from it again on both the
++// success and the failure path.
++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ struct req_controls rc;
++ unsigned int i;
++ int rv;
++
++ // It is possible, though maybe a bug, to get an end_frame without
++ // a previous start_frame. If we do then give up.
++ if (!decode_q_in_q(&rd->decode_ent)) {
++ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
++ return AVERROR_INVALIDDATA;
++ }
++
++ {
++ // Scaling list source: the PPS list if it carries one, else the SPS
++ // list if scaling is enabled there, else none
++ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
++ &h->ps.pps->scaling_list :
++ h->ps.sps->scaling_list_enable_flag ?
++ &h->ps.sps->scaling_list : NULL;
++
++
++ memset(&rc, 0, sizeof(rc));
++ rc.tv = cvt_dpb_to_tv(rd->timestamp);
++ fill_sps(&rc.sps, h->ps.sps);
++ fill_pps(&rc.pps, h->ps.pps);
++ if (sl) {
++ rc.has_scaling = 1;
++ fill_scaling_matrix(sl, &rc.scaling_matrix);
++ }
++ }
++
++ decode_q_wait(&ctx->decode_q, &rd->decode_ent);
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ // Alloc almost certainly wants to be serialised if there is any chance of blocking
++ // so we get the next frame to be free in the thread that needs it for decode first.
++ //
++ // In our current world this probably isn't a concern but put it here anyway
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ }
++
++ // Send as slices
++ // Each pass sends the half-open range [i, e) of at most max_slices slices
++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
++ goto fail;
++ }
++
++ // Set the drm_prime descriptor
++ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
++ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
++ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
++
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return 0;
++
++fail:
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return rv;
++}
++
++// Non-zero iff v lies within the control's inclusive [minimum, maximum] range
++static inline int
++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
++{
++    if (v < c->minimum)
++        return 0;
++    return !(v > c->maximum);
++}
++
++// Initial check & init
++//
++// Checks that the driver exposes the HEVC stateless controls this build
++// (HEVC_CTRLS_VERSION) expects, with matching element sizes, and then tries
++// to set an SPS built from the current stream.
++// Returns 0 if everything matched, AVERROR(EINVAL) otherwise.
++static int
++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++ const HEVCContext *h = avctx->priv_data;
++ const HEVCSPS * const sps = h->ps.sps;
++ struct v4l2_ctrl_hevc_sps ctrl_sps;
++ unsigned int i;
++
++ // Check for var slice array
++ // qc[] and ctrl_sizes[] below are parallel arrays indexed together
++ struct v4l2_query_ext_ctrl qc[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_SPS },
++ { .id = V4L2_CID_STATELESS_HEVC_PPS },
++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
++#if HEVC_CTRLS_VERSION >= 2
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
++#endif
++ };
++ // Order & size must match!
++ static const size_t ctrl_sizes[] = {
++ sizeof(struct v4l2_ctrl_hevc_slice_params),
++ sizeof(int32_t),
++ sizeof(struct v4l2_ctrl_hevc_sps),
++ sizeof(struct v4l2_ctrl_hevc_pps),
++ sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
++#if HEVC_CTRLS_VERSION >= 2
++ sizeof(struct v4l2_ctrl_hevc_decode_params),
++#endif
++ };
++ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
++
++ // V2 controls are refused on driver versions >= 5.18.0 and V3 controls
++ // on anything older than that
++#if HEVC_CTRLS_VERSION == 2
++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++ return AVERROR(EINVAL);
++#elif HEVC_CTRLS_VERSION == 3
++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++ return AVERROR(EINVAL);
++#endif
++
++ // The return value is not checked; a control whose type is still 0 after
++ // the query is treated as missing by the checks below
++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
++ i = 0;
++#if HEVC_CTRLS_VERSION >= 4
++ // Skip slice check if no slice mode
++ // (starting at i = 1 leaves qc[0], the slice params control, unchecked)
++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ i = 1;
++#else
++ // Fail frame mode silently for anything prior to V4
++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ return AVERROR(EINVAL);
++#endif
++ // Every remaining control must exist and have the element size we built with
++ for (; i != noof_ctrls; ++i) {
++ if (qc[i].type == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
++ return AVERROR(EINVAL);
++ }
++ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
++ return AVERROR(EINVAL);
++ }
++ }
++
++ fill_sps(&ctrl_sps, sps);
++
++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++// Final init
++//
++// Queries the driver's limits, chooses the decode mode and start code
++// setting to use (stored in ctx along with max_slices and max_offsets) and
++// writes both choices to the device.
++// Returns 0 on success or a negative AVERROR code.
++static int
++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++ int ret;
++
++ // Indices into querys[] are used directly below:
++ // 0 = decode mode, 1 = start code, 2 = slice params, 3 = entry point offsets
++ struct v4l2_query_ext_ctrl querys[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
++#if HEVC_CTRLS_VERSION >= 4
++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
++#endif
++ };
++
++ struct v4l2_ext_control ctrls[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ };
++
++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
++
++ // One slice per request unless slice params is a 1-D dynamic array with a
++ // non-zero dimension, in which case that dimension is the limit
++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
++ 1 : querys[2].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
++
++#if HEVC_CTRLS_VERSION >= 4
++ // 0 (no entry point offsets) if the control is absent or not 1-D
++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
++ 0 : querys[3].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
++#else
++ ctx->max_offsets = 0;
++#endif
++
++ // Decode mode: take the driver default if it is one of the two known
++ // modes, otherwise try frame based and then slice based
++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
++ ctx->decode_mode = querys[0].default_value;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ // Start code: same scheme - driver default if known, otherwise try
++ // Annex B and then none
++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
++ ctx->start_code = querys[1].default_value;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ // If we are in slice mode & START_CODE_NONE supported then pick that
++ // as it doesn't require the slightly dodgy look backwards in our raw buffer
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++
++ ctrls[0].value = ctx->decode_mode;
++ ctrls[1].value = ctx->start_code;
++
++ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
++ // NOTE(review): assumes ret is a negated errno on failure - confirm
++ return !ret ? 0 : AVERROR(-ret);
++}
++
++// AVBuffer free callback for a frame descriptor created by
++// v4l2_req_frame_alloc().
++//
++// opaque: the AVCodecContext given at creation (only its pointer is logged)
++// data:   the V4L2MediaReqDescriptor to release
++//
++// Drops the reference on the destination buffer, frees the slice, slice
++// parameter and offset arrays and finally the descriptor itself.
++static void v4l2_req_frame_free(void *opaque, uint8_t *data)
++{
++    AVCodecContext *avctx = opaque;
++    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
++
++    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
++
++    qent_dst_unref(&rd->qe_dst);
++
++    // We don't expect req or qe_src to be set
++    // The pointers are passed in the order the message names them
++    if (rd->req || rd->qe_src)
++        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->qe_src, rd->req);
++
++    av_freep(&rd->slices);
++    av_freep(&rd->slice_params);
++    av_freep(&rd->offsets);
++
++    av_free(rd);
++}
++
++// Allocate a zeroed block of `size` bytes and wrap it in an AVBufferRef whose
++// free callback is v4l2_req_frame_free(), with the codec context as opaque.
++// Returns NULL if either the block or the buffer ref cannot be allocated.
++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
++{
++    AVCodecContext * const avctx = opaque;
++    uint8_t * data = av_mallocz(size);
++    AVBufferRef * ref;
++
++    if (data == NULL)
++        return NULL;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
++
++    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
++    if (ref == NULL)
++        av_freep(&data);  // no ref took ownership, so release the block here
++    return ref;
++}
++
++#if 0
++static void v4l2_req_pool_free(void *opaque)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++ av_buffer_pool_uninit(&hwfc->pool);
++}
++#endif
++
++// Fill in a hardware frames context from the current destination format:
++// format is DRM PRIME, sw_format is derived from the V4L2 format and the
++// dimensions come from the multi-planar or single-planar format fields as
++// appropriate. Always returns 0.
++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
++
++ hwfc->format = AV_PIX_FMT_DRM_PRIME;
++ hwfc->sw_format = pixel_format_from_format(vfmt);
++ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
++ hwfc->width = vfmt->fmt.pix_mp.width;
++ hwfc->height = vfmt->fmt.pix_mp.height;
++ } else {
++ hwfc->width = vfmt->fmt.pix.width;
++ hwfc->height = vfmt->fmt.pix.height;
++ }
++ // Compiled out: pool setup is not done here, so pool and
++ // initial_pool_size are left as the caller supplied them
++#if 0
++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
++ if (!hwfc->pool)
++ return AVERROR(ENOMEM);
++
++ hwfc->free = v4l2_req_hwframe_ctx_free;
++
++ hwfc->initial_pool_size = 1;
++
++ switch (avctx->codec_id) {
++ case AV_CODEC_ID_VP9:
++ hwfc->initial_pool_size += 8;
++ break;
++ case AV_CODEC_ID_VP8:
++ hwfc->initial_pool_size += 3;
++ break;
++ default:
++ hwfc->initial_pool_size += 2;
++ }
++#endif
++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
++
++ return 0;
++}
++
++// Allocate the per-frame request descriptor and attach it to `frame`.
++//
++// frame->buf[0] / frame->data[0] are set to a freshly allocated, zeroed
++// V4L2MediaReqDescriptor, frame->hw_frames_ctx gets a new reference to the
++// codec's frames context and the generic decode data is attached.
++//
++// Returns 0 on success. Returns AVERROR(ENOMEM) if the descriptor or the
++// frames context reference cannot be allocated, or the error from
++// ff_attach_decode_data(); the frame is unreferenced on every failure after
++// the descriptor has been attached.
++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++    int rv;
++
++    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
++    if (!frame->buf[0])
++        return AVERROR(ENOMEM);
++
++    frame->data[0] = frame->buf[0]->data;
++
++    // av_buffer_ref() returns NULL on allocation failure; don't hand back a
++    // "successful" frame that has no frames context
++    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
++    if (!frame->hw_frames_ctx) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to reference hw frames context\n");
++        av_frame_unref(frame);
++        return AVERROR(ENOMEM);
++    }
++
++    if ((rv = ff_attach_decode_data(frame)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
++        av_frame_unref(frame);
++        return rv;
++    }
++
++    return 0;
++}
++
++// Function table exporting this file's HEVC request decoder entry points.
++// NOTE(review): V() and STR() are defined elsewhere; presumably V() makes the
++// symbol name specific to HEVC_CTRLS_VERSION so several versions can be
++// built side by side - confirm.
++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
++ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
++ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
++ .probe = probe,
++ .set_controls = set_controls,
++
++ .start_frame = v4l2_request_hevc_start_frame,
++ .decode_slice = v4l2_request_hevc_decode_slice,
++ .end_frame = v4l2_request_hevc_end_frame,
++ .abort_frame = v4l2_request_hevc_abort_frame,
++ .frame_params = frame_params,
++ .alloc_frame = alloc_frame,
++};
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.c
+@@ -0,0 +1,1601 @@
++/*
++ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <errno.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++#include <linux/media.h>
++#include <sys/ioctl.h>
++#include <sys/select.h>
++#include <sys/ioctl.h>
++
++#include <linux/videodev2.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++#include "weak_link.h"
++
++
++/*
++ * floor(log2(x)) for any size_t value.
++ *
++ * Works by binary reduction: each step tests whether anything is set above
++ * the low 2^k bits and, if so, adds k to the result and shifts x down.
++ * Returns 0 for x == 0 as well as for x == 1.
++ */
++static unsigned int log2_size(size_t x)
++{
++    unsigned int n = 0;
++
++    /* Only relevant where size_t is wider than 32 bits. The shift is done
++     * in two halves so it stays well defined on 32-bit size_t too. */
++    if (sizeof(x) > 4 && ((x >> 16) >> 16) != 0) {
++        n += 32;
++        x = (x >> 16) >> 16;
++    }
++    if (x & ~0xffff) {
++        n += 16;
++        x >>= 16;
++    }
++    if (x & ~0xff) {
++        n += 8;
++        x >>= 8;
++    }
++    if (x & ~0xf) {
++        n += 4;
++        x >>= 4;
++    }
++    if (x & ~3) {
++        n += 2;
++        x >>= 2;
++    }
++    return (x & ~1) ? n + 1 : n;
++}
++
++/*
++ * Round a requested size up to a value of the form 3 * 2^n or 4 * 2^n,
++ * where n is floor(log2(x)) - 1 (fixed at 8 for x < 256), choosing the
++ * smaller of the two that is greater than x.
++ *
++ * The shifts are done in size_t so large requests cannot overflow int.
++ */
++static size_t round_up_size(const size_t x)
++{
++    /* Admit no size < 256 */
++    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
++    const size_t three = (size_t)3 << n;
++
++    return x >= three ? (size_t)4 << n : three;
++}
++
++struct media_request;
++
++struct media_pool {
++ int fd;
++ sem_t sem;
++ pthread_mutex_t lock;
++ struct media_request * free_reqs;
++ struct pollqueue * pq;
++};
++
++struct media_request {
++ struct media_request * next;
++ struct media_pool * mp;
++ int fd;
++ struct polltask * pt;
++};
++
++
++/*
++ * sem_trywait() that retries when interrupted by a signal.
++ * Returns 0 if the semaphore was taken, otherwise the negated errno.
++ */
++static inline int do_trywait(sem_t *const sem)
++{
++    for (;;) {
++        if (sem_trywait(sem) == 0)
++            return 0;
++        if (errno != EINTR)
++            return -errno;
++    }
++}
++
++/*
++ * sem_wait() that retries when interrupted by a signal.
++ * Returns 0 once the semaphore has been taken, otherwise the negated errno.
++ */
++static inline int do_wait(sem_t *const sem)
++{
++    for (;;) {
++        if (sem_wait(sem) == 0)
++            return 0;
++        if (errno != EINTR)
++            return -errno;
++    }
++}
++
++/*
++ * Issue VIDIOC_REQBUFS on video_fd for buffers_count buffers of the given
++ * queue type and memory model.
++ * Returns 0 on success or the negated errno (which is also logged).
++ */
++static int request_buffers(int video_fd, unsigned int type,
++                           enum v4l2_memory memory, unsigned int buffers_count)
++{
++    struct v4l2_requestbuffers reqbufs;
++    int err;
++
++    memset(&reqbufs, 0, sizeof(reqbufs));
++    reqbufs.count = buffers_count;
++    reqbufs.type = type;
++    reqbufs.memory = memory;
++
++    if (ioctl(video_fd, VIDIOC_REQBUFS, &reqbufs) >= 0)
++        return 0;
++
++    err = -errno;
++    request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-err));
++    return err;
++}
++
++
++/*
++ * Turn streaming on (enable) or off (!enable) for the given queue type.
++ * Returns 0 on success or the negated errno (which is also logged).
++ */
++static int set_stream(int video_fd, unsigned int type, bool enable)
++{
++    enum v4l2_buf_type buf_type = type;
++    int err;
++
++    if (ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF, &buf_type) >= 0)
++        return 0;
++
++    err = -errno;
++    request_log("Unable to %sable stream: %s\n",
++                enable ? "en" : "dis", strerror(-err));
++    return err;
++}
++
++
++
++/*
++ * Take a free media request from the pool, blocking on the pool semaphore
++ * until one is available.
++ * Returns NULL if the wait fails or the free list turns out to be empty.
++ */
++struct media_request * media_request_get(struct media_pool * const mp)
++{
++    struct media_request * taken;
++
++    /* Timeout handled by poll code */
++    if (do_wait(&mp->sem) != 0)
++        return NULL;
++
++    pthread_mutex_lock(&mp->lock);
++    taken = mp->free_reqs;
++    if (taken != NULL) {
++        /* Unlink the head of the free list */
++        mp->free_reqs = taken->next;
++        taken->next = NULL;
++    }
++    pthread_mutex_unlock(&mp->lock);
++
++    return taken;
++}
++
++int media_request_fd(const struct media_request * const req)
++{
++ return req->fd;
++}
++
++/*
++ * Queue the media request (MEDIA_REQUEST_IOC_QUEUE, retried on EINTR) and
++ * arm its poll task with a timeout argument of 2000.
++ * Returns 0 on success or the negated errno if the ioctl fails.
++ */
++int media_request_start(struct media_request * const req)
++{
++    for (;;) {
++        int err;
++
++        if (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) != -1)
++            break;
++
++        err = errno;
++        if (err == EINTR)
++            continue;
++
++        request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
++        return -err;
++    }
++
++    pollqueue_add_task(req->pt, 2000);
++    return 0;
++}
++
++/*
++ * Return a media request to its pool's free list.
++ * Used as the poll task callback (v is the request, revents is not looked
++ * at) and called directly by media_request_abort().
++ */
++static void media_request_done(void *v, short revents)
++{
++ struct media_request *const req = v;
++ struct media_pool *const mp = req->mp;
++
++ /* ** Not sure what to do about timeout */
++
++ /* Reinit so the request fd can be reused; failure is only logged */
++ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
++ request_log("Unable to reinit media request: %s\n",
++ strerror(errno));
++
++ /* Push onto the free list first, then post so a waiter finds it there */
++ pthread_mutex_lock(&mp->lock);
++ req->next = mp->free_reqs;
++ mp->free_reqs = req;
++ pthread_mutex_unlock(&mp->lock);
++ sem_post(&mp->sem);
++}
++
++/*
++ * Give back a request that was obtained but never started.
++ * *preq is cleared and the request is returned to its pool; a NULL *preq is
++ * ignored. Always returns 0.
++ */
++int media_request_abort(struct media_request ** const preq)
++{
++    struct media_request * const req = *preq;
++
++    if (req != NULL) {
++        *preq = NULL;
++        media_request_done(req, 0);
++    }
++    return 0;
++}
++
++/*
++ * Free every request on a singly linked chain: delete its poll task if it
++ * has one, close its fd if it has one, then free the request itself.
++ */
++static void delete_req_chain(struct media_request * const chain)
++{
++    struct media_request * cur;
++    struct media_request * following;
++
++    for (cur = chain; cur != NULL; cur = following) {
++        /* Fetch the link before the node is freed */
++        following = cur->next;
++
++        if (cur->pt != NULL)
++            polltask_delete(&cur->pt);
++        if (cur->fd != -1)
++            close(cur->fd);
++        free(cur);
++    }
++}
++
++/*
++ * Create a pool of n media requests on the media device at media_path.
++ *
++ * Each request gets its own fd (MEDIA_IOC_REQUEST_ALLOC) and a poll task on
++ * pq that calls media_request_done() on POLLPRI. The pool semaphore is
++ * initialised to n, one count per free request.
++ *
++ * Returns the new pool, or NULL on any failure; everything acquired up to
++ * the point of failure is released again.
++ */
++struct media_pool * media_pool_new(const char * const media_path,
++                                   struct pollqueue * const pq,
++                                   const unsigned int n)
++{
++    struct media_pool * const mp = calloc(1, sizeof(*mp));
++    unsigned int i;
++
++    if (!mp)
++        goto fail0;
++
++    mp->pq = pq;
++    pthread_mutex_init(&mp->lock, NULL);
++    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
++    if (mp->fd == -1) {
++        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
++        goto fail1;
++    }
++
++    for (i = 0; i != n; ++i) {
++        struct media_request * req = malloc(sizeof(*req));
++        if (!req)
++            goto fail4;
++
++        /* Link in before anything else so the failure path can find it */
++        *req = (struct media_request){
++            .next = mp->free_reqs,
++            .mp = mp,
++            .fd = -1
++        };
++        mp->free_reqs = req;
++
++        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
++            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
++            goto fail4;
++        }
++
++        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
++        if (!req->pt)
++            goto fail4;
++    }
++
++    if (sem_init(&mp->sem, 0, n) != 0) {
++        request_log("Failed to init request semaphore: %s\n", strerror(errno));
++        goto fail4;
++    }
++
++    return mp;
++
++fail4:
++    delete_req_chain(mp->free_reqs);
++    close(mp->fd);
++fail1:
++    /* The mutex is initialised before open(), so destroy it on this path too */
++    pthread_mutex_destroy(&mp->lock);
++    free(mp);
++fail0:
++    return NULL;
++}
++
++/*
++ * Destroy a pool created by media_pool_new() and clear the caller's pointer.
++ * Only requests currently on the free list are released. A NULL *pMp is
++ * ignored.
++ */
++void media_pool_delete(struct media_pool ** pMp)
++{
++    struct media_pool * const pool = *pMp;
++
++    if (pool != NULL) {
++        *pMp = NULL;
++
++        delete_req_chain(pool->free_reqs);
++        close(pool->fd);
++        sem_destroy(&pool->sem);
++        pthread_mutex_destroy(&pool->lock);
++        free(pool);
++    }
++}
++
++
++#define INDEX_UNSET (~(uint32_t)0)
++
++enum qent_status {
++ QENT_NEW = 0, // Initial state - shouldn't last
++ QENT_FREE, // On free chain
++ QENT_PENDING, // User has ent
++ QENT_WAITING, // On inuse
++ QENT_DONE, // Frame rx
++ QENT_ERROR, // Error
++ QENT_IMPORT
++};
++
++struct qent_base {
++ atomic_int ref_count;
++ struct qent_base *next;
++ struct qent_base *prev;
++ enum qent_status status;
++ uint32_t index;
++ struct dmabuf_h *dh[VIDEO_MAX_PLANES];
++ struct timeval timestamp;
++};
++
++struct qent_src {
++ struct qent_base base;
++ int fixed_size;
++};
++
++struct qent_dst {
++ struct qent_base base;
++ bool waiting;
++ pthread_mutex_t lock;
++ pthread_cond_t cond;
++ struct ff_weak_link_client * mbc_wl;
++};
++
++struct qe_list_head {
++ struct qent_base *head;
++ struct qent_base *tail;
++};
++
++struct buf_pool {
++ pthread_mutex_t lock;
++ sem_t free_sem;
++ enum v4l2_buf_type buf_type;
++ struct qe_list_head free;
++ struct qe_list_head inuse;
++};
++
++
++/* Recover the enclosing qent_dst from its base; a plain cast suffices
++ * because base is the first member of struct qent_dst. NULL maps to NULL. */
++static inline struct qent_dst *base_to_dst(struct qent_base *be)
++{
++ return (struct qent_dst *)be;
++}
++
++/* Recover the enclosing qent_src from its base; a plain cast suffices
++ * because base is the first member of struct qent_src. NULL maps to NULL. */
++static inline struct qent_src *base_to_src(struct qent_base *be)
++{
++ return (struct qent_src *)be;
++}
++
++
++#define QENT_BASE_INITIALIZER {\
++ .ref_count = ATOMIC_VAR_INIT(0),\
++ .status = QENT_NEW,\
++ .index = INDEX_UNSET\
++}
++
++/* Free the dmabuf handle in every plane slot of the entry and clear the slots */
++static void qe_base_uninit(struct qent_base *const be)
++{
++    struct dmabuf_h ** slot = be->dh;
++    struct dmabuf_h ** const end = be->dh + VIDEO_MAX_PLANES;
++
++    for (; slot != end; ++slot) {
++        dmabuf_free(*slot);
++        *slot = NULL;
++    }
++}
++
++/* Release a source entry and the dmabufs it holds; NULL is ignored */
++static void qe_src_free(struct qent_src *const be_src)
++{
++    if (be_src != NULL) {
++        qe_base_uninit(&be_src->base);
++        free(be_src);
++    }
++}
++
++/*
++ * Allocate a source entry in its initial state (status QENT_NEW, index
++ * unset, no dmabufs, fixed_size 0). Returns NULL if out of memory.
++ */
++static struct qent_src * qe_src_new(void)
++{
++    struct qent_src * ent = malloc(sizeof(*ent));
++
++    if (ent != NULL) {
++        *ent = (struct qent_src){
++            .base = QENT_BASE_INITIALIZER
++        };
++    }
++    return ent;
++}
++
++/*
++ * Release a destination entry: drop its weak link reference, destroy its
++ * condition variable and mutex, free its dmabufs and then the entry.
++ * NULL is ignored.
++ */
++static void qe_dst_free(struct qent_dst *const be_dst)
++{
++    if (be_dst != NULL) {
++        ff_weak_link_unref(&be_dst->mbc_wl);
++        pthread_cond_destroy(&be_dst->cond);
++        pthread_mutex_destroy(&be_dst->lock);
++        qe_base_uninit(&be_dst->base);
++        free(be_dst);
++    }
++}
++
++/*
++ * Allocate a destination entry in its initial state, with statically
++ * initialised mutex/cond and a new weak link reference taken from wl.
++ * Returns NULL if out of memory.
++ */
++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
++{
++    struct qent_dst * ent = malloc(sizeof(*ent));
++
++    if (ent != NULL) {
++        *ent = (struct qent_dst){
++            .mbc_wl = ff_weak_link_ref(wl),
++            .cond = PTHREAD_COND_INITIALIZER,
++            .lock = PTHREAD_MUTEX_INITIALIZER,
++            .base = QENT_BASE_INITIALIZER
++        };
++    }
++    return ent;
++}
++
++/* Append be to the end of the doubly linked list ql */
++static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
++{
++    struct qent_base * const old_tail = ql->tail;
++
++    be->next = NULL;
++    be->prev = old_tail;
++
++    if (old_tail != NULL)
++        old_tail->next = be;
++    else
++        ql->head = be;  /* list was empty */
++
++    ql->tail = be;
++}
++
++/*
++ * Unlink be from the doubly linked list ql and clear its links.
++ * Returns be, or NULL if be was NULL. be is not checked for membership.
++ */
++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
++{
++    struct qent_base * before;
++    struct qent_base * after;
++
++    if (be == NULL)
++        return NULL;
++
++    before = be->prev;
++    after = be->next;
++
++    if (after != NULL)
++        after->prev = before;
++    else
++        ql->tail = before;  /* be was the tail */
++
++    if (before != NULL)
++        before->next = after;
++    else
++        ql->head = after;   /* be was the head */
++
++    be->prev = NULL;
++    be->next = NULL;
++    return be;
++}
++
++
++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
++{
++ ql_add_tail(&bp->free, be);
++}
++
++static struct qent_base * bq_get_free(struct buf_pool *const bp)
++{
++ return ql_extract(&bp->free, bp->free.head);
++}
++
++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
++{
++ return ql_extract(&bp->inuse, be);
++}
++
++static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
++{
++ return ql_extract(&bp->inuse, bp->inuse.head);
++}
++
++/* Drain the pool's free list, freeing each entry as a source entry */
++static void bq_free_all_free_src(struct buf_pool *const bp)
++{
++    for (;;) {
++        struct qent_base * const ent = bq_get_free(bp);
++        if (ent == NULL)
++            break;
++        qe_src_free(base_to_src(ent));
++    }
++}
++
++/* Drain the pool's in-use list, freeing each entry as a source entry */
++static void bq_free_all_inuse_src(struct buf_pool *const bp)
++{
++    for (;;) {
++        struct qent_base * const ent = bq_get_inuse(bp);
++        if (ent == NULL)
++            break;
++        qe_src_free(base_to_src(ent));
++    }
++}
++
++/* Drain the pool's free list, freeing each entry as a destination entry */
++static void bq_free_all_free_dst(struct buf_pool *const bp)
++{
++    for (;;) {
++        struct qent_base * const ent = bq_get_free(bp);
++        if (ent == NULL)
++            break;
++        qe_dst_free(base_to_dst(ent));
++    }
++}
++
++/*
++ * Return an entry to the pool's free list.
++ * Under the pool lock its timestamp is zeroed, its status set to QENT_FREE
++ * and the used length of each attached dmabuf reset to 0; the free
++ * semaphore is posted after the lock has been dropped.
++ */
++static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
++{
++    unsigned int plane = 0;
++
++    pthread_mutex_lock(&bp->lock);
++
++    /* Clear out state vars */
++    be->timestamp.tv_sec = 0;
++    be->timestamp.tv_usec = 0;
++    be->status = QENT_FREE;
++
++    /* Planes are filled from 0 upwards, so stop at the first empty slot */
++    while (plane < VIDEO_MAX_PLANES && be->dh[plane] != NULL) {
++        dmabuf_len_set(be->dh[plane], 0);
++        ++plane;
++    }
++
++    bq_put_free(bp, be);
++    pthread_mutex_unlock(&bp->lock);
++
++    sem_post(&bp->free_sem);
++}
++
++static bool queue_is_inuse(const struct buf_pool *const bp)
++{
++ return bp->inuse.tail != NULL;
++}
++
++/*
++ * Append an entry to the pool's in-use list and mark it QENT_WAITING, both
++ * under the pool lock. A NULL entry is ignored.
++ */
++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
++{
++    if (be != NULL) {
++        pthread_mutex_lock(&bp->lock);
++        ql_add_tail(&bp->inuse, be);
++        be->status = QENT_WAITING;
++        pthread_mutex_unlock(&bp->lock);
++    }
++}
++
++/*
++ * Take the first entry off the pool's free list, blocking on the free
++ * semaphore until one is available. Returns NULL if the wait fails.
++ */
++static struct qent_base *queue_get_free(struct buf_pool *const bp)
++{
++    struct qent_base * ent = NULL;
++
++    if (do_wait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        ent = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return ent;
++}
++
++/*
++ * Non-blocking variant of queue_get_free(): returns NULL straight away if
++ * the free semaphore cannot be taken.
++ */
++static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
++{
++    struct qent_base * ent = NULL;
++
++    if (do_trywait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        ent = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return ent;
++}
++
++/*
++ * Find the in-use entry whose plane 0 dmabuf has the given fd and remove it
++ * from the in-use list. Returns the entry, or NULL if none matches.
++ */
++static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
++{
++    struct qent_base * hit;
++
++    pthread_mutex_lock(&bp->lock);
++
++    /* Expect 1st in Q, but allow anywhere */
++    hit = bp->inuse.head;
++    while (hit != NULL && dmabuf_fd(hit->dh[0]) != fd)
++        hit = hit->next;
++
++    if (hit != NULL)
++        bq_extract_inuse(bp, hit);
++
++    pthread_mutex_unlock(&bp->lock);
++
++    return hit;
++}
++
++/*
++ * Destroy a buffer pool's semaphore and mutex and free the pool itself.
++ * Entries still on the free or in-use lists are not touched here, so the
++ * caller has to have released them beforehand.
++ */
++static void queue_delete(struct buf_pool *const bp)
++{
++ sem_destroy(&bp->free_sem);
++ pthread_mutex_destroy(&bp->lock);
++ free(bp);
++}
++
++/*
++ * Allocate an empty buffer pool: zeroed lists, an initialised mutex and a
++ * free semaphore that starts at 0. Returns NULL if out of memory.
++ * vfd is not used by this function.
++ */
++static struct buf_pool* queue_new(const int vfd)
++{
++ struct buf_pool *bp = calloc(1, sizeof(*bp));
++ if (!bp)
++ return NULL;
++ pthread_mutex_init(&bp->lock, NULL);
++ sem_init(&bp->free_sem, 0, 0);
++ return bp;
++}
++
++
++struct mediabufs_ctl {
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ void * dc;
++ int vfd;
++ bool stream_on;
++ bool polling;
++ bool dst_fixed; // Dst Q is fixed size
++ pthread_mutex_t lock;
++ struct buf_pool * src;
++ struct buf_pool * dst;
++ struct polltask * pt;
++ struct pollqueue * pq;
++ struct ff_weak_link_master * this_wlm;
++
++ struct v4l2_format src_fmt;
++ struct v4l2_format dst_fmt;
++ struct v4l2_capability capability;
++};
++
++/*
++ * Queue a DMABUF-backed entry on the video device with VIDIOC_QBUF.
++ *
++ * be        entry to queue; its index, dmabufs and timestamp are used
++ * vfd       video device fd
++ * mreq      media request to attach (source buffers only), may be NULL
++ * fmt       format of the queue; selects buffer type and planar layout
++ * is_dst    true for a destination buffer: used lengths and the timestamp
++ *           are zeroed before queueing and no request is attached
++ * hold_flag with a request attached, also set
++ *           V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF
++ *
++ * Returns 0 on success or the negated errno from the ioctl.
++ */
++static int qe_v4l2_queue(struct qent_base *const be,
++ const int vfd, struct media_request *const mreq,
++ const struct v4l2_format *const fmt,
++ const bool is_dst, const bool hold_flag)
++{
++ struct v4l2_buffer buffer = {
++ .type = fmt->type,
++ .memory = V4L2_MEMORY_DMABUF,
++ .index = be->index
++ };
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ /* One v4l2_plane per attached dmabuf, stopping at the first empty slot */
++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (is_dst)
++ dmabuf_len_set(be->dh[i], 0);
++
++ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
++ planes[i].length = dmabuf_size(be->dh[i]);
++ planes[i].bytesused = dmabuf_len(be->dh[i]);
++ planes[i].m.fd = dmabuf_fd(be->dh[i]);
++ }
++ buffer.m.planes = planes;
++ /* In the multi-planar case length is the number of planes filled in */
++ buffer.length = i;
++ }
++ else {
++ if (is_dst)
++ dmabuf_len_set(be->dh[0], 0);
++
++ buffer.bytesused = dmabuf_len(be->dh[0]);
++ buffer.length = dmabuf_size(be->dh[0]);
++ buffer.m.fd = dmabuf_fd(be->dh[0]);
++ }
++
++ if (!is_dst && mreq) {
++ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
++ buffer.request_fd = media_request_fd(mreq);
++ if (hold_flag)
++ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
++ }
++
++ if (is_dst)
++ be->timestamp = (struct timeval){0,0};
++
++ buffer.timestamp = be->timestamp;
++
++ /* Retry if interrupted by a signal; any other error is fatal */
++ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
++ const int err = errno;
++ if (err != EINTR) {
++ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
++ return -err;
++ }
++ }
++ return 0;
++}
++
++/*
++ * Dequeue one buffer from the video device (VIDIOC_DQBUF) and match it back
++ * to the pool entry that owns it.
++ *
++ * The entry is found by the dmabuf fd of plane 0 and removed from the pool's
++ * in-use list. Its timestamp is taken from the dequeued buffer and its
++ * status set to QENT_ERROR if the buffer carries V4L2_BUF_FLAG_ERROR,
++ * otherwise QENT_DONE.
++ *
++ * Returns the entry, or NULL if the ioctl fails or no entry matches.
++ */
++static struct qent_base * qe_dequeue(struct buf_pool *const bp,
++ const int vfd,
++ const struct v4l2_format * const f)
++{
++ int fd;
++ struct qent_base *be;
++ int rc;
++ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++ struct v4l2_buffer buffer = {
++ .type = f->type,
++ .memory = V4L2_MEMORY_DMABUF
++ };
++ if (mp) {
++ buffer.length = f->fmt.pix_mp.num_planes;
++ buffer.m.planes = planes;
++ }
++
++ /* Retry if interrupted by a signal */
++ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
++ errno == EINTR)
++ /* Loop */;
++ if (rc) {
++ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
++ return NULL;
++ }
++
++ /* Identify the buffer by the fd of its first (or only) plane */
++ fd = mp ? planes[0].m.fd : buffer.m.fd;
++ be = queue_find_extract_fd(bp, fd);
++ if (!be) {
++ request_log("Failed to find fd %d in Q\n", fd);
++ return NULL;
++ }
++
++ be->timestamp = buffer.timestamp;
++ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
++ return be;
++}
++
++/*
++ * Mark a destination entry as no longer waiting, wake every thread blocked
++ * on its condition variable and then drop one reference to it.
++ */
++static void qe_dst_done(struct qent_dst * dst_be)
++{
++ pthread_mutex_lock(&dst_be->lock);
++ dst_be->waiting = false;
++ pthread_cond_broadcast(&dst_be->cond);
++ pthread_mutex_unlock(&dst_be->lock);
++
++ /* Only the local copy of the pointer is passed to the unref */
++ qent_dst_unref(&dst_be);
++}
++
++/*
++ * Test-and-set of the entry's waiting flag under its lock.
++ * Returns the previous value; the flag is true afterwards in every case.
++ */
++static bool qe_dst_waiting(struct qent_dst *const dst_be)
++{
++    bool was_waiting;
++
++    pthread_mutex_lock(&dst_be->lock);
++    was_waiting = dst_be->waiting;
++    dst_be->waiting = true;
++    pthread_mutex_unlock(&dst_be->lock);
++
++    return was_waiting;
++}
++
++
++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
++{
++ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
++}
++
++/*
++ * Poll callback for the video device fd (v is the mediabufs_ctl).
++ *
++ * revents == 0 is reported as a timeout. POLLOUT dequeues a source buffer,
++ * POLLIN a destination buffer. If either queue still has buffers in use the
++ * poll task is re-armed. Dequeued buffers are handed on only after
++ * mbc->lock has been released: the source entry goes back on the free list
++ * and the destination entry is completed via qe_dst_done().
++ */
++static void mediabufs_poll_cb(void * v, short revents)
++{
++ struct mediabufs_ctl *mbc = v;
++ struct qent_src *src_be = NULL;
++ struct qent_dst *dst_be = NULL;
++
++ if (!revents)
++ request_err(mbc->dc, "%s: Timeout\n", __func__);
++
++ pthread_mutex_lock(&mbc->lock);
++ mbc->polling = false;
++
++ if ((revents & POLLOUT) != 0)
++ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
++ if ((revents & POLLIN) != 0)
++ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
++
++ /* Reschedule */
++ if (mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (src_be)
++ queue_put_free(mbc->src, &src_be->base);
++ if (dst_be)
++ qe_dst_done(dst_be);
++}
++
++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
++{
++ struct qent_base *const be = &be_src->base;
++
++ be->timestamp = *timestamp;
++ return 0;
++}
++
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
++{
++ return be_dst->base.timestamp;
++}
++
++/*
++ * Make sure plane 0 of the entry has a dmabuf of at least len bytes.
++ *
++ * Nothing is done if the existing buffer is already big enough. Otherwise a
++ * buffer of round_up_size(len) bytes is requested through dbsc.
++ *
++ * Returns 0 on success, -ENOMEM if growth is needed but dbsc is NULL or the
++ * reallocation fails.
++ */
++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
++        size_t newsize = round_up_size(len);
++        /* All three values are size_t, so the conversion must be %zu */
++        request_log("%s: Overrun %zu > %zu; trying %zu\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
++        if (!dbsc) {
++            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
++            return -ENOMEM;
++        }
++        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
++            request_log("%s: Realloc %zu failed\n", __func__, newsize);
++            return -ENOMEM;
++        }
++    }
++    return 0;
++}
++
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ struct qent_base *const be = &be_src->base;
++ return qent_base_realloc(be, len, dbsc);
++}
++
++// Copy len bytes from src to offset within the src qent's first dmabuf; returns 0 on success, non-zero on failure
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    void * dst;
++    struct qent_base *const be = &be_src->base;
++    int rv;
++    // Realloc doesn't copy so don't alloc if offset != 0
++    if ((rv = qent_base_realloc(be, offset + len,
++                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
++        return rv;
++    dmabuf_write_start(be->dh[0]);
++    dst = dmabuf_map(be->dh[0]);
++    if (!dst) {
++        dmabuf_write_end(be->dh[0]);  // Balance write_start on the error path
++        return -1;
++    }
++    memcpy((char*)dst + offset, src, len);
++    dmabuf_len_set(be->dh[0], len);
++    dmabuf_write_end(be->dh[0]);
++    return 0;
++}
++
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ const struct qent_base *const be = &be_dst->base;
++
++ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
++}
++
++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
++}
++// Queue dst_be (optional) and *psrc_be to V4L2 against *pmreq, then start the request; *pmreq and *psrc_be are always consumed
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final)
++{
++ struct media_request * mreq = *pmreq;
++ struct qent_src *const src_be = *psrc_be;
++
++ // Req & src are always both "consumed"
++ *pmreq = NULL;
++ *psrc_be = NULL;
++
++ pthread_mutex_lock(&mbc->lock);
++
++ if (!src_be)
++ goto fail1;
++
++ if (dst_be) {
++ if (qe_dst_waiting(dst_be)) { // Also sets dst_be->waiting = true as a side effect
++ request_info(mbc->dc, "Request buffer already waiting on start\n");
++ goto fail1;
++ }
++ dst_be->base.timestamp = (struct timeval){0,0};
++ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
++ goto fail1;
++ // Take an extra ref before placing the buffer on the in-use dst Q
++ qent_dst_ref(dst_be);
++ queue_put_inuse(mbc->dst, &dst_be->base);
++ }
++
++ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
++ goto fail1;
++ queue_put_inuse(mbc->src, &src_be->base);
++ // Schedule the poll task (2000ms timeout) unless one is already pending
++ if (!mbc->polling && mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (media_request_start(mreq)) // NOTE(review): src/dst stay on the in-use Qs if this fails - confirm that is intended
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail1:
++ media_request_abort(&mreq);
++ if (src_be)
++ queue_put_free(mbc->src, &src_be->base);
++
++// *** TODO: If src Q fails this doesn't unwind properly - separate dst Q from src Q
++ if (dst_be) {
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++static int qe_alloc_from_fmt(struct qent_base *const be,
++ struct dmabufs_ctl *const dbsc,
++ const struct v4l2_format *const fmt)
++{
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
++ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
++ fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
++ /* On failure tidy up and die */
++ if (!be->dh[i]) {
++ while (i--) {
++ dmabuf_free(be->dh[i]);
++ be->dh[i] = NULL;
++ }
++ return -1;
++ }
++ }
++ }
++ else {
++// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
++ size_t size = fmt->fmt.pix.sizeimage;
++ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
++ if (!be->dh[0])
++ return -1;
++ }
++ return 0;
++}
++
++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
++ const enum v4l2_buf_type buftype,
++ uint32_t pixfmt,
++ const unsigned int width, const unsigned int height,
++ const size_t bufsize)
++{
++ *fmt = (struct v4l2_format){.type = buftype};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ fmt->fmt.pix_mp.width = width;
++ fmt->fmt.pix_mp.height = height;
++ fmt->fmt.pix_mp.pixelformat = pixfmt;
++ if (bufsize) {
++ fmt->fmt.pix_mp.num_planes = 1;
++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
++ }
++ }
++ else {
++ fmt->fmt.pix.width = width;
++ fmt->fmt.pix.height = height;
++ fmt->fmt.pix.pixelformat = pixfmt;
++ fmt->fmt.pix.sizeimage = bufsize;
++ }
++
++ while (ioctl(fd, VIDIOC_S_FMT, fmt))
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ // Treat anything where we don't get at least what we asked for as a fail
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ if (fmt->fmt.pix_mp.width < width ||
++ fmt->fmt.pix_mp.height < height ||
++ fmt->fmt.pix_mp.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++ else {
++ if (fmt->fmt.pix.width < width ||
++ fmt->fmt.pix.height < height ||
++ fmt->fmt.pix.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++// Walk VIDIOC_ENUM_FMT for type_v4l2 and fmt_set() the first format with all flags_must, no flags_not, that accept_fn accepts; any non-EINTR enum failure returns UNSUPPORTED_BUFFERTYPE
++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
++ const int fd,
++ const unsigned int type_v4l2,
++ const uint32_t flags_must,
++ const uint32_t flags_not,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ unsigned int i;
++
++ for (i = 0;; ++i) {
++ struct v4l2_fmtdesc fmtdesc = {
++ .index = i,
++ .type = type_v4l2
++ };
++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ if ((fmtdesc.flags & flags_must) != flags_must ||
++ (fmtdesc.flags & flags_not))
++ continue;
++ if (!accept_fn(accept_v, &fmtdesc))
++ continue;
++ // A format that passes the filters but fails fmt_set is skipped, not fatal
++ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
++ width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
++ return MEDIABUFS_STATUS_SUCCESS;
++ }
++ return 0; // Not reached: the loop above only exits via return
++}
++
++
++/* Wait for qent done */
++
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ enum qent_status estat;
++
++ pthread_mutex_lock(&be_dst->lock);
++ while (be_dst->waiting &&
++ !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
++ /* Loop */;
++ estat = be->status;
++ pthread_mutex_unlock(&be_dst->lock);
++
++ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
++ estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
++ MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++// Return the dmabuf_map() mapping of plane buf_no of the dst qent; NULL if buf_no is outside dh[]
++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
++{
++    struct qent_base *const be = &be_dst->base;
++    return buf_no >= sizeof(be->dh)/sizeof(be->dh[0]) ? NULL : dmabuf_map(be->dh[buf_no]);
++}
++
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_start(be->dh[i])) {
++ while (i--)
++ dmabuf_read_end(be->dh[i]);
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++ }
++ }
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_end(be->dh[i]))
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ return status;
++}
++
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
++{
++ if (be_dst)
++ atomic_fetch_add(&be_dst->base.ref_count, 1);
++ return be_dst;
++}
++// Drop one reference and NULL the caller's pointer; when the count was already 0 the qent goes back to its mbc's free dst Q, or is freed if the mbc weak link cannot be locked
++void qent_dst_unref(struct qent_dst ** const pbe_dst)
++{
++ struct qent_dst * const be_dst = *pbe_dst;
++ struct mediabufs_ctl * mbc;
++ if (!be_dst)
++ return;
++ *pbe_dst = NULL;
++
++ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0) // fetch_sub yields the old value: non-zero means other refs remain
++ return;
++
++ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
++ queue_put_free(mbc->dst, &be_dst->base);
++ ff_weak_link_unlock(be_dst->mbc_wl);
++ }
++ else {
++ qe_dst_free(be_dst);
++ }
++}
++// Attach an imported dmabuf (fd, size) to an empty plane of a qent that is in QENT_IMPORT state
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++                                   unsigned int plane,
++                                   int fd, size_t size)
++{
++    struct qent_base *const be = &be_dst->base;
++    struct dmabuf_h * dh;
++    // Reject planes outside dh[] before indexing it, as qent_dst_dmabuf() does
++    if (plane >= sizeof(be->dh)/sizeof(be->dh[0]) || be->status != QENT_IMPORT || be->dh[plane])
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    dh = dmabuf_import(fd, size);
++    if (!dh)
++        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++    be->dh[plane] = dh;
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++
++// Creates up to n V4L2 dst buffers; returns the number of qes[] entries given an index, -ve errno on error
++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
++{
++    unsigned int i;
++
++    struct v4l2_create_buffers cbuf = {
++        .count = n,
++        .memory = V4L2_MEMORY_DMABUF,
++        .format = mbc->dst_fmt,
++    };
++
++    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
++        const int err = errno;  // Keep +ve so the EINTR comparison below can actually match
++        if (err != EINTR) {
++            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
++            return -err;
++        }
++    }
++
++    if (cbuf.count != n)
++        request_warn(mbc->dc, "%s: Created %u of %u V4L2 buffers requested\n", __func__, cbuf.count, n);
++    // qes[] only has n entries - never index past it even if the driver reports more
++    for (i = 0; i != cbuf.count && i != n; ++i)
++        qes[i]->base.index = cbuf.index + i;
++
++    return i;
++}
++
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
++{
++ struct qent_dst * be_dst;
++
++ if (mbc == NULL) {
++ be_dst = qe_dst_new(NULL);
++ if (be_dst)
++ be_dst->base.status = QENT_IMPORT;
++ return be_dst;
++ }
++
++ if (mbc->dst_fixed) {
++ be_dst = base_to_dst(queue_get_free(mbc->dst));
++ if (!be_dst)
++ return NULL;
++ }
++ else {
++ be_dst = base_to_dst(queue_tryget_free(mbc->dst));
++ if (!be_dst) {
++ be_dst = qe_dst_new(mbc->this_wlm);
++ if (!be_dst)
++ return NULL;
++
++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
++ qe_dst_free(be_dst);
++ return NULL;
++ }
++ }
++ }
++
++ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
++ /* Given how create buf works we can't uncreate it on alloc failure
++ * all we can do is put it on the free Q
++ */
++ queue_put_free(mbc->dst, &be_dst->base);
++ return NULL;
++ }
++
++ be_dst->base.status = QENT_PENDING;
++ atomic_store(&be_dst->base.ref_count, 0);
++ return be_dst;
++}
++
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
++{
++ return &mbc->dst_fmt;
++}
++// Negotiate the dst format for width x height: try formats without V4L2_FMT_FLAG_EMULATED first, then emulated ones; accept_fn filters candidates
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ MediaBufsStatus status;
++ unsigned int i;
++ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
++ static const struct {
++ unsigned int flags_must;
++ unsigned int flags_not;
++ } trys[] = {
++ {0, V4L2_FMT_FLAG_EMULATED},
++ {V4L2_FMT_FLAG_EMULATED, 0},
++ };
++ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
++ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
++ buf_type,
++ trys[i].flags_must,
++ trys[i].flags_not,
++ width, height, accept_fn, accept_v);
++ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE) // Anything but "no such format" ends the search
++ return status;
++ }
++ // Only reached with status == MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE from the last try
++ if (status != MEDIABUFS_STATUS_SUCCESS)
++ return status;
++
++ /* Try to create a buffer - don't alloc */
++ return status;
++}
++
++// ** This is a mess if we get partial alloc but without any way to remove
++// individual V4L2 Q members we are somewhat stuffed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
++{
++ unsigned int i;
++ int a = 0;
++ unsigned int qc;
++ struct qent_dst * qes[32];
++
++ if (n > 32)
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++ // Create qents first as it is hard to get rid of the V4L2 buffers on error
++ for (qc = 0; qc != n; ++qc)
++ {
++ if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
++ goto fail;
++ }
++
++ if ((a = create_dst_bufs(mbc, n, qes)) < 0)
++ goto fail;
++
++ for (i = 0; i != a; ++i)
++ queue_put_free(mbc->dst, &qes[i]->base);
++
++ if (a != n)
++ goto fail;
++
++ mbc->dst_fixed = fixed;
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ for (i = (a < 0 ? 0 : a); i != qc; ++i)
++ qe_dst_free(qes[i]);
++
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++}
++// Fetch a src qent from the free Q and mark it pending; returns NULL if queue_get_free() yields nothing
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
++{
++    struct qent_base *const buf = queue_get_free(mbc->src);
++    if (buf) buf->status = QENT_PENDING;  // queue_get_free() can return NULL (mediabufs_dst_qent_alloc checks for it)
++    return base_to_src(buf);
++}
++
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
++{
++ struct qent_src *const qe_src = *pqe_src;
++ if (!qe_src)
++ return;
++ *pqe_src = NULL;
++ queue_put_free(mbc->src, &qe_src->base);
++}
++
++/* src format must have been set up before this */
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n)
++{
++ unsigned int i;
++ struct v4l2_requestbuffers req = {
++ .count = n,
++ .type = mbc->src_fmt.type,
++ .memory = V4L2_MEMORY_DMABUF
++ };
++
++ bq_free_all_free_src(mbc->src);
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
++ if (errno != EINTR) {
++ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ }
++
++ if (n > req.count) {
++ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
++ n = req.count;
++ }
++
++ for (i = 0; i != n; ++i) {
++ struct qent_src *const be_src = qe_src_new();
++ if (!be_src) {
++ request_err(mbc->dc, "Failed to create src be %d\n", i);
++ goto fail;
++ }
++ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
++ qe_src_free(be_src);
++ goto fail;
++ }
++ be_src->base.index = i;
++ be_src->fixed_size = !mediabufs_src_resizable(mbc);
++
++ queue_put_free(mbc->src, &be_src->base);
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ bq_free_all_free_src(mbc->src);
++ req.count = 0;
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++
++/*
++ * Set stuff order:
++ * Set src fmt
++ * Set parameters (sps) on vfd
++ * Negotiate dst format (dst_fmt_set)
++ * Create src buffers
++ * Alloc a dst buffer or Create dst slots
++*/
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
++{
++ if (mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
++ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
++ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
++ set_stream(mbc->vfd, mbc->src_fmt.type, false);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = true;
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
++{
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ if (!mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
++ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
++ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = false;
++ return status;
++}
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
++{
++ struct v4l2_ext_controls controls = {
++ .controls = control_array,
++ .count = n
++ };
++
++ if (mreq) {
++ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
++ controls.request_fd = media_request_fd(mreq);
++ }
++
++ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
++ return -err;
++ }
++ }
++
++ return 0;
++}
++
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size)
++{
++ struct v4l2_ext_control control = {
++ .id = id,
++ .ptr = data,
++ .size = size
++ };
++
++ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
++ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize)
++{
++ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
++ if (rv != MEDIABUFS_STATUS_SUCCESS)
++ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
++
++ return rv;
++}
++
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
++{
++ int rv = 0;
++ while (n--) {
++ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
++ const int err = errno;
++ if (err != EINTR) {
++ // Often used for probing - errors are to be expected
++ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
++ ctrls->type = 0; // 0 is invalid
++ rv = -err;
++ break;
++ }
++ }
++ ++ctrls;
++ }
++ return rv;
++}
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
++{
++ // Single planar OUTPUT can only take exact size buffers
++ // Multiplanar will take larger than negotiated
++ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
++}
++
++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
++{
++ if (!mbc)
++ return;
++
++ // Break the weak link first
++ ff_weak_link_break(&mbc->this_wlm);
++
++ polltask_delete(&mbc->pt);
++
++ mediabufs_stream_off(mbc);
++
++ // Empty v4l2 buffer stash
++ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
++ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
++
++ bq_free_all_free_src(mbc->src);
++ bq_free_all_inuse_src(mbc->src);
++ bq_free_all_free_dst(mbc->dst);
++
++ {
++ struct qent_dst *dst_be;
++ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
++ dst_be->base.timestamp = (struct timeval){0};
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be);
++ }
++ }
++
++ queue_delete(mbc->dst);
++ queue_delete(mbc->src);
++ close(mbc->vfd);
++ pthread_mutex_destroy(&mbc->lock);
++
++ free(mbc);
++}
++
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
++{
++ atomic_fetch_add(&mbc->ref_count, 1);
++ return mbc;
++}
++// Drop the caller's reference and NULL their pointer; the ctl is deleted when ref_count was already 0 (a calloc'd ctl starts at 0, so 0 means one holder)
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
++{
++ struct mediabufs_ctl *const mbc = *pmbc;
++ int n;
++
++ if (!mbc)
++ return;
++ *pmbc = NULL;
++ n = atomic_fetch_sub(&mbc->ref_count, 1); // n is the value before the decrement
++ if (n)
++ return;
++ mediabufs_ctl_delete(mbc);
++}
++
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
++{
++ return mbc->capability.version;
++}
++
++static int set_capabilities(struct mediabufs_ctl *const mbc)
++{
++ uint32_t caps;
++
++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
++ int err = errno;
++ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
++ return -err;
++ }
++
++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
++ mbc->capability.device_caps :
++ mbc->capability.capabilities;
++
++ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++ }
++ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++ }
++ else {
++ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/* One of these per context */
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
++{
++ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
++
++ if (!mbc)
++ return NULL;
++
++ mbc->dc = dc;
++ // Default mono planar
++ mbc->pq = pq;
++ pthread_mutex_init(&mbc->lock, NULL);
++
++ /* Pick a default - could we scan for this? */
++ if (vpath == NULL)
++ vpath = "/dev/media0";
++
++ while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
++ goto fail0;
++ }
++ }
++
++ if (set_capabilities(mbc)) {
++ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
++ goto fail1;
++ }
++
++ mbc->src = queue_new(mbc->vfd);
++ if (!mbc->src)
++ goto fail1;
++ mbc->dst = queue_new(mbc->vfd);
++ if (!mbc->dst)
++ goto fail2;
++ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
++ if (!mbc->pt)
++ goto fail3;
++ mbc->this_wlm = ff_weak_link_new(mbc);
++ if (!mbc->this_wlm)
++ goto fail4;
++
++ /* Cannot add polltask now - polling with nothing pending
++ * generates infinite error polls
++ */
++ return mbc;
++
++fail4:
++ polltask_delete(&mbc->pt);
++fail3:
++ queue_delete(mbc->dst);
++fail2:
++ queue_delete(mbc->src);
++fail1:
++ close(mbc->vfd);
++fail0:
++ free(mbc);
++ request_info(dc, "%s: FAILED\n", __func__);
++ return NULL;
++}
++
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.h
+@@ -0,0 +1,154 @@
++/*
++ * v4l2_req_media.h
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _MEDIA_H_
++#define _MEDIA_H_
++
++#include <stdbool.h>
++#include <stdint.h>
++
++struct v4l2_format;
++struct v4l2_fmtdesc;
++struct v4l2_query_ext_ctrl;
++
++struct pollqueue;
++struct media_request;
++struct media_pool;
++
++typedef enum media_buf_status {
++ MEDIABUFS_STATUS_SUCCESS = 0,
++ MEDIABUFS_ERROR_OPERATION_FAILED,
++ MEDIABUFS_ERROR_DECODING_ERROR,
++ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
++ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
++ MEDIABUFS_ERROR_ALLOCATION_FAILED,
++} MediaBufsStatus;
++
++struct media_pool * media_pool_new(const char * const media_path,
++ struct pollqueue * const pq,
++ const unsigned int n);
++void media_pool_delete(struct media_pool ** pmp);
++
++// Obtain a media request
++// Will block if none available - has a 2sec timeout
++struct media_request * media_request_get(struct media_pool * const mp);
++int media_request_fd(const struct media_request * const req);
++
++// Start this request
++// Request structure is returned to pool once done
++int media_request_start(struct media_request * const req);
++
++// Return an *unstarted* media_request to the pool
++// May later be upgraded to allow for aborting a started req
++int media_request_abort(struct media_request ** const preq);
++
++
++struct mediabufs_ctl;
++struct qent_src;
++struct qent_dst;
++struct dmabuf_h;
++struct dmabufs_ctl;
++
++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
++
++// prealloc
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
++// dbsc may be NULL if realloc not required
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
++void qent_dst_delete(struct qent_dst *const be);
++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
++void qent_dst_unref(struct qent_dst ** const pbe_dst);
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
++
++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
++/* Import an fd unattached to any mediabuf */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++ unsigned int plane,
++ int fd, size_t size);
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final);
++// Get / alloc a dst buffer & associate with a slot
++// If the dst pool is empty then behaviour depends on the fixed flag passed to
++// dst_slots_create. Default is !fixed = unlimited alloc
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl *const dbsc);
++// Create dst slots without alloc
++// If fixed true then qent_alloc will only get slots from this pool and will
++// block until a qent has been unrefed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
++
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
++
++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
++
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v);
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
++ struct v4l2_ext_control control_array[], unsigned int n);
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size);
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize);
++
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n);
++
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
++ const char *vpath, struct pollqueue *const pq);
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
++
++
++#endif
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.c
+@@ -0,0 +1,361 @@
++#include <errno.h>
++#include <limits.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <string.h>
++#include <unistd.h>
++#include <sys/eventfd.h>
++
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++
++
++struct pollqueue;
++
++enum polltask_state {
++ POLLTASK_UNQUEUED = 0,
++ POLLTASK_QUEUED,
++ POLLTASK_RUNNING,
++ POLLTASK_Q_KILL,
++ POLLTASK_RUN_KILL,
++};
++
++struct polltask {
++ struct polltask *next;
++ struct polltask *prev;
++ struct pollqueue *q;
++ enum polltask_state state;
++
++ int fd;
++ short events;
++
++ void (*fn)(void *v, short revents);
++ void * v;
++
++ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
++ sem_t kill_sem;
++};
++
++struct pollqueue {
++ atomic_int ref_count;
++ pthread_mutex_t lock;
++
++ struct polltask *head;
++ struct polltask *tail;
++
++ bool kill;
++ bool no_prod;
++ int prod_fd;
++ struct polltask *prod_pt;
++ pthread_t worker;
++};
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents),
++ void *const v)
++{
++ struct polltask *pt;
++
++ if (!events)
++ return NULL;
++
++ pt = malloc(sizeof(*pt));
++ if (!pt)
++ return NULL;
++
++ *pt = (struct polltask){
++ .next = NULL,
++ .prev = NULL,
++ .q = pollqueue_ref(pq),
++ .fd = fd,
++ .events = events,
++ .fn = fn,
++ .v = v
++ };
++
++ sem_init(&pt->kill_sem, 0, 0);
++
++ return pt;
++}
++
++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
++{
++ if (pt->prev)
++ pt->prev->next = pt->next;
++ else
++ pq->head = pt->next;
++ if (pt->next)
++ pt->next->prev = pt->prev;
++ else
++ pq->tail = pt->prev;
++ pt->next = NULL;
++ pt->prev = NULL;
++}
++
++static void polltask_free(struct polltask * const pt)
++{
++ sem_destroy(&pt->kill_sem);
++ free(pt);
++}
++
++static int pollqueue_prod(const struct pollqueue *const pq)
++{
++ static const uint64_t one = 1;
++ return write(pq->prod_fd, &one, sizeof(one));
++}
++
++void polltask_delete(struct polltask **const ppt)
++{
++ struct polltask *const pt = *ppt;
++ struct pollqueue * pq;
++ enum polltask_state state;
++ bool prodme;
++
++ if (!pt)
++ return;
++
++ pq = pt->q;
++ pthread_mutex_lock(&pq->lock);
++ state = pt->state;
++ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
++ prodme = !pq->no_prod;
++ pthread_mutex_unlock(&pq->lock);
++
++ if (state != POLLTASK_UNQUEUED) {
++ if (prodme)
++ pollqueue_prod(pq);
++ while (sem_wait(&pt->kill_sem) && errno == EINTR)
++ /* loop */;
++ }
++
++ // Leave zapping the ref until we have DQed the PT as might well be
++ // legitimately used in it
++ *ppt = NULL;
++ polltask_free(pt);
++ pollqueue_unref(&pq);
++}
++// Returns CLOCK_MONOTONIC time in ms plus timeout (ms); 0 only if clock_gettime fails
++static uint64_t pollqueue_now(int timeout)
++{
++ struct timespec now;
++ uint64_t now_ms;
++
++ if (clock_gettime(CLOCK_MONOTONIC, &now))
++ return 0;
++ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
++ return now_ms ? now_ms : (uint64_t)1; // 0 is reserved for "never" in polltask.timeout, so bump it to 1
++}
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout)
++{
++ bool prodme = false;
++ struct pollqueue * const pq = pt->q;
++
++ pthread_mutex_lock(&pq->lock);
++ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
++ if (pq->tail)
++ pq->tail->next = pt;
++ else
++ pq->head = pt;
++ pt->prev = pq->tail;
++ pt->next = NULL;
++ pt->state = POLLTASK_QUEUED;
++ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
++ pq->tail = pt;
++ prodme = !pq->no_prod;
++ }
++ pthread_mutex_unlock(&pq->lock);
++ if (prodme)
++ pollqueue_prod(pq);
++}
++
++static void *poll_thread(void *v) // Worker thread: poll()s the fds of all queued tasks and runs those that are ready or timed out
++{
++ struct pollqueue *const pq = v;
++ struct pollfd *a = NULL;
++ size_t asize = 0;
++
++ pthread_mutex_lock(&pq->lock);
++ do {
++ unsigned int i;
++ unsigned int n = 0;
++ struct polltask *pt;
++ struct polltask *pt_next;
++ uint64_t now = pollqueue_now(0);
++ int timeout = -1; // -1 => poll() blocks indefinitely unless some task has a deadline
++ int rv;
++
++ for (pt = pq->head; pt; pt = pt_next) { // Pass 1: build the pollfd array from the queue
++ int64_t t;
++
++ pt_next = pt->next; // Fetch now as pt may be unlinked below
++
++ if (pt->state == POLLTASK_Q_KILL) { // Deleted while queued: unlink and release polltask_delete
++ pollqueue_rem_task(pq, pt);
++ sem_post(&pt->kill_sem);
++ continue;
++ }
++
++ if (n >= asize) {
++ asize = asize ? asize * 2 : 4; // Grow by doubling, starting at 4 entries
++ a = realloc(a, asize * sizeof(*a));
++ if (!a) {
++ request_log("Failed to realloc poll array to %zd\n", asize);
++ goto fail_locked;
++ }
++ }
++
++ a[n++] = (struct pollfd){
++ .fd = pt->fd,
++ .events = pt->events
++ };
++
++ t = (int64_t)(pt->timeout - now); // ms until this task's deadline (negative if already passed)
++ if (pt->timeout && t < INT_MAX &&
++ (timeout < 0 || (int)t < timeout))
++ timeout = (t < 0) ? 0 : (int)t; // Keep the soonest deadline; pt->timeout == 0 means none
++ }
++ pthread_mutex_unlock(&pq->lock); // Don't hold the lock while blocked in poll()
++
++ if ((rv = poll(a, n, timeout)) == -1) {
++ if (errno != EINTR) { // EINTR just falls through to the dispatch pass
++ request_log("Poll error: %s\n", strerror(errno));
++ goto fail_unlocked;
++ }
++ }
++
++ pthread_mutex_lock(&pq->lock);
++ now = pollqueue_now(0);
++
++ /* Prodding in this loop is pointless and might lead to
++ * infinite looping
++ */
++ pq->no_prod = true;
++ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) { // Pass 2: only the first n entries are in the pollfd array
++ pt_next = pt->next;
++
++ /* Pending? */
++ if (a[i].revents ||
++ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
++ pollqueue_rem_task(pq, pt);
++ if (pt->state == POLLTASK_QUEUED)
++ pt->state = POLLTASK_RUNNING;
++ if (pt->state == POLLTASK_Q_KILL)
++ pt->state = POLLTASK_RUN_KILL;
++ pthread_mutex_unlock(&pq->lock); // Callback runs without the lock held
++
++ /* This can add new entries to the Q but as
++ * those are added to the tail our existing
++ * chain remains intact
++ */
++ pt->fn(pt->v, a[i].revents);
++
++ pthread_mutex_lock(&pq->lock);
++ if (pt->state == POLLTASK_RUNNING) // Still RUNNING => the callback did not re-queue it
++ pt->state = POLLTASK_UNQUEUED;
++ if (pt->state == POLLTASK_RUN_KILL) // Release the polltask_delete waiting on this task
++ sem_post(&pt->kill_sem);
++ }
++ }
++ pq->no_prod = false;
++
++ } while (!pq->kill); // kill is set by pollqueue_free
++
++fail_locked:
++ pthread_mutex_unlock(&pq->lock);
++fail_unlocked:
++ free(a);
++ return NULL;
++}
++
++static void prod_fn(void *v, short revents) // Task callback for prod_fd: drains the eventfd and re-queues itself
++{
++ struct pollqueue *const pq = v;
++ char buf[8];
++ if (revents)
++ read(pq->prod_fd, buf, 8); // Result ignored; prod_fd was opened with EFD_NONBLOCK
++ if (!pq->kill)
++ pollqueue_add_task(pq->prod_pt, -1); // Re-arm with no timeout unless shutting down
++}
++
++struct pollqueue * pollqueue_new(void) // Create a pollqueue and start its worker thread; returns NULL on any failure
++{
++ struct pollqueue *pq = malloc(sizeof(*pq));
++ if (!pq)
++ return NULL;
++ *pq = (struct pollqueue){
++ .ref_count = ATOMIC_VAR_INIT(0),
++ .lock = PTHREAD_MUTEX_INITIALIZER,
++ .head = NULL,
++ .tail = NULL,
++ .kill = false,
++ .prod_fd = -1
++ };
++
++ pq->prod_fd = eventfd(0, EFD_NONBLOCK);
++ if (pq->prod_fd == -1) // eventfd() reports failure as -1 (this previously tested for 1)
++ goto fail1;
++ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq); // Internal task used to wake the poll thread
++ if (!pq->prod_pt)
++ goto fail2;
++ pollqueue_add_task(pq->prod_pt, -1);
++ if (pthread_create(&pq->worker, NULL, poll_thread, pq))
++ goto fail3;
++ // Reset ref count which will have been inced by the add_task
++ atomic_store(&pq->ref_count, 0);
++ return pq;
++
++fail3:
++ polltask_free(pq->prod_pt); // Plain free: no worker exists so there is nothing to wait for
++fail2:
++ close(pq->prod_fd);
++fail1:
++ free(pq);
++ return NULL;
++}
++
++static void pollqueue_free(struct pollqueue *const pq) // Stop the worker thread and release everything owned by pq
++{
++ void *rv;
++
++ pthread_mutex_lock(&pq->lock);
++ pq->kill = true; // Makes poll_thread leave its loop
++ pollqueue_prod(pq); // Wake it in case it is blocked in poll()
++ pthread_mutex_unlock(&pq->lock);
++
++ pthread_join(pq->worker, &rv);
++ polltask_free(pq->prod_pt);
++ pthread_mutex_destroy(&pq->lock);
++ close(pq->prod_fd);
++ free(pq);
++}
++
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq) // Take an extra reference; returns pq for convenience
++{
++ atomic_fetch_add(&pq->ref_count, 1);
++ return pq;
++}
++
++void pollqueue_unref(struct pollqueue **const ppq) // Drop a reference and NULL *ppq; frees the queue when the last ref goes
++{
++ struct pollqueue * const pq = *ppq;
++
++ if (!pq)
++ return;
++ *ppq = NULL;
++
++ if (atomic_fetch_sub(&pq->ref_count, 1) != 0) // fetch_sub returns the old value; old value 0 means we held the last ref
++ return;
++
++ pollqueue_free(pq);
++}
++
++
++
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.h
+@@ -0,0 +1,18 @@
++#ifndef POLLQUEUE_H_
++#define POLLQUEUE_H_
++
++struct polltask; // Opaque: one fd + event mask + callback
++struct pollqueue; // Opaque: queue of polltasks serviced by a worker thread
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents), // Called with v and the revents from poll()
++ void *const v);
++void polltask_delete(struct polltask **const ppt); // NULL-safe; zaps *ppt
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout); // timeout in ms, < 0 => no timeout
++struct pollqueue * pollqueue_new(void); // Returns NULL on failure
++void pollqueue_unref(struct pollqueue **const ppq); // NULL-safe; zaps *ppq
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
++
++#endif /* POLLQUEUE_H_ */
+--- /dev/null
++++ b/libavcodec/v4l2_req_utils.h
+@@ -0,0 +1,22 @@
++#include "libavutil/log.h"
++
++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
++
++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
++#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
++
++static inline char safechar(char c) { // Map anything outside 0x21..0x7e to '.' so it is safe to print
++ return (c <= 0x20 || c >= 0x7f) ? '.' : c;
++}
++
++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) { // Render fcc into tbuf as 4 printable chars + NUL; returns tbuf
++ unsigned int i;
++ for (i = 0; i != 4; ++i) {
++ tbuf[i] = safechar((fcc >> (8 * i)) & 0xff); // Least significant byte first
++ }
++ tbuf[4] = '\0';
++ return tbuf;
++}
++
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,315 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
++{
++ const size_t pels = w * h;
++ size_t alloc;
++
++ /* Annex A gives a minimum compression ratio of 2 up to level 3.1
++ * (pels <= 983040) and 4 above that. Hold the result flat until
++ * pels reaches 2 * 983040 to avoid the oddity of 983041 having a
++ * lower limit than 983040. The factor of 3/2 is for 4:2:0.
++ */
++ if (pels < 983040)
++ alloc = pels * 3 / 4;
++ else if (pels < 983040 * 2)
++ alloc = 983040 * 3 / 4;
++ else
++ alloc = pels * 3 / 8;
++ alloc += (alloc * bits_minus8) / 8; /* Allow for bit depth */
++ alloc += 0x4000; /* Add a few bytes (16k) for overhead */
++ return alloc;
++}
++
++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{
++ const struct v4l2_req_decode_fns * const api = ((const V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ return api->start_frame(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++ const struct v4l2_req_decode_fns * const api = ((V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ return api->decode_slice(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
++{
++ const struct v4l2_req_decode_fns * const api = ((V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ return api->end_frame(avctx);
++}
++
++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
++{
++ const struct v4l2_req_decode_fns * const api = ((V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ api->abort_frame(avctx);
++}
++
++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++ const struct v4l2_req_decode_fns * const api = ((V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ return api->frame_params(avctx, hw_frames_ctx);
++}
++
++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++ const struct v4l2_req_decode_fns * const api = ((V4L2RequestContextHEVC *)avctx->internal->hwaccel_priv_data)->fns; // Ops selected by init
++ return api->alloc_frame(avctx, frame);
++}
++
++
++static int v4l2_request_hevc_uninit(AVCodecContext *avctx) // hwaccel uninit: releases everything set up by v4l2_request_hevc_init; always returns 0
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode
++
++ mediabufs_ctl_unref(&ctx->mbufs); // Released in the same order as init's failure unwind
++ media_pool_delete(&ctx->mpool);
++ pollqueue_unref(&ctx->pq);
++ dmabufs_ctl_delete(&ctx->dbufs);
++ devscan_delete(&ctx->devscan);
++
++ decode_q_uninit(&ctx->decode_q);
++
++// if (avctx->hw_frames_ctx) {
++// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++// av_buffer_pool_flush(hwfc->pool);
++// }
++ return 0;
++}
++
++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
++{
++ AVCodecContext *const avctx = v;
++ const HEVCContext *const h = avctx->priv_data;
++ const uint32_t pixfmt = fmtdesc->pixelformat;
++
++ // Returns 1 if the offered destination format suits the stream's
++ // luma bit depth, 0 otherwise (including unsupported depths)
++ switch (h->ps.sps->bit_depth) {
++ case 8:
++ return pixfmt == V4L2_PIX_FMT_NV12_COL128 || pixfmt == V4L2_PIX_FMT_NV12;
++ case 10:
++ return pixfmt == V4L2_PIX_FMT_NV12_10_COL128;
++ default:
++ break;
++ }
++ return 0;
++}
++
++static int v4l2_request_hevc_init(AVCodecContext *avctx) // hwaccel init: find a V4L2 request decoder, configure it and start streaming; 0 or AVERROR
++{
++ const HEVCContext *h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ const HEVCSPS * const sps = h->ps.sps;
++ int ret;
++ const struct decdev * decdev;
++ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
++ size_t src_size;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ // Give up immediately if this is something that we have no code to deal with
++ if (h->ps.sps->chroma_format_idc != 1) {
++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
++ return AVERROR_PATCHWELCOME;
++ }
++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
++ return (AVERROR(-ret)); // NOTE(review): presumably devscan_build returns a negative errno - confirm
++ }
++ ret = AVERROR(ENOMEM); // Assume mem fail by default for these
++
++ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
++ {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
++ ret = AVERROR(ENODEV);
++ goto fail0;
++ }
++ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
++ decdev_media_path(decdev), decdev_video_path(decdev));
++
++ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
++ goto fail0;
++ }
++
++ if ((ctx->pq = pollqueue_new()) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
++ goto fail1;
++ }
++
++ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
++ goto fail2;
++ }
++
++ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
++ goto fail3;
++ }
++
++ // Ask for an initial bitbuf size of max size / 4
++ // We will realloc if we need more
++ // Must use sps->h/w as avctx contains cropped size
++ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
++ if (mediabufs_src_resizable(ctx->mbufs))
++ src_size /= 4;
++ // Kludge for conformance tests which break Annex A limits
++ else if (src_size < 0x40000)
++ src_size = 0x40000;
++
++ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
++ sps->width, sps->height, src_size)) {
++ char tbuf1[5];
++ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++ goto fail4;
++ }
++
++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) { // Probe API versions newest first; first success wins
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 4);
++ }
++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 3);
++ }
++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 2);
++ }
++ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 1);
++ }
++ else {
++ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
++ ret = AVERROR(EINVAL);
++ goto fail4;
++ }
++
++ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
++ char tbuf1[5];
++ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height); // NOTE(review): this logs the source fourcc
++ goto fail4;
++ }
++
++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
++ goto fail4;
++ }
++
++ {
++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6); // Default of 6 extra when the user gave none
++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
++ avctx->thread_count, avctx->extra_hw_frames);
++
++ // extra_hw_frames is -1 if unset
++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
++ goto fail4;
++ }
++ }
++
++ if (mediabufs_stream_on(ctx->mbufs)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
++ goto fail4;
++ }
++
++ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
++ goto fail4;
++ }
++
++ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
++ goto fail5;
++ }
++
++ decode_q_init(&ctx->decode_q);
++
++ // Set our s/w format
++ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
++
++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
++ ctx->fns->name,
++ decdev_media_path(decdev), decdev_video_path(decdev));
++
++ return 0;
++
++fail5: // Unwind in reverse order of creation; each label falls through to the next
++ av_buffer_unref(&avctx->hw_frames_ctx);
++fail4:
++ mediabufs_ctl_unref(&ctx->mbufs);
++fail3:
++ media_pool_delete(&ctx->mpool);
++fail2:
++ pollqueue_unref(&ctx->pq);
++fail1:
++ dmabufs_ctl_delete(&ctx->dbufs);
++fail0:
++ devscan_delete(&ctx->devscan);
++ return ret;
++}
++
++const AVHWAccel ff_hevc_v4l2request_hwaccel = { // HEVC hwaccel descriptor; per-frame entry points forward to ctx->fns
++ .name = "hevc_v4l2request",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_DRM_PRIME,
++ .alloc_frame = v4l2_req_hevc_alloc_frame,
++ .start_frame = v4l2_req_hevc_start_frame,
++ .decode_slice = v4l2_req_hevc_decode_slice,
++ .end_frame = v4l2_req_hevc_end_frame,
++ .abort_frame = v4l2_req_hevc_abort_frame,
++ .init = v4l2_request_hevc_init,
++ .uninit = v4l2_request_hevc_uninit,
++ .priv_data_size = sizeof(V4L2RequestContextHEVC), // Size of the hwaccel_priv_data the functions above cast to
++ .frame_params = v4l2_req_hevc_frame_params,
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.h
+@@ -0,0 +1,101 @@
++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
++#define AVCODEC_V4L2_REQUEST_HEVC_H
++
++#include <drm_fourcc.h>
++#include "v4l2_req_decode_q.h"
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#include <linux/videodev2.h>
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
++#endif
++
++#define VCAT(name, version) name##_v##version
++#define V2(n,v) VCAT(n, v)
++#define V(n) V2(n, HEVC_CTRLS_VERSION)
++
++#define S2(x) #x
++#define STR(x) S2(x)
++
++// 1 per decoder
++struct v4l2_req_decode_fns;
++
++typedef struct V4L2RequestContextHEVC {
++// V4L2RequestContext base;
++ const struct v4l2_req_decode_fns * fns; // Ops for the API version that probed OK in v4l2_request_hevc_init
++
++ unsigned int timestamp; // ?? maybe uint64_t
++
++ int decode_mode;
++ int start_code;
++ unsigned int max_slices; // 0 => not wanted (frame mode)
++ unsigned int max_offsets; // 0 => not wanted
++
++ req_decode_q decode_q; // Set up last in init, waited on first in uninit
++
++ struct devscan *devscan; // The remaining members are created in this order by init
++ struct dmabufs_ctl *dbufs;
++ struct pollqueue *pq;
++ struct media_pool * mpool;
++ struct mediabufs_ctl *mbufs;
++} V4L2RequestContextHEVC;
++
++typedef struct v4l2_req_decode_fns { // One instance per supported HEVC control API version
++ int src_pix_fmt_v4l2;
++ const char * name; // Logged by init once a version has been selected
++
++ // Init setup
++ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); // init treats 0 as "this version is usable"
++ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx); // init treats non-zero as failure
++
++ // Passthrough of hwaccel fns
++ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*end_frame)(AVCodecContext *avctx);
++ void (*abort_frame)(AVCodecContext *avctx);
++ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
++} v4l2_req_decode_fns;
++
++
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
++
++#endif
+--- a/libavcodec/vc1dec.c
++++ b/libavcodec/vc1dec.c
+@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCod
+ size = next - start - 4;
+ if (size <= 0)
+ continue;
+- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ init_get_bits(&gb, buf2, buf2_size * 8);
+ switch (AV_RB32(start)) {
+ case VC1_CODE_SEQHDR:
+@@ -689,7 +689,7 @@ static int vc1_decode_frame(AVCodecConte
+ case VC1_CODE_FRAME:
+ if (avctx->hwaccel)
+ buf_start = start;
+- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ break;
+ case VC1_CODE_FIELD: {
+ int buf_size3;
+@@ -706,8 +706,8 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(start + 4, size,
+- slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++ slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
+@@ -718,7 +718,7 @@ static int vc1_decode_frame(AVCodecConte
+ break;
+ }
+ case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
+- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+ init_get_bits(&s->gb, buf2, buf_size2 * 8);
+ ff_vc1_decode_entry_point(avctx, v, &s->gb);
+ break;
+@@ -735,8 +735,8 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(start + 4, size,
+- slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++ slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
+@@ -770,7 +770,7 @@ static int vc1_decode_frame(AVCodecConte
+ ret = AVERROR(ENOMEM);
+ goto err;
+ }
+- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+ init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+ buf_size3 << 3);
+ slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+@@ -779,9 +779,9 @@ static int vc1_decode_frame(AVCodecConte
+ n_slices1 = n_slices - 1;
+ n_slices++;
+ }
+- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
+ } else {
+- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
+ }
+ init_get_bits(&s->gb, buf2, buf_size2*8);
+ } else
+--- a/libavcodec/vc1dsp.c
++++ b/libavcodec/vc1dsp.c
+@@ -32,6 +32,7 @@
+ #include "rnd_avg.h"
+ #include "vc1dsp.h"
+ #include "startcode.h"
++#include "vc1_common.h"
+
+ /* Apply overlap transform to horizontal edge */
+ static void vc1_v_overlap_c(uint8_t *src, int stride)
+@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContex
+ #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+
+ dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
++ dsp->vc1_unescape_buffer = vc1_unescape_buffer;
+
+ if (ARCH_AARCH64)
+ ff_vc1dsp_init_aarch64(dsp);
+--- a/libavcodec/vc1dsp.h
++++ b/libavcodec/vc1dsp.h
+@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
+ * one or more further zero bytes and a one byte.
+ */
+ int (*startcode_find_candidate)(const uint8_t *buf, int size);
++
++ /* Copy a buffer, removing startcode emulation escape bytes as we go */
++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
+ } VC1DSPContext;
+
+ void ff_vc1dsp_init(VC1DSPContext* c);
+--- /dev/null
++++ b/libavcodec/weak_link.c
+@@ -0,0 +1,102 @@
++#include <stdlib.h>
++#include <pthread.h>
++#include <stdatomic.h>
++#include "weak_link.h"
++
++struct ff_weak_link_master { // Shared by the master and every client; clients see it as ff_weak_link_client
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ pthread_rwlock_t lock; // Read-held while a client uses ptr, write-held to break the link
++ void * ptr; // Target object; NULL once the link has been broken
++};
++
++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c) // Client handle -> master; they are the same object
++{
++ return (struct ff_weak_link_master *)c;
++}
++
++struct ff_weak_link_master * ff_weak_link_new(void * p) // Create a link to p holding a single (master) ref; NULL on failure
++{
++ struct ff_weak_link_master * w = malloc(sizeof(*w));
++ if (!w)
++ return NULL;
++ *w = (struct ff_weak_link_master){ .ptr = p }; // Also zeroes ref_count (0 == single ref), which was previously left uninitialised
++ if (pthread_rwlock_init(&w->lock, NULL)) {
++ free(w);
++ return NULL;
++ }
++ return w;
++}
++
++static void weak_link_do_unref(struct ff_weak_link_master * const w) // Drop one ref; destroys and frees w when the last one goes
++{
++ int n = atomic_fetch_sub(&w->ref_count, 1); // n is the count before the decrement
++ if (n)
++ return;
++
++ pthread_rwlock_destroy(&w->lock); // Old count was 0 (== single ref) so nobody else holds w
++ free(w);
++}
++
++// Unref & break link
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink) // NULL-safe; zaps *ppLink
++{
++ struct ff_weak_link_master * const w = *ppLink;
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ pthread_rwlock_wrlock(&w->lock); // Blocks until no client holds the read lock from ff_weak_link_lock
++ w->ptr = NULL; // From now on clients see the link as broken
++ pthread_rwlock_unlock(&w->lock);
++
++ weak_link_do_unref(w); // Drop the master's ref; w is freed once clients have unref'd too
++}
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w) // Take a client ref on w; NULL in => NULL out
++{
++ if (!w)
++ return NULL;
++ atomic_fetch_add(&w->ref_count, 1);
++ return (struct ff_weak_link_client*)w;
++}
++
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink) // Drop a client ref; NULL-safe; zaps *ppLink
++{
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ weak_link_do_unref(w);
++}
++
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink) // Returns the target with the read lock held, or NULL if the link is broken
++{
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++
++ if (!w)
++ return NULL;
++
++ if (pthread_rwlock_rdlock(&w->lock)) // Failure to lock is treated the same as a broken link
++ goto broken;
++
++ if (w->ptr)
++ return w->ptr; // Live link: read lock stays held until the caller calls ff_weak_link_unlock
++
++ pthread_rwlock_unlock(&w->lock);
++
++broken:
++ *ppLink = NULL; // Zap the caller's handle and drop its ref so the link stays broken
++ weak_link_do_unref(w);
++ return NULL;
++}
++
++// Ignores a NULL c (so can be on the return path of both broken & live links)
++void ff_weak_link_unlock(struct ff_weak_link_client * c) // Release the lock taken by ff_weak_link_lock
++{
++ struct ff_weak_link_master * const w = weak_link_x(c);
++ if (w)
++ pthread_rwlock_unlock(&w->lock);
++}
++
++
+--- /dev/null
++++ b/libavcodec/weak_link.h
+@@ -0,0 +1,23 @@
++struct ff_weak_link_master;
++struct ff_weak_link_client;
++
++struct ff_weak_link_master * ff_weak_link_new(void * p);
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
++
++// Returns NULL if link broken - in this case it will also zap
++// *ppLink and unref the weak_link.
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
++//
++// The above does mean that there is a race if this is called simultaneously
++// by two threads using the same weak_link_client (so don't do that)
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
++void ff_weak_link_unlock(struct ff_weak_link_client * c);
++
++
++
++
++
++
+--- a/libavdevice/Makefile
++++ b/libavdevice/Makefile
+@@ -46,6 +46,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)
+ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
+ OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
+ OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o
++OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o
+ OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
+ OBJS-$(CONFIG_XV_OUTDEV) += xv.o
+
+--- a/libavdevice/alldevices.c
++++ b/libavdevice/alldevices.c
+@@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer;
+ extern AVInputFormat ff_v4l2_demuxer;
+ extern AVOutputFormat ff_v4l2_muxer;
+ extern AVInputFormat ff_vfwcap_demuxer;
++extern AVOutputFormat ff_vout_drm_muxer;
++extern AVOutputFormat ff_vout_egl_muxer;
++extern AVOutputFormat ff_vout_rpi_muxer;
+ extern AVInputFormat ff_xcbgrab_demuxer;
+ extern AVOutputFormat ff_xv_muxer;
+
+--- /dev/null
++++ b/libavdevice/drm_vout.c
+@@ -0,0 +1,643 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <unistd.h>
++
++#include <xf86drm.h>
++#include <xf86drmMode.h>
++
++#define TRACE_ALL 0
++
++#define DRM_MODULE "vc4"
++
++#define ERRSTR strerror(errno)
++
++struct drm_setup {
++ int conId;
++ uint32_t crtcId;
++ int crtcIdx; // Passed to find_plane, which tests it as a bit index into possible_crtcs
++ uint32_t planeId; // Filled in by find_plane when the output format changes
++ unsigned int out_fourcc; // Format that planeId was last selected for
++ struct {
++ int x, y, width, height;
++ } compose;
++};
++
++typedef struct drm_aux_s { // Per-frame DRM resources; released by da_uninit
++ unsigned int fb_handle; // 0 => no framebuffer to remove
++ uint32_t bo_handles[AV_DRM_MAX_PLANES]; // GEM handles; 0 => slot unused
++ AVFrame * frame;
++} drm_aux_t;
++
++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
++// we get initial flicker probably due to dodgy drm timing
++#define AUX_SIZE 3
++typedef struct drm_display_env_s
++{
++ AVClass *class;
++
++ int drm_fd; // DRM device fd used for every drm* call in this file
++ uint32_t con_id;
++ struct drm_setup setup;
++ enum AVPixelFormat avfmt;
++ int show_all;
++
++ unsigned int ano; // Index into aux[] of the slot do_display uses next
++ drm_aux_t aux[AUX_SIZE];
++
++ pthread_t q_thread;
++ sem_t q_sem_in;
++ sem_t q_sem_out;
++ int q_terminate;
++ AVFrame * q_next;
++
++} drm_display_env_t;
++
++
++static int drm_vout_write_trailer(AVFormatContext *s) // Nothing to do beyond optional tracing; always returns 0
++{
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++ return 0;
++}
++
++static int drm_vout_write_header(AVFormatContext *s) // Checks the stream layout; 0 if acceptable, AVERROR(EINVAL) otherwise
++{
++ const AVCodecParameters * const par = s->streams[0]->codecpar; // NOTE(review): assumes at least one stream exists - confirm
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++ if ( s->nb_streams > 1
++ || par->codec_type != AVMEDIA_TYPE_VIDEO
++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++static int find_plane(struct AVFormatContext * const avctx, // Find a plane usable on CRTC index crtcidx that supports fourcc format
++ const int drmfd, const int crtcidx, const uint32_t format,
++ uint32_t * const pplane_id) // Out: plane id, written only on success. Returns 0 on success, -1 on failure
++{
++ drmModePlaneResPtr planes;
++ drmModePlanePtr plane;
++ unsigned int i;
++ unsigned int j;
++ int ret = 0;
++
++ planes = drmModeGetPlaneResources(drmfd);
++ if (!planes)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
++ return -1;
++ }
++
++ for (i = 0; i < planes->count_planes; ++i) {
++ plane = drmModeGetPlane(drmfd, planes->planes[i]);
++ if (!plane) // Used to test `planes`, so a failed lookup was dereferenced below
++ {
++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
++ continue; // Skip it: a break here returned 0 with *pplane_id unset
++ }
++
++ if (!(plane->possible_crtcs & (1 << crtcidx))) { // Plane cannot be attached to our CRTC
++ drmModeFreePlane(plane);
++ continue;
++ }
++
++ for (j = 0; j < plane->count_formats; ++j) {
++ if (plane->formats[j] == format)
++ break;
++ }
++
++ if (j == plane->count_formats) { // Plane does not support the requested format
++ drmModeFreePlane(plane);
++ continue;
++ }
++
++ *pplane_id = plane->plane_id;
++ drmModeFreePlane(plane);
++ break;
++ }
++
++ if (i == planes->count_planes) // Ran off the end without a match
++ ret = -1;
++
++ drmModeFreePlaneResources(planes);
++ return ret;
++}
++
++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da) // Release the framebuffer, GEM handles and frame held by da; safe on an empty slot
++{
++ if (da->fb_handle != 0) {
++ drmModeRmFB(de->drm_fd, da->fb_handle);
++ da->fb_handle = 0;
++ }
++
++ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
++ if (da->bo_handles[i]) {
++ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
++ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close); // Result ignored
++ da->bo_handles[i] = 0;
++ }
++ }
++ av_frame_free(&da->frame);
++}
++
++/*
++ * Put one DRM_PRIME frame on screen using the current aux slot.
++ *
++ * Ownership of frame passes to this function: it is either freed here (no
++ * plane supports its format) or stored in the aux slot, from where
++ * da_uninit() frees it the next time that slot is recycled.
++ *
++ * Returns 0 on success, -1 if a plane/handle/framebuffer could not be set
++ * up, otherwise the non-zero result of drmModeSetPlane().
++ */
++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++    drm_aux_t * da = de->aux + de->ano;
++    const uint32_t format = desc->layers[0].format;
++    int ret = 0;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
++#endif
++
++    // Only search for a plane again when the frame format changes
++    if (de->setup.out_fourcc != format) {
++        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
++            av_frame_free(&frame);
++            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
++            return -1;
++        }
++        de->setup.out_fourcc = format;
++    }
++
++    // Wait for a vblank before touching the slot; failures other than EINTR
++    // are ignored and we simply carry on
++    {
++        drmVBlank vbl = {
++            .request = {
++                .type = DRM_VBLANK_RELATIVE,
++                .sequence = 0
++            }
++        };
++
++        while (drmWaitVBlank(de->drm_fd, &vbl)) {
++            if (errno != EINTR) {
++//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
++                break;
++            }
++        }
++    }
++
++    // Drop whatever this slot held from an earlier frame
++    da_uninit(de, da);
++
++    {
++        uint32_t pitches[4] = {0};
++        uint32_t offsets[4] = {0};
++        uint64_t modifiers[4] = {0};
++        uint32_t bo_handles[4] = {0};
++        int i, j, n;
++
++        // From here on the slot owns the frame
++        da->frame = frame;
++
++        for (i = 0; i < desc->nb_objects; ++i) {
++            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
++                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
++                return -1;
++            }
++        }
++
++        // Flatten the layer/plane description into the per-plane arrays that
++        // drmModeAddFB2WithModifiers() takes
++        n = 0;
++        for (i = 0; i < desc->nb_layers; ++i) {
++            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
++                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
++
++                // The arrays above hold 4 planes; refuse rather than overrun
++                if (n >= 4) {
++                    av_log(s, AV_LOG_WARNING, "Too many planes in frame descriptor\n");
++                    return -1;
++                }
++
++                pitches[n] = p->pitch;
++                offsets[n] = p->offset;
++                modifiers[n] = obj->format_modifier;
++                bo_handles[n] = da->bo_handles[p->object_index];
++                ++n;
++            }
++        }
++
++#if 1 && TRACE_ALL
++        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
++               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
++               av_frame_cropped_width(frame),
++               av_frame_cropped_height(frame),
++               desc->layers[0].format,
++               bo_handles[0],
++               bo_handles[1],
++               bo_handles[2],
++               bo_handles[3],
++               pitches[0],
++               pitches[1],
++               pitches[2],
++               pitches[3],
++               offsets[0],
++               offsets[1],
++               offsets[2],
++               offsets[3],
++               (long long)modifiers[0],
++               (long long)modifiers[1],
++               (long long)modifiers[2],
++               (long long)modifiers[3]
++              );
++#endif
++
++        if (drmModeAddFB2WithModifiers(de->drm_fd,
++                                       av_frame_cropped_width(frame),
++                                       av_frame_cropped_height(frame),
++                                       desc->layers[0].format, bo_handles,
++                                       pitches, offsets, modifiers,
++                                       &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
++            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
++            return -1;
++        }
++    }
++
++    // Source rectangle is in 16.16 fixed point, hence the shifts
++    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
++                          da->fb_handle, 0,
++                          de->setup.compose.x, de->setup.compose.y,
++                          de->setup.compose.width,
++                          de->setup.compose.height,
++                          0, 0,
++                          av_frame_cropped_width(frame) << 16,
++                          av_frame_cropped_height(frame) << 16);
++
++    if (ret != 0) {
++        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
++    }
++
++    // Move on to the next aux slot, wrapping at AUX_SIZE
++    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
++
++    return ret;
++}
++
++/*
++ * Take a semaphore, retrying for as long as the call is interrupted (EINTR).
++ *
++ * nowait == 0: block in sem_wait(); nowait != 0: use sem_trywait().
++ * Returns 0 once the semaphore has been taken, otherwise -errno.
++ */
++static int do_sem_wait(sem_t * const sem, const int nowait)
++{
++    for (;;) {
++        const int rv = nowait ? sem_trywait(sem) : sem_wait(sem);
++
++        if (rv == 0)
++            return 0;
++        if (errno != EINTR)
++            return -errno;
++    }
++}
++
++/*
++ * Display thread body; v is the AVFormatContext of the output device.
++ *
++ * Posts q_sem_out once to mark the single-entry queue as free, then for
++ * every post of q_sem_in takes the frame in q_next, frees the queue slot
++ * again and displays the frame. Exits when q_terminate is seen after a
++ * wake-up, releasing all aux slots and any frame still queued.
++ */
++static void * display_thread(void * v)
++{
++    AVFormatContext * const s = v;
++    drm_display_env_t * const de = s->priv_data;
++    int slot;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++#endif
++
++    sem_post(&de->q_sem_out);
++
++    do_sem_wait(&de->q_sem_in, 0);
++    while (!de->q_terminate) {
++        AVFrame * const next = de->q_next;
++
++        // Hand the queue slot back before the (slow) display call
++        de->q_next = NULL;
++        sem_post(&de->q_sem_out);
++
++        do_display(s, de, next);
++
++        do_sem_wait(&de->q_sem_in, 0);
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++#endif
++
++    for (slot = 0; slot < AUX_SIZE; slot++)
++        da_uninit(de, &de->aux[slot]);
++
++    av_frame_free(&de->q_next);
++
++    return NULL;
++}
++
++/*
++ * Muxer write_packet callback: queue the wrapped AVFrame in pkt for display.
++ *
++ * Frames flagged corrupt are discarded. DRM_PRIME frames are referenced,
++ * VAAPI frames are mapped to DRM_PRIME; any other format is rejected.
++ * The new frame is handed to the display thread through q_next. If the
++ * queue slot is not free the frame is dropped, unless show_all is set, in
++ * which case the call blocks until the slot is free.
++ *
++ * Returns 0 when the frame was queued, dropped or discarded,
++ * AVERROR(EINVAL) for an unsupported or unmappable frame, AVERROR(ENOMEM)
++ * or the av_frame_ref() error if the frame could not be set up.
++ */
++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame * const src_frame = (AVFrame *)pkt->data;
++    AVFrame * frame;
++    drm_display_env_t * const de = s->priv_data;
++    int ret;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
++        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
++        return 0;
++    }
++
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME &&
++        src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    frame = av_frame_alloc();
++    if (frame == NULL)
++        return AVERROR(ENOMEM);
++
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        ret = av_frame_ref(frame, src_frame);
++        if (ret < 0) {
++            av_frame_free(&frame);
++            return ret;
++        }
++    }
++    else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        if (av_hwframe_map(frame, src_frame, 0) != 0)
++        {
++            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++            av_frame_free(&frame);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    // nowait unless show_all: a busy display thread means the frame is dropped
++    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
++    if (ret) {
++        av_frame_free(&frame);
++    }
++    else {
++        de->q_next = frame;
++        sem_post(&de->q_sem_in);
++    }
++
++    return 0;
++}
++
++/*
++ * Muxer write_uncoded_frame callback. Does nothing with the frame and
++ * returns 0 both for a capability query (AV_WRITE_UNCODED_FRAME_QUERY set
++ * in flags) and for an actual write.
++ */
++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                                unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    /* drm_vout_write_header() should have accepted only supported formats,
++     * so query and write get the same answer */
++    return 0;
++}
++
++/*
++ * Muxer control_message callback. AV_APP_TO_DEV_WINDOW_REPAINT is accepted
++ * (and ignored) with 0; every other message type yields AVERROR(ENOSYS).
++ */
++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++/*
++ * Choose the connector/CRTC to display on and record the CRTC geometry.
++ *
++ * If s->conId is 0 on entry the connectors are scanned and the first one
++ * that has an encoder with an active CRTC is chosen (s->conId and
++ * s->crtcId are set). s->crtcIdx is then set to the position of s->crtcId
++ * in the resource list and s->compose to that CRTC's x/y/width/height.
++ *
++ * avctx   logging context
++ * drmfd   open DRM device fd
++ * s       setup to fill in (conId/crtcId may be preset by the caller)
++ * pConId  if not NULL, receives the id of the connector in use
++ *
++ * Returns 0 on success, -1 on any failure. All libdrm objects obtained
++ * here are freed before returning.
++ */
++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
++{
++    int ret = -1;
++    int i;
++    drmModeRes *res = drmModeGetResources(drmfd);
++    drmModeConnector *c;
++
++    if (!res)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetResources failed: %s\n", ERRSTR);
++        return -1;
++    }
++
++    if (res->count_crtcs <= 0)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: no crts\n");
++        goto fail_res;
++    }
++
++    if (!s->conId) {
++        av_log(avctx, AV_LOG_INFO,
++               "No connector ID specified. Choosing default from list:\n");
++
++        for (i = 0; i < res->count_connectors; i++) {
++            drmModeConnector *con =
++                drmModeGetConnector(drmfd, res->connectors[i]);
++            drmModeEncoder *enc = NULL;
++            drmModeCrtc *crtc = NULL;
++
++            // A connector that cannot be queried is simply not a candidate
++            if (!con)
++                continue;
++
++            if (con->encoder_id) {
++                enc = drmModeGetEncoder(drmfd, con->encoder_id);
++                if (enc && enc->crtc_id) {
++                    crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
++                }
++            }
++
++            // First connector with a live CRTC wins; later ones are only listed
++            if (!s->conId && crtc) {
++                s->conId = con->connector_id;
++                s->crtcId = crtc->crtc_id;
++            }
++
++            av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
++                   con->connector_id,
++                   crtc ? crtc->crtc_id : 0,
++                   con->connector_type,
++                   crtc ? crtc->width : 0,
++                   crtc ? crtc->height : 0,
++                   (s->conId == (int)con->connector_id ?
++                    " (chosen)" : ""));
++
++            if (crtc)
++                drmModeFreeCrtc(crtc);
++            if (enc)
++                drmModeFreeEncoder(enc);
++            drmModeFreeConnector(con);
++        }
++
++        if (!s->conId) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "No suitable enabled connector found.\n");
++            goto fail_res;
++        }
++    }
++
++    s->crtcIdx = -1;
++
++    for (i = 0; i < res->count_crtcs; ++i) {
++        if (s->crtcId == res->crtcs[i]) {
++            s->crtcIdx = i;
++            break;
++        }
++    }
++
++    if (s->crtcIdx == -1)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
++        goto fail_res;
++    }
++
++    if (res->count_connectors <= 0)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
++        goto fail_res;
++    }
++
++    c = drmModeGetConnector(drmfd, s->conId);
++    if (!c)
++    {
++        av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
++        goto fail_res;
++    }
++
++    if (!c->count_modes)
++    {
++        av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
++        goto fail_conn;
++    }
++
++    {
++        drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
++        if (!crtc)
++        {
++            av_log(avctx, AV_LOG_WARNING, "drmModeGetCrtc failed: %s\n", ERRSTR);
++            goto fail_conn;
++        }
++        s->compose.x = crtc->x;
++        s->compose.y = crtc->y;
++        s->compose.width = crtc->width;
++        s->compose.height = crtc->height;
++        drmModeFreeCrtc(crtc);
++    }
++
++    if (pConId)
++        *pConId = c->connector_id;
++    ret = 0;
++
++fail_conn:
++    drmModeFreeConnector(c);
++
++fail_res:
++    drmModeFreeResources(res);
++
++    return ret;
++}
++
++/*
++ * Muxer init callback: open the DRM device, pick a connector/CRTC and start
++ * the display thread.
++ *
++ * Returns 0 on success or a negative AVERROR code. On failure after the
++ * device was opened the fd is closed again and drm_fd reset to -1.
++ *
++ * deinit is called if init fails so no need to clean up explicitly here
++ * NOTE(review): drm_vout_deinit() posts q_sem_in and joins q_thread
++ * unconditionally; if it really runs after a failure that happened before
++ * the semaphores/thread were created that looks unsafe - confirm.
++ */
++static int drm_vout_init(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++    int rv;
++    const char * drm_module = DRM_MODULE;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->drm_fd = -1;
++    de->con_id = 0;
++    de->setup = (struct drm_setup){0};
++    de->q_terminate = 0;
++
++    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
++    {
++        rv = AVERROR(errno);
++        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
++        return rv;
++    }
++
++    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
++    {
++        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
++        rv = AVERROR(EINVAL);
++        goto fail_close;
++    }
++
++    sem_init(&de->q_sem_in, 0, 0);
++    sem_init(&de->q_sem_out, 0, 0);
++
++    // pthread_create() reports failure through its return value; errno is
++    // not set, so build the AVERROR from the returned code
++    rv = pthread_create(&de->q_thread, NULL, display_thread, s);
++    if (rv != 0) {
++        rv = AVERROR(rv);
++        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
++        goto fail_close;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++    return 0;
++
++fail_close:
++    close(de->drm_fd);
++    de->drm_fd = -1;
++    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
++
++    return rv;
++}
++
++/*
++ * Muxer deinit callback: stop the display thread, then release the
++ * semaphores, every aux slot, any frame still queued and the DRM fd.
++ */
++static void drm_vout_deinit(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++    unsigned int slot;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    // Flag termination first, then wake the thread so that it sees the flag
++    de->q_terminate = 1;
++    sem_post(&de->q_sem_in);
++    pthread_join(de->q_thread, NULL);
++
++    sem_destroy(&de->q_sem_in);
++    sem_destroy(&de->q_sem_out);
++
++    for (slot = 0; slot < AUX_SIZE; slot++)
++        da_uninit(de, &de->aux[slot]);
++
++    av_frame_free(&de->q_next);
++
++    if (de->drm_fd >= 0) {
++        close(de->drm_fd);
++        de->drm_fd = -1;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++
++#define OFFSET(x) offsetof(drm_display_env_t, x) // byte offset of option field x inside the private context struct
++static const AVOption options[] = { // user-settable options of the vout_drm device
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, // default off; when set drm_vout_write_packet() blocks for the queue slot instead of dropping the frame
++ { NULL } // table terminator
++};
++
++static const AVClass drm_vout_class = { // AVClass tying the option table above to the private context
++ .class_name = "drm vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_drm_muxer = { // output device definition: wires the callbacks in this file into a muxer named "vout_drm"
++ .name = "vout_drm",
++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"),
++ .priv_data_size = sizeof(drm_display_env_t), // size of the private context reached through s->priv_data
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, // packets carry an AVFrame (see drm_vout_write_packet)
++ .write_header = drm_vout_write_header,
++ .write_packet = drm_vout_write_packet,
++ .write_uncoded_frame = drm_vout_write_frame,
++ .write_trailer = drm_vout_write_trailer,
++ .control_message = drm_vout_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &drm_vout_class,
++ .init = drm_vout_init,
++ .deinit = drm_vout_deinit,
++};
++
+--- /dev/null
++++ b/libavdevice/egl_vout.c
+@@ -0,0 +1,816 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++// Amongst other issues it doesn't wait for the picture to be displayed before
++// returning the buffer, so flickering does occur.
++
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <unistd.h>
++
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++
++#include "libavutil/rpi_sand_fns.h"
++
++#define TRACE_ALL 0
++
++struct egl_setup { // X11/EGL handles of the output window, filled in by display_thread()/make_window()
++ int conId; // not referenced by the code in this file
++
++ Display *dpy; // X display opened with XOpenDisplay(NULL)
++ EGLDisplay egl_dpy; // EGL display obtained from dpy
++ EGLContext ctx; // GLES2 context created in make_window()
++ EGLSurface surf; // window surface passed to eglSwapBuffers()
++ Window win; // X window created in make_window()
++
++ uint32_t crtcId; // not referenced by the code in this file
++ int crtcIdx; // not referenced by the code in this file
++ uint32_t planeId; // not referenced by the code in this file
++ struct {
++ int x, y, width, height;
++ } compose; // not referenced by the code in this file
++};
++
++typedef struct egl_aux_s { // one texture slot used by do_display()
++ int fd; // fd of the frame's first DRM object while the slot is in use; -1 marks a free slot
++ GLuint texture; // GL texture name, 0 when no texture exists for the slot
++
++} egl_aux_t;
++
++typedef struct egl_display_env_s // private context of the vout_egl device (s->priv_data)
++{
++ AVClass *class; // set up through .priv_class; the option table addresses the fields below by offset
++
++ struct egl_setup setup; // X11/EGL handles, see struct egl_setup
++ enum AVPixelFormat avfmt; // not referenced by the code in this file
++
++ int show_all; // option: when set, egl_vout_write_packet() waits until the queued frame has been taken
++ int window_width, window_height; // option window_size; display_thread() falls back to 1280x720 if either is 0
++ int window_x, window_y; // options window_x / window_y
++ int fullscreen; // option: size the window to the whole screen and remove its border
++
++ egl_aux_t aux[32]; // texture slots; all fds are set to -1 in egl_vout_init()
++
++ pthread_t q_thread; // display thread
++ pthread_mutex_t q_lock; // held while q_next is exchanged
++ sem_t display_start_sem; // posted by the display thread once its startup has succeeded or failed
++ sem_t q_sem; // posted when a frame is put into an empty q_next, and on shutdown
++ int q_terminate; // set to ask the display thread to exit; also set by the thread on startup failure
++ AVFrame * q_this; // frame most recently passed to do_display(), kept until the next one has been shown
++ AVFrame * q_next; // frame waiting to be displayed, NULL if none
++
++} egl_display_env_t;
++
++
++/**
++ * Remove window border/decorations.
++ *
++ * Sets the _MOTIF_WM_HINTS property of window w on display dpy with the
++ * "decorations" flag selected and a decorations value of 0. Does nothing
++ * if the _MOTIF_WM_HINTS atom does not already exist on the server.
++ */
++static void
++no_border( Display *dpy, Window w)
++{
++    static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
++    static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
++
++    typedef struct
++    {
++        unsigned long flags;
++        unsigned long functions;
++        unsigned long decorations;
++        long inputMode;
++        unsigned long status;
++    } PropMotifWmHints;
++
++    /* All 5 elements are sent to the server below, so none may be left
++     * uninitialised */
++    PropMotifWmHints motif_hints = {0};
++    Atom prop, proptype;
++    unsigned long flags = 0;
++
++    /* setup the property */
++    motif_hints.flags = MWM_HINTS_DECORATIONS;
++    motif_hints.decorations = flags;
++
++    /* get the atom for the property */
++    prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
++    if (!prop) {
++        /* something went wrong! */
++        return;
++    }
++
++    /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
++    proptype = prop;
++
++    XChangeProperty( dpy, w,                         /* display, window */
++                     prop, proptype,                 /* property, type */
++                     32,                             /* format: 32-bit datums */
++                     PropModeReplace,                /* mode */
++                     (unsigned char *) &motif_hints, /* data */
++                     PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
++                   );
++}
++
++
++/*
++ * Create an RGB, double-buffered window.
++ * Return the window and context handles.
++ *
++ * s        logging context
++ * de       supplies fullscreen, window_x/y and window_width/height
++ *          (1280x720 is used for a zero width/height)
++ * dpy      open X display
++ * egl_dpy  initialised EGL display for dpy
++ * name     window and icon name
++ * winRet, ctxRet, surfRet
++ *          receive the X window, the GLES2 context and the window surface;
++ *          written only on success
++ *
++ * On success the new context is current on the calling thread and 0 is
++ * returned; -1 is returned on any failure.
++ * NOTE(review): objects created before a failure are not destroyed here.
++ */
++static int
++make_window(struct AVFormatContext * const s,
++            egl_display_env_t * const de,
++            Display *dpy, EGLDisplay egl_dpy, const char *name,
++            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
++{
++    const int scrnum = DefaultScreen( dpy );
++    XSetWindowAttributes attr;
++    unsigned long mask;
++    Window root = RootWindow( dpy, scrnum );
++    Window win;
++    EGLContext ctx;
++    const int fullscreen = de->fullscreen;
++    EGLConfig config;
++    int x = de->window_x;
++    int y = de->window_y;
++    int width = de->window_width ? de->window_width : 1280;
++    int height = de->window_height ? de->window_height : 720;
++
++
++    if (fullscreen) {
++        x = 0; y = 0;
++        width = DisplayWidth(dpy, scrnum);
++        height = DisplayHeight(dpy, scrnum);
++    }
++
++    {
++        EGLint num_configs = 0;
++        static const EGLint attribs[] = {
++            EGL_RED_SIZE, 1,
++            EGL_GREEN_SIZE, 1,
++            EGL_BLUE_SIZE, 1,
++            EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
++            EGL_NONE
++        };
++
++        /* A successful call that matches nothing leaves config unset */
++        if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs) ||
++            num_configs < 1) {
++            av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
++            return -1;
++        }
++    }
++
++    {
++        EGLint vid;
++        if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
++            return -1;
++        }
++
++        {
++            XVisualInfo visTemplate = {
++                .visualid = vid,
++            };
++            int num_visuals;
++            XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
++                                                  &visTemplate, &num_visuals);
++
++            /* No X visual matches the EGL config's native visual id */
++            if (!visinfo) {
++                av_log(s, AV_LOG_ERROR, "Error: couldn't get X visual\n");
++                return -1;
++            }
++
++            /* window attributes */
++            attr.background_pixel = 0;
++            attr.border_pixel = 0;
++            attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
++            attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
++            /* XXX this is a bad way to get a borderless window! */
++            mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
++
++            win = XCreateWindow( dpy, root, x, y, width, height,
++                                 0, visinfo->depth, InputOutput,
++                                 visinfo->visual, mask, &attr );
++            XFree(visinfo);
++        }
++    }
++
++    if (fullscreen)
++        no_border(dpy, win);
++
++    /* set hints and properties */
++    {
++        XSizeHints sizehints;
++        sizehints.x = x;
++        sizehints.y = y;
++        sizehints.width = width;
++        sizehints.height = height;
++        sizehints.flags = USSize | USPosition;
++        XSetNormalHints(dpy, win, &sizehints);
++        XSetStandardProperties(dpy, win, name, name,
++                               None, (char **)NULL, 0, &sizehints);
++    }
++
++    eglBindAPI(EGL_OPENGL_ES_API);
++
++    {
++        static const EGLint ctx_attribs[] = {
++            EGL_CONTEXT_CLIENT_VERSION, 2,
++            EGL_NONE
++        };
++        ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
++        if (!ctx) {
++            av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
++            return -1;
++        }
++    }
++
++
++    XMapWindow(dpy, win);
++
++    {
++        EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
++        if (!surf) {
++            av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
++            return -1;
++        }
++
++        if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
++            return -1;
++        }
++
++        *winRet = win;
++        *ctxRet = ctx;
++        *surfRet = surf;
++    }
++
++    return 0;
++}
++
++/*
++ * Compile one GLSL shader.
++ *
++ * avctx   logging context
++ * target  shader type passed to glCreateShader()
++ * source  NUL-terminated shader source
++ *
++ * Returns the shader object name, or 0 on failure. On a compile failure
++ * the driver's info log and the source are logged and the shader object is
++ * deleted again.
++ */
++static GLint
++compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
++{
++    GLuint s = glCreateShader(target);
++    GLint ok = GL_FALSE;
++
++    if (s == 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
++        return 0;
++    }
++
++    glShaderSource(s, 1, (const GLchar **) &source, NULL);
++    glCompileShader(s);
++    glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
++
++    if (!ok) {
++        GLchar *info = NULL;
++        GLint size = 0;
++
++        /* A size of 0 or 1 means there is no log text to fetch */
++        glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetShaderInfoLog(s, size, NULL, info);
++
++        av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n",
++               (info != NULL) ? info : "<empty log>\n", source);
++
++        free(info);
++        glDeleteShader(s);
++        return 0;
++    }
++
++    return s;
++}
++
++/*
++ * Link a vertex shader and a fragment shader into a GL program.
++ *
++ * s   logging context
++ * vs  vertex shader object
++ * fs  fragment shader object
++ *
++ * Returns the program object name, or 0 on failure. On a link failure the
++ * driver's info log is logged and the program object is deleted again.
++ */
++static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
++{
++    GLuint prog = glCreateProgram();
++    GLint ok = GL_FALSE;
++
++    if (prog == 0) {
++        av_log(s, AV_LOG_ERROR, "Failed to create program\n");
++        return 0;
++    }
++
++    glAttachShader(prog, vs);
++    glAttachShader(prog, fs);
++    glLinkProgram(prog);
++
++    glGetProgramiv(prog, GL_LINK_STATUS, &ok);
++    if (!ok) {
++        /* Some drivers return a size of 1 for an empty log. This is the size
++         * of a log that contains only a terminating NUL character.
++         */
++        GLint size = 0;
++        GLchar *info = NULL;
++
++        glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetProgramInfoLog(prog, size, NULL, info);
++
++        av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
++               (info != NULL) ? info : "<empty log>");
++
++        free(info);
++        glDeleteProgram(prog);
++        return 0;
++    }
++
++    return prog;
++}
++
++/*
++ * Build and activate the GL program used to draw a frame: a vertex shader
++ * that maps clip-space position to a texture coordinate and a fragment
++ * shader that samples an external (OES) texture. Vertex attribute 0 is
++ * pointed at a 4-vertex quad covering clip space and enabled.
++ *
++ * Returns 0 on success, -1 if a shader fails to compile or the program
++ * fails to link.
++ */
++static int
++gl_setup(struct AVFormatContext * const s)
++{
++    static const float quad[] = {
++        -1, -1,
++         1, -1,
++         1,  1,
++        -1,  1,
++    };
++    const char *vs =
++        "attribute vec4 pos;\n"
++        "varying vec2 texcoord;\n"
++        "\n"
++        "void main() {\n"
++        " gl_Position = pos;\n"
++        " texcoord.x = (pos.x + 1.0) / 2.0;\n"
++        " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
++        "}\n";
++    const char *fs =
++        "#extension GL_OES_EGL_image_external : enable\n"
++        "precision mediump float;\n"
++        "uniform samplerExternalOES s;\n"
++        "varying vec2 texcoord;\n"
++        "void main() {\n"
++        " gl_FragColor = texture2D(s, texcoord);\n"
++        "}\n";
++    GLuint vertex_shader;
++    GLuint fragment_shader;
++    GLuint program;
++
++    /* Each step is only attempted if the previous one succeeded */
++    vertex_shader = compile_shader(s, GL_VERTEX_SHADER, vs);
++    if (vertex_shader == 0)
++        return -1;
++
++    fragment_shader = compile_shader(s, GL_FRAGMENT_SHADER, fs);
++    if (fragment_shader == 0)
++        return -1;
++
++    program = link_program(s, vertex_shader, fragment_shader);
++    if (program == 0)
++        return -1;
++
++    glUseProgram(program);
++
++    glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, quad);
++    glEnableVertexAttribArray(0);
++
++    return 0;
++}
++
++/*
++ * Muxer write_trailer callback: nothing to flush, always returns 0.
++ */
++static int egl_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++    return 0;
++}
++
++/*
++ * Muxer write_header callback: accept exactly one video stream of wrapped
++ * AVFrames.
++ *
++ * Returns 0 if the stream layout is supported, AVERROR(EINVAL) otherwise.
++ */
++static int egl_vout_write_header(AVFormatContext *s)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    /* Check the stream count before touching streams[0] */
++    if (   s->nb_streams != 1
++        || s->streams[0]->codecpar->codec_type != AVMEDIA_TYPE_VIDEO
++        || s->streams[0]->codecpar->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
++
++/*
++ * Draw one DRM_PRIME frame into the window.
++ *
++ * Picks the aux slot that is free (fd == -1) or already tagged with the
++ * fd of the frame's first object, imports the frame's dma-bufs as an
++ * EGLImage bound to an external texture, draws the quad, swaps buffers and
++ * then deletes the texture and frees the slot again.
++ *
++ * The frame is not freed here; the caller keeps ownership.
++ *
++ * Returns 0 on success, AVERROR(EINVAL) if no slot is available or the
++ * frame has more planes than can be described to EGL, -1 if the import
++ * fails.
++ */
++static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++    egl_aux_t * da = NULL;
++    unsigned int i;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++
++    for (i = 0; i != 32; ++i) {
++        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
++            da = de->aux + i;
++            break;
++        }
++    }
++
++    if (da == NULL) {
++        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
++        return AVERROR(EINVAL);
++    }
++
++    if (da->texture == 0) {
++        EGLint attribs[50];
++        EGLint * a = attribs;
++        int i, j;
++        /* Attribute names, 5 per plane, for the 3 planes EGL can take */
++        static const EGLint anames[] = {
++            EGL_DMA_BUF_PLANE0_FD_EXT,
++            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE0_PITCH_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE1_FD_EXT,
++            EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE1_PITCH_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE2_FD_EXT,
++            EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE2_PITCH_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
++        };
++        const EGLint * b = anames;
++        const EGLint * const b_end = anames + sizeof(anames) / sizeof(anames[0]);
++
++        *a++ = EGL_WIDTH;
++        *a++ = av_frame_cropped_width(frame);
++        *a++ = EGL_HEIGHT;
++        *a++ = av_frame_cropped_height(frame);
++        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++        *a++ = desc->layers[0].format;
++
++        for (i = 0; i < desc->nb_layers; ++i) {
++            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
++                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
++
++                /* Each plane consumes 5 names; stop before running off the
++                 * end of the table (which also bounds attribs[]) */
++                if (b_end - b < 5) {
++                    av_log(s, AV_LOG_ERROR, "%s: Too many planes\n", __func__);
++                    return AVERROR(EINVAL);
++                }
++
++                *a++ = *b++;
++                *a++ = obj->fd;
++                *a++ = *b++;
++                *a++ = p->offset;
++                *a++ = *b++;
++                *a++ = p->pitch;
++                if (obj->format_modifier == 0) {
++                    /* No modifier: skip the two MODIFIER names */
++                    b += 2;
++                }
++                else {
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier >> 32);
++                }
++            }
++        }
++
++        *a = EGL_NONE;
++
++#if TRACE_ALL
++        for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
++            av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
++        }
++#endif
++        {
++            const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
++                                                     EGL_NO_CONTEXT,
++                                                     EGL_LINUX_DMA_BUF_EXT,
++                                                     NULL, attribs);
++            if (!image) {
++                av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
++                return -1;
++            }
++
++            glGenTextures(1, &da->texture);
++            glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++            glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++
++            /* The image handle is destroyed once it is bound to the texture */
++            eglDestroyImageKHR(de->setup.egl_dpy, image);
++        }
++
++        da->fd = desc->objects[0].fd;
++
++#if 0
++        av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
++               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
++               av_frame_cropped_width(frame),
++               av_frame_cropped_height(frame),
++               desc->layers[0].format,
++               bo_plane_handles[0],
++               bo_plane_handles[1],
++               bo_plane_handles[2],
++               bo_plane_handles[3],
++               pitches[0],
++               pitches[1],
++               pitches[2],
++               pitches[3],
++               offsets[0],
++               offsets[1],
++               offsets[2],
++               offsets[3],
++               (long long)modifiers[0],
++               (long long)modifiers[1],
++               (long long)modifiers[2],
++               (long long)modifiers[3]
++              );
++#endif
++    }
++
++    glClearColor(0.5, 0.5, 0.5, 0.5);
++    glClear(GL_COLOR_BUFFER_BIT);
++
++    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
++    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
++
++    /* The texture is not cached: drop it and free the slot straight away */
++    glDeleteTextures(1, &da->texture);
++    da->texture = 0;
++    da->fd = -1;
++
++    return 0;
++}
++
++static void * display_thread(void * v)
++{ // Display thread: opens X/EGL, creates the window and GL program, then displays queued frames until q_terminate is seen
++ AVFormatContext * const s = v; // v is the AVFormatContext passed to pthread_create() by egl_vout_init()
++ egl_display_env_t * const de = s->priv_data;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++ {
++ EGLint egl_major, egl_minor;
++
++ de->setup.dpy = XOpenDisplay(NULL); // NULL: display name is left to Xlib's default
++ if (!de->setup.dpy) {
++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
++ goto fail;
++ }
++
++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
++ if (!de->setup.egl_dpy) {
++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
++ goto fail;
++ }
++
++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
++ goto fail;
++ }
++
++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
++
++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
++ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
++ goto fail;
++ }
++ }
++
++ if (!de->window_width || !de->window_height) { // no usable window_size option: fall back to 1280x720
++ de->window_width = 1280;
++ de->window_height = 720;
++ }
++ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
++ &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
++ goto fail;
++ }
++
++ if (gl_setup(s)) {
++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
++ goto fail;
++ }
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
++#endif
++ sem_post(&de->display_start_sem); // startup succeeded: release egl_vout_init(), which waits on this semaphore
++
++ for (;;) {
++ AVFrame * frame;
++
++ while (sem_wait(&de->q_sem) != 0) { // retry when interrupted; any other failure aborts via the assert
++ av_assert0(errno == EINTR);
++ }
++
++ if (de->q_terminate) // set by egl_vout_deinit() before it posts q_sem
++ break;
++
++ pthread_mutex_lock(&de->q_lock); // q_next is exchanged under q_lock, as in egl_vout_write_packet()
++ frame = de->q_next;
++ de->q_next = NULL;
++ pthread_mutex_unlock(&de->q_lock);
++
++ do_display(s, de, frame);
++
++ av_frame_free(&de->q_this); // the previously shown frame is released only after the new one has been displayed
++ de->q_this = frame;
++ }
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
++#endif
++
++ return NULL; // NOTE(review): the X display, window and EGL objects are not released on this path - confirm whether that is intended
++
++fail:
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
++#endif
++ de->q_terminate = 1; // lets egl_vout_init() tell a failed startup from a successful one
++ sem_post(&de->display_start_sem);
++
++ return NULL;
++}
++
++/*
++ * Muxer write_packet callback: queue the wrapped AVFrame in pkt for display.
++ *
++ * DRM_PRIME frames are referenced, VAAPI frames are mapped to DRM_PRIME;
++ * any other format is rejected. The new frame replaces whatever is in
++ * q_next: if a frame was still waiting there it is freed (dropped),
++ * otherwise the display thread is woken. With show_all set the call first
++ * polls until the waiting frame has been taken.
++ *
++ * Returns 0 when the frame was queued, AVERROR(EINVAL) for an unsupported
++ * or unmappable frame, AVERROR(ENOMEM) or the av_frame_ref() error if the
++ * frame could not be set up.
++ */
++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame * const src_frame = (AVFrame *)pkt->data;
++    AVFrame * frame;
++    egl_display_env_t * const de = s->priv_data;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME &&
++        src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    frame = av_frame_alloc();
++    if (frame == NULL)
++        return AVERROR(ENOMEM);
++
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        const int rv = av_frame_ref(frame, src_frame);
++        if (rv < 0) {
++            av_frame_free(&frame);
++            return rv;
++        }
++    }
++    else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        if (av_hwframe_map(frame, src_frame, 0) != 0)
++        {
++            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++            av_frame_free(&frame);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    // Really hacky sync
++    while (de->show_all && de->q_next) {
++        usleep(3000);
++    }
++
++    // Swap the new frame into the queue slot; frame now holds the old entry
++    pthread_mutex_lock(&de->q_lock);
++    {
++        AVFrame * const t = de->q_next;
++        de->q_next = frame;
++        frame = t;
++    }
++    pthread_mutex_unlock(&de->q_lock);
++
++    // Slot was empty: wake the display thread. Otherwise the thread has
++    // already been woken for the old entry, which we now drop.
++    if (frame == NULL)
++        sem_post(&de->q_sem);
++    else
++        av_frame_free(&frame);
++
++    return 0;
++}
++
++/*
++ * Muxer write_uncoded_frame callback. Does nothing with the frame and
++ * returns 0 both for a capability query (AV_WRITE_UNCODED_FRAME_QUERY set
++ * in flags) and for an actual write.
++ */
++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                                unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    /* egl_vout_write_header() should have accepted only supported formats,
++     * so query and write get the same answer */
++    return 0;
++}
++
++/*
++ * Muxer control_message callback. AV_APP_TO_DEV_WINDOW_REPAINT is accepted
++ * (and ignored) with 0; every other message type yields AVERROR(ENOSYS).
++ */
++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++/*
++ * Muxer init callback: reset the private state, start the display thread
++ * and wait until it reports that window/GL startup has finished.
++ *
++ * Returns 0 on success, -1 if the display thread reported a startup
++ * failure. Aborts (av_assert0) if the thread cannot be created.
++ *
++ * deinit is called if init fails so no need to clean up explicitly here
++ */
++static int egl_vout_init(struct AVFormatContext * s)
++{
++    egl_display_env_t * const de = s->priv_data;
++    unsigned int i;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->setup = (struct egl_setup){0};
++
++    // Mark every texture slot as free
++    for (i = 0; i != 32; ++i) {
++        de->aux[i].fd = -1;
++    }
++
++    de->q_terminate = 0;
++    pthread_mutex_init(&de->q_lock, NULL);
++    sem_init(&de->q_sem, 0, 0);
++    sem_init(&de->display_start_sem, 0, 0);
++    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
++
++    // Retry if interrupted: q_terminate must not be read before the thread
++    // has posted its startup result
++    while (sem_wait(&de->display_start_sem) != 0) {
++        av_assert0(errno == EINTR);
++    }
++
++    if (de->q_terminate) {
++        av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
++        return -1;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++    return 0;
++}
++
++/*
++ * Muxer deinit callback: stop the display thread, then destroy the
++ * semaphores and the queue lock and free any frame still held.
++ */
++static void egl_vout_deinit(struct AVFormatContext * s)
++{
++    egl_display_env_t * const de = s->priv_data;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    // Flag termination first, then wake the thread so that it sees the flag
++    de->q_terminate = 1;
++    sem_post(&de->q_sem);
++    pthread_join(de->q_thread, NULL);
++    sem_destroy(&de->q_sem);
++    // Created in egl_vout_init() alongside q_sem, so release it here too
++    sem_destroy(&de->display_start_sem);
++    pthread_mutex_destroy(&de->q_lock);
++
++    av_frame_free(&de->q_next);
++    av_frame_free(&de->q_this);
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++#define OFFSET(x) offsetof(egl_display_env_t, x) // byte offset of option field x inside the private context struct
++static const AVOption options[] = { // user-settable options of the vout_egl device
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, // default off; when set egl_vout_write_packet() waits for the queued frame to be taken
++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, // anchored at window_width; window_height is declared directly after it in the struct
++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, // make_window() then uses the full screen size and removes the border
++ { NULL } // table terminator
++
++};
++
++static const AVClass egl_vout_class = { // AVClass tying the option table above to the private context
++ .class_name = "egl vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_egl_muxer = { // output device definition: wires the callbacks in this file into a muxer named "vout_egl"
++ .name = "vout_egl",
++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"),
++ .priv_data_size = sizeof(egl_display_env_t), // size of the private context reached through s->priv_data
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, // packets carry an AVFrame (see egl_vout_write_packet)
++ .write_header = egl_vout_write_header,
++ .write_packet = egl_vout_write_packet,
++ .write_uncoded_frame = egl_vout_write_frame,
++ .write_trailer = egl_vout_write_trailer,
++ .control_message = egl_vout_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &egl_vout_class,
++ .init = egl_vout_init,
++ .deinit = egl_vout_deinit,
++};
++
+--- /dev/null
++++ b/libavdevice/rpi_vout.c
+@@ -0,0 +1,534 @@
++/*
++ * Copyright (c) 2013 Jeff Moguillansky
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * Rpi (mmal) video output device
++ *
++ * TODO:
++ * - add support to more formats
++ */
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include <stdatomic.h>
++#include <unistd.h>
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/mmal_parameters_camera.h>
++#include <interface/mmal/mmal_buffer.h>
++#include <interface/mmal/mmal_port.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++#include <interface/mmal/util/mmal_connection.h>
++#include <interface/mmal/util/mmal_util_params.h>
++#pragma GCC diagnostic pop
++#include "libavutil/rpi_sand_fns.h"
++#include "libavcodec/rpi_zc.h"
++
++#define TRACE_ALL 0
++
++#define DISPLAY_PORT_DEPTH 4
++
++typedef struct rpi_display_env_s
++{ // Private data (priv_data) of the vout_rpi output device
++ AVClass *class;
++
++ MMAL_COMPONENT_T* display; // Video renderer component, created in xv_write_header
++ MMAL_COMPONENT_T* isp; // ISP component; only created when avfmt_needs_isp() is true, otherwise NULL
++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup
++ MMAL_CONNECTION_T * conn; // Tunnelled connection isp output -> display input; NULL when there is no ISP
++
++ MMAL_POOL_T *rpi_pool; // Pool of DISPLAY_PORT_DEPTH buffer headers created with payload size 0
++ volatile int rpi_display_count; // Headers sent to MMAL and not yet released; accessed through atomic_*()
++
++ MMAL_FOURCC_T req_fmt; // Encoding last set up on port_in (see display_frame)
++ MMAL_VIDEO_FORMAT_T req_vfmt; // Geometry/crop last set up on port_in (see display_frame)
++
++ AVZcEnvPtr zc; // Zero-copy environment allocated in rpi_vout_init, freed in rpi_vout_deinit
++
++ int window_width, window_height; // "window_size" option; 0 means use the stream size
++ int window_x, window_y; // "window_x" / "window_y" options
++ int layer, fullscreen; // "display_layer" (0 selects layer 2 in xv_write_header) and "fullscreen" options
++ int show_all; // "show_all" option: display_frame waits for queue space before sending
++} rpi_display_env_t;
++
++
++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { // Callback passed to mmal_port_enable() for the input port
++ mmal_buffer_header_release(buffer); // nothing to inspect: just release the header
++}
++
++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { // Callback passed to mmal_port_enable() for the isp/display control ports
++ mmal_buffer_header_release(buffer); // control buffers are not examined: just release the header
++}
++
++
++static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
++{ // Map an AVPixelFormat to the matching MMAL encoding; returns 0 when there is no mapping
++ switch (fmt) {
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8: // shares the encoding used for SAND128
++ return MMAL_ENCODING_YUVUV128;
++ case AV_PIX_FMT_RPI4_10:
++ return MMAL_ENCODING_YUV10_COL;
++ case AV_PIX_FMT_SAND64_10:
++ return MMAL_ENCODING_YUVUV64_10;
++ case AV_PIX_FMT_SAND64_16:
++ return MMAL_ENCODING_YUVUV64_16;
++ case AV_PIX_FMT_YUV420P:
++ return MMAL_ENCODING_I420;
++
++ default:
++ break;
++ }
++ return 0; // unsupported format
++}
++
++
++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
++ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
++{ // Fill es_fmt (and the video format it points to) from the frame and its zero-copy geometry
++ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; // caller must have pointed es_fmt->es at valid storage
++ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
++ if (av_rpi_is_sand_format(geo->format)) {
++ // Sand formats are a bit "special"
++ // stride1 implicit in format
++ // width = stride2
++ vfmt->width = geo->stripe_is_yc ?
++ geo->height_y + geo->height_c : geo->height_y;
++// es->height = geo->video_height; //*** When we get the FLAG this will change
++ vfmt->height = geo->height_y;
++ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; // width field carries the column stride, per the flag name
++ }
++ else {
++ vfmt->width = geo->stride_y / geo->bytes_per_pel; // buffer width in pixels derived from the luma stride
++ vfmt->height = geo->height_y;
++ es_fmt->flags = 0;
++ }
++
++ es_fmt->type = MMAL_ES_TYPE_VIDEO;
++ es_fmt->encoding = mmfmt_from_avfmt(geo->format); // 0 if the format has no MMAL mapping
++ es_fmt->encoding_variant = 0;
++ es_fmt->bitrate = 0;
++
++ vfmt->crop.x = frame->crop_left; // visible area comes from the AVFrame crop fields
++ vfmt->crop.y = frame->crop_top;
++ vfmt->crop.width = av_frame_cropped_width(frame);
++ vfmt->crop.height = av_frame_cropped_height(frame);
++
++ vfmt->frame_rate.den = 0; // Don't think I know it here
++ vfmt->frame_rate.num = 0;
++
++ vfmt->par.den = frame->sample_aspect_ratio.den;
++ vfmt->par.num = frame->sample_aspect_ratio.num;
++
++ vfmt->color_space = 0; // Unknown currently
++}
++
++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
++{ // Pre-release callback installed by display_frame; userdata is the rpi_display_env_t
++ rpi_display_env_t * const de = userdata;
++ if (buf->user_data != NULL) {
++ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); // drop the frame reference that display_frame attached
++ buf->user_data = NULL;
++ }
++ atomic_fetch_add(&de->rpi_display_count, -1); // balances the increment made in display_frame
++ return MMAL_FALSE; // NOTE(review): presumably lets MMAL carry on with the normal release - confirm in mmal_buffer.h
++}
++
++static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
++{ // Non-zero if frames of this format must go through the ISP before the display
++ return avfmt == AV_PIX_FMT_SAND64_10; // currently the only format routed via the ISP
++}
++
++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
++{ // Tear down the ISP stage and its connection if present; safe to call when neither exists
++ if (de->isp != NULL)
++ {
++ if (de->isp->input[0]->is_enabled) // disable ports before anything is destroyed
++ mmal_port_disable(de->isp->input[0]);
++ if (de->isp->control->is_enabled)
++ mmal_port_disable(de->isp->control);
++ }
++ if (de->conn != NULL) {
++ mmal_connection_destroy(de->conn); // connection goes before the component it is attached to
++ de->conn = NULL;
++ }
++ if (de->isp != NULL) {
++ mmal_component_destroy(de->isp);
++ de->isp = NULL; // note: de->port_in is not touched here; callers reset it
++ }
++}
++
++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
++{ // Send one frame to the display, rebuilding the MMAL pipe first if format/geometry changed; errors are logged, not returned
++ MMAL_BUFFER_HEADER_T* buf = NULL;
++ AVRpiZcRefPtr fr_buf = NULL;
++
++ if (de == NULL)
++ return;
++
++ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { // port queue considered full: drop rather than block
++ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
++ return;
++ }
++
++ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { // zero-copy reference to the frame data
++ return;
++ }
++
++ buf = mmal_queue_get(de->rpi_pool->queue);
++ if (!buf) {
++ // Running too fast so drop the frame (unexpected)
++ goto fail;
++ }
++
++ buf->cmd = 0;
++ buf->offset = 0;
++ buf->flags = 0;
++ mmal_buffer_header_reset(buf);
++
++ atomic_fetch_add(&de->rpi_display_count, 1); // Decremented again by buf_release_cb on release
++ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
++
++ buf->user_data = fr_buf; // from here on the header owns fr_buf (see fail: below)
++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
++ buf->offset = av_rpi_zc_offset(fr_buf);
++ buf->length = av_rpi_zc_length(fr_buf);
++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
++
++ while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { // show_all: poll until a slot is released
++ usleep(5000);
++ }
++
++ {
++ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
++ MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
++ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
++
++ video_format_from_zc_frame(&new_es, fr, fr_buf);
++ if (de->req_fmt != new_es.encoding ||
++ de->req_vfmt.width != new_vfmt->width ||
++ de->req_vfmt.height != new_vfmt->height ||
++ de->req_vfmt.crop.x != new_vfmt->crop.x ||
++ de->req_vfmt.crop.y != new_vfmt->crop.y ||
++ de->req_vfmt.crop.width != new_vfmt->crop.width ||
++ de->req_vfmt.crop.height != new_vfmt->crop.height) {
++ // Something has changed
++
++ // If we have an ISP tear it down
++ isp_remove(s, de);
++ de->port_in = de->display->input[0]; // default pipe: feed the display directly
++
++ // If we still need an ISP create it now
++ if (avfmt_needs_isp(fr->format))
++ {
++ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
++ goto fail;
++ }
++ de->port_in = de->isp->input[0]; // frames now enter via the ISP
++ }
++
++ mmal_format_copy(de->port_in->format, &new_es);
++
++ if (mmal_port_format_commit(de->port_in)) {
++ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
++ goto fail;
++ }
++
++ // If we have an ISP then we must want to use it
++ if (de->isp != NULL) {
++ MMAL_PORT_T * const port_out = de->isp->output[0];
++ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
++ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
++
++ port_out->format->type = MMAL_ES_TYPE_VIDEO;
++ port_out->format->encoding = MMAL_ENCODING_YUVUV128; // ISP output encoding handed to the display
++ port_out->format->encoding_variant = 0;
++ port_out->format->bitrate = 0;
++ port_out->format->flags = 0;
++ port_out->format->extradata = NULL;
++ port_out->format->extradata_size = 0;
++
++ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; // round up to a multiple of 32
++ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; // round up to a multiple of 16
++ vfmt_out->crop.x = 0;
++ vfmt_out->crop.y = 0;
++ vfmt_out->crop.width = vfmt_in->crop.width;
++ vfmt_out->crop.height = vfmt_in->crop.height;
++ vfmt_out->frame_rate = vfmt_in->frame_rate;
++ vfmt_out->par = vfmt_in->par;
++ vfmt_out->color_space = vfmt_in->color_space;
++
++ if (mmal_port_format_commit(port_out)) {
++ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
++ goto fail;
++ }
++
++ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
++ goto fail;
++ }
++ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
++ goto fail;
++ }
++ mmal_port_enable(de->isp->control,display_cb_control); // NOTE(review): results of these two calls are not checked
++ mmal_component_enable(de->isp);
++ }
++
++ // Number of slots in my port Q
++ de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
++ // Size to keep it happy - isn't used for anything other than error checking
++ de->port_in->buffer_size = buf->alloc_size;
++ if (!de->port_in->is_enabled)
++ {
++ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
++ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
++ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
++ goto fail;
++ }
++ }
++
++ de->req_fmt = new_es.encoding; // only recorded on success, so a failed setup is retried on the next frame
++ de->req_vfmt = *new_vfmt;
++ }
++ }
++
++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", atomic_load(&de->rpi_display_count));
++ goto fail;
++ }
++ return;
++
++fail:
++ // If we have a buf then fr_buf is held by that
++ if (buf != NULL)
++ mmal_buffer_header_release(buf); // runs buf_release_cb: unrefs fr_buf and decrements the count
++ else if (fr_buf != NULL)
++ av_rpi_zc_unref(fr_buf);
++}
++
++
++static int xv_write_trailer(AVFormatContext *s)
++{ // Tear down the whole MMAL pipe; also used as the failure path of xv_write_header, so it must tolerate partial setup
++ rpi_display_env_t * const de = s->priv_data;
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++ if (de->port_in != NULL && de->port_in->is_enabled) {
++ mmal_port_disable(de->port_in);
++ }
++ de->port_in = NULL; // its owner (isp or display) is destroyed below: do not keep a dangling port for a later call
++ // The above disable should kick out all buffers - check that
++ if (atomic_load(&de->rpi_display_count) != 0) {
++ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
++ }
++
++ isp_remove(s, de); // destroys connection and ISP if they exist
++ if (de->rpi_pool != NULL) {
++ mmal_pool_destroy(de->rpi_pool);
++ de->rpi_pool = NULL;
++ }
++ if (de->display != NULL) {
++ mmal_component_destroy(de->display);
++ de->display = NULL;
++ }
++
++ return 0; // always succeeds
++}
++
++static int xv_write_header(AVFormatContext *s)
++{ // Validate the stream and create the display component and buffer pool; 0 on success, AVERROR code on failure
++ rpi_display_env_t * const de = s->priv_data;
++ const AVCodecParameters * const par = s->streams[0]->codecpar; // read before the nb_streams test below; assumes at least one stream
++ const unsigned int w = de->window_width ? de->window_width : par->width; // option value 0 falls back to the stream size
++ const unsigned int h = de->window_height ? de->window_height : par->height;
++ const unsigned int x = de->window_x;
++ const unsigned int y = de->window_y;
++ const int layer = de->layer ? de->layer : 2; // option value 0 means "use layer 2", so layer 0 cannot be requested
++ const MMAL_BOOL_T fullscreen = de->fullscreen;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h);
++#endif
++ if ( s->nb_streams > 1
++ || par->codec_type != AVMEDIA_TYPE_VIDEO
++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++ return AVERROR(EINVAL);
++ }
++
++ {
++ MMAL_DISPLAYREGION_T region =
++ {
++ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
++ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
++ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, // names the fields below that are being set
++ .layer = layer,
++ .fullscreen = fullscreen,
++ .dest_rect = {x, y, w, h},
++ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, // fullscreen also sets the discard-lower-layers flag
++ };
++
++ bcm_host_init(); // Needs to be done by someone...
++
++ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
++ goto fail;
++ }
++ de->port_in = de->display->input[0]; // default until display_frame decides an ISP is needed
++
++ mmal_port_parameter_set(de->display->input[0], &region.hdr); // NOTE(review): return value is not checked
++
++ if (mmal_component_enable(de->display) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
++ goto fail;
++ }
++ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS)
++ {
++ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
++ goto fail;
++ }
++
++ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) // headers only: payload size 0
++ {
++ av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
++ goto fail;
++ }
++ }
++
++ return 0;
++
++fail:
++ xv_write_trailer(s); // releases whatever was created above
++ return AVERROR_UNKNOWN;
++}
++
++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
++{ // write_packet entry point: the packet payload is treated as an AVFrame and displayed
++ AVFrame * const frame = (AVFrame *)pkt->data; // stream is required to be AV_CODEC_ID_WRAPPED_AVFRAME (checked in xv_write_header)
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++ display_frame(s, s->priv_data, frame);
++ return 0; // display_frame reports problems via av_log only
++}
++
++static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++ unsigned flags)
++{ // write_uncoded_frame entry point: displays *ppframe unless this is only a capability query
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++ /* xv_write_header() should have accepted only supported formats */
++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
++ return 0; // query only: answer without touching the frame
++// return write_picture(s, (*frame)->data, (*frame)->linesize);
++
++ display_frame(s, s->priv_data, *ppframe);
++ return 0;
++}
++
++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{ // control_message entry point: only the repaint request is accepted (as a no-op)
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++ switch(type) {
++ case AV_APP_TO_DEV_WINDOW_REPAINT:
++ return 0; // accepted, nothing to do
++ default:
++ break;
++ }
++ return AVERROR(ENOSYS); // every other message type is unsupported
++}
++
++// deinit is called if init fails so no need to clean up explicitly here
++static int rpi_vout_init(struct AVFormatContext * s)
++{ // init entry point: returns 0 on success or a negative AVERROR code on failure
++ rpi_display_env_t * const de = s->priv_data;
++
++ // Get a ZC context in case we need one - has little overhead if unused
++ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
++ return AVERROR(ENOMEM); // must be negative: a positive return from init is not treated as an error
++
++ return 0;
++}
++
++static void rpi_vout_deinit(struct AVFormatContext * s)
++{ // deinit entry point: releases the zero-copy environment obtained in rpi_vout_init
++ rpi_display_env_t * const de = s->priv_data;
++
++ av_rpi_zc_int_env_freep(&de->zc);
++}
++
++
++#define OFFSET(x) offsetof(rpi_display_env_t, x)
++static const AVOption options[] = {
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++
++};
++
++static const AVClass xv_class = {
++ .class_name = "rpi vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_rpi_muxer = {
++ .name = "vout_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"),
++ .priv_data_size = sizeof(rpi_display_env_t),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
++ .write_header = xv_write_header,
++ .write_packet = xv_write_packet,
++ .write_uncoded_frame = xv_write_frame,
++ .write_trailer = xv_write_trailer,
++ .control_message = xv_control_message,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &xv_class,
++ .init = rpi_vout_init,
++ .deinit = rpi_vout_deinit,
++};
+--- a/libavfilter/Makefile
++++ b/libavfilter/Makefile
+@@ -218,6 +218,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)
+ OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o
+ OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_deinterlace_qsv.o
+ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o
++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o
+ OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
+ OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
+ OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
+@@ -434,6 +435,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)
+ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o
+ OBJS-$(CONFIG_TRIM_FILTER) += trim.o
+ OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
+ OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
+ OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
+ opencl/unsharp.o
+--- a/libavfilter/allfilters.c
++++ b/libavfilter/allfilters.c
+@@ -204,6 +204,7 @@ extern AVFilter ff_vf_dedot;
+ extern AVFilter ff_vf_deflate;
+ extern AVFilter ff_vf_deflicker;
+ extern AVFilter ff_vf_deinterlace_qsv;
++extern AVFilter ff_vf_deinterlace_v4l2m2m;
+ extern AVFilter ff_vf_deinterlace_vaapi;
+ extern AVFilter ff_vf_dejudder;
+ extern AVFilter ff_vf_delogo;
+@@ -414,6 +415,7 @@ extern AVFilter ff_vf_transpose_opencl;
+ extern AVFilter ff_vf_transpose_vaapi;
+ extern AVFilter ff_vf_trim;
+ extern AVFilter ff_vf_unpremultiply;
++extern AVFilter ff_vf_unsand;
+ extern AVFilter ff_vf_unsharp;
+ extern AVFilter ff_vf_unsharp_opencl;
+ extern AVFilter ff_vf_untile;
+--- a/libavfilter/avfiltergraph.c
++++ b/libavfilter/avfiltergraph.c
+@@ -32,6 +32,9 @@
+ #include "libavutil/internal.h"
+ #include "libavutil/opt.h"
+ #include "libavutil/pixdesc.h"
++#if CONFIG_UNSAND_FILTER
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ #define FF_INTERNAL_FIELDS 1
+ #include "framequeue.h"
+@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFor
+ }
+ }
+
++#if CONFIG_UNSAND_FILTER
++static int has_sand_format(const AVFilterFormats * const ff)
++{ // Returns 1 if any entry in the format list is a sand format, else 0
++ int i;
++ for (i = 0; i != ff->nb_formats; ++i) {
++ if (av_rpi_is_sand_format(ff->formats[i])) {
++ return 1; // one match is enough
++ }
++ }
++ return 0;
++}
++#endif
++
+ /**
+ * Perform one round of query_formats() and merging formats lists on the
+ * filter graph.
+@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *
+ for (j = 0; j < filter->nb_inputs; j++) {
+ AVFilterLink *link = filter->inputs[j];
+ int convert_needed = 0;
++ unsigned int extra_convert_tried = 0;
+
+ if (!link)
+ continue;
+@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *
+ )
+ #undef MERGE_DISPATCH
+
+- if (convert_needed) {
++ while (convert_needed) {
+ AVFilterContext *convert;
+ const AVFilter *filter;
+ AVFilterLink *inlink, *outlink;
+ char inst_name[30];
++ int can_retry = 0;
++
++ convert_needed = 0;
+
+ if (graph->disable_auto_convert) {
+ av_log(log_ctx, AV_LOG_ERROR,
+@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *
+ /* couldn't merge format lists. auto-insert conversion filter */
+ switch (link->type) {
+ case AVMEDIA_TYPE_VIDEO:
+- if (!(filter = avfilter_get_by_name("scale"))) {
+- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
+- "not present, cannot convert pixel formats.\n");
+- return AVERROR(EINVAL);
+- }
+-
+- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
+- scaler_count++);
++#if CONFIG_UNSAND_FILTER
++ // Only try each extra conversion once
++ // The unsand output pad should never trigger has_sand_format
++ // but it is better to be safe
++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
++ if (!(filter = avfilter_get_by_name("unsand"))) {
++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
++ "not present, cannot convert pixel formats.\n");
++ return AVERROR(EINVAL);
++ }
++
++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
++ scaler_count++);
++
++ if ((ret = avfilter_graph_create_filter(&convert, filter,
++ inst_name, "", NULL,
++ graph)) < 0)
++ return ret;
+
+- if ((ret = avfilter_graph_create_filter(&convert, filter,
+- inst_name, graph->scale_sws_opts, NULL,
+- graph)) < 0)
+- return ret;
++ extra_convert_tried |= 1;
++ can_retry = 1;
++ }
++ else
++#endif
++ {
++ if (!(filter = avfilter_get_by_name("scale"))) {
++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
++ "not present, cannot convert pixel formats.\n");
++ return AVERROR(EINVAL);
++ }
++
++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
++ scaler_count++);
++
++ if ((ret = avfilter_graph_create_filter(&convert, filter,
++ inst_name, graph->scale_sws_opts, NULL,
++ graph)) < 0)
++ return ret;
++ }
+ break;
+ case AVMEDIA_TYPE_AUDIO:
+ if (!(filter = avfilter_get_by_name("aresample"))) {
+@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *
+ av_assert0(outlink-> in_channel_layouts->refcount > 0);
+ av_assert0(outlink->out_channel_layouts->refcount > 0);
+ }
+- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) ||
+- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++ // If we have added an extra filter we must merge the input
++ // side but we can have another go at the output
++ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type))
+ ret = AVERROR(ENOSYS);
++ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++ {
++ if (can_retry) {
++ link = outlink;
++ convert_needed = 1;
++ continue;
++ }
++ ret = AVERROR(ENOSYS);
++ }
+ if (inlink->type == AVMEDIA_TYPE_AUDIO &&
+ (!ff_merge_samplerates(inlink->in_samplerates,
+ inlink->out_samplerates) ||
+--- a/libavfilter/buffersrc.c
++++ b/libavfilter/buffersrc.c
+@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_intern
+
+ switch (ctx->outputs[0]->type) {
+ case AVMEDIA_TYPE_VIDEO:
+- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
+ frame->format, frame->pts);
+ break;
+ case AVMEDIA_TYPE_AUDIO:
+--- /dev/null
++++ b/libavfilter/vf_deinterlace_v4l2m2m.c
+@@ -0,0 +1,1336 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * deinterlace video filter - V4L2 M2M
++ */
++
++#include <drm_fourcc.h>
++
++#include <linux/videodev2.h>
++
++#include <dirent.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <stdatomic.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/avstring.h"
++#include "libavutil/common.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/internal.h"
++#include "libavutil/mathematics.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/time.h"
++
++#define FF_INTERNAL_FIELDS 1
++#include "framequeue.h"
++#include "filters.h"
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct V4L2Queue V4L2Queue;
++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
++
++typedef struct V4L2PlaneInfo {
++ int bytesperline;
++ size_t length;
++} V4L2PlaneInfo;
++
++typedef struct V4L2Buffer {
++ int enqueued;
++ int reenqueue;
++ int fd;
++ struct v4l2_buffer buffer;
++ AVFrame frame;
++ struct v4l2_plane planes[VIDEO_MAX_PLANES];
++ int num_planes;
++ V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
++ AVDRMFrameDescriptor drm_frame;
++ V4L2Queue *q;
++} V4L2Buffer;
++
++typedef struct V4L2Queue {
++ struct v4l2_format format;
++ int num_buffers;
++ V4L2Buffer *buffers;
++ DeintV4L2M2MContextShared *ctx;
++} V4L2Queue;
++
++typedef struct pts_stats_s
++{
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count;
++ unsigned int last_interval;
++ int64_t last_pts;
++} pts_stats_t;
++
++#define PTS_TRACK_SIZE 32
++typedef struct pts_track_el_s
++{
++ uint32_t n;
++ unsigned int interval;
++ AVFrame * props;
++} pts_track_el_t;
++
++typedef struct pts_track_s
++{
++ uint32_t n;
++ uint32_t last_n;
++ int got_2;
++ void * logctx;
++ pts_stats_t stats;
++ pts_track_el_t a[PTS_TRACK_SIZE];
++} pts_track_t;
++
++typedef struct DeintV4L2M2MContextShared {
++ void * logctx; // For logging - will be NULL when done
++
++ int fd;
++ int done;
++ int width;
++ int height;
++ int orig_width;
++ int orig_height;
++ atomic_uint refcount;
++
++ AVBufferRef *hw_frames_ctx;
++
++ unsigned int field_order;
++
++ pts_track_t track;
++
++ V4L2Queue output;
++ V4L2Queue capture;
++} DeintV4L2M2MContextShared;
++
++typedef struct DeintV4L2M2MContext {
++ const AVClass *class;
++
++ DeintV4L2M2MContextShared *shared;
++} DeintV4L2M2MContext;
++
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{ // Latest per-frame pts interval estimate; 0 means no usable estimate
++ return stats->last_interval;
++}
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{ // Feed the pts of the next frame into the running estimate of the per-frame interval
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { // missing or repeated pts: just count the frame
++ if (stats->last_count < STATS_LAST_COUNT_MAX) // saturate at the maximum
++ ++stats->last_count;
++ return;
++ }
++
++ if (stats->last_pts != AV_NOPTS_VALUE) {
++ const int64_t interval = pts - stats->last_pts; // spans last_count frames
++
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++ stats->last_count >= STATS_LAST_COUNT_MAX) { // backwards, huge, or too many frames since last good pts
++ if (stats->last_interval != 0) // log only on the transition to "unknown"
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0; // 0 marks "no estimate"
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count; // average over the frames since last_pts
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
++ }
++ }
++
++ stats->last_pts = pts; // restart counting from this pts
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{ // Reset the statistics to "no pts seen yet"; logctx and name are only used for log output
++ *stats = (pts_stats_t){
++ .logctx = logctx,
++ .name = name,
++ .last_count = 1, // used as a divisor in pts_stats_add, so it starts at 1
++ .last_interval = 0, // 0 = no interval estimate yet
++ .last_pts = AV_NOPTS_VALUE
++ };
++}
++
++static inline uint32_t pts_track_next_n(pts_track_t * const trk)
++{ // Advance and return the tracking number, never returning 0
++ if (++trk->n == 0) // skip 0 on wrap-around: elements use n == 0 as "unused"
++ trk->n = 1;
++ return trk->n;
++}
++
++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
++{ // Copy stored frame props into dst for the tracking number packed in tv; on a miss dst gets no pts and 0 is returned
++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000); // inverse of the packing done in pts_track_add_frame
++ pts_track_el_t * t;
++
++ // As a first guess assume that n==0 means last frame
++ if (n == 0) {
++ n = trk->last_n;
++ if (n == 0)
++ goto fail;
++ }
++
++ t = trk->a + (n & (PTS_TRACK_SIZE - 1)); // slot index is n modulo the table size
++
++ if (t->n != n) { // slot holds a different frame (or none)
++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
++ goto fail;
++ }
++
++ // 1st frame is simple - just believe it
++ if (n != trk->last_n) {
++ trk->last_n = n;
++ trk->got_2 = 0;
++ return av_frame_copy_props(dst, t->props);
++ }
++
++ // Only believe in a single interpolated frame
++ if (trk->got_2)
++ goto fail;
++ trk->got_2 = 1;
++
++ av_frame_copy_props(dst, t->props); // NOTE(review): unlike the first-frame path the result is ignored here
++
++
++ // If we can't guess - don't
++ if (t->interval == 0) {
++ dst->best_effort_timestamp = AV_NOPTS_VALUE;
++ dst->pts = AV_NOPTS_VALUE;
++ dst->pkt_dts = AV_NOPTS_VALUE;
++ }
++ else { // second frame for the same n: place it half an interval after the first
++ if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
++ dst->best_effort_timestamp += t->interval / 2;
++ if (dst->pts != AV_NOPTS_VALUE)
++ dst->pts += t->interval / 2;
++ if (dst->pkt_dts != AV_NOPTS_VALUE)
++ dst->pkt_dts += t->interval / 2;
++ }
++
++ return 0;
++
++fail:
++ trk->last_n = 0; // forget the previous frame so the next call starts afresh
++ trk->got_2 = 0;
++ dst->pts = AV_NOPTS_VALUE; // NOTE(review): best_effort_timestamp is left untouched on this path
++ dst->pkt_dts = AV_NOPTS_VALUE;
++ return 0; // a tracking miss is not reported as an error
++}
++
++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
++{ // Remember src's props under a new tracking number and return that number packed into a timeval
++ const uint32_t n = pts_track_next_n(trk);
++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1)); // overwrites whatever occupied this slot PTS_TRACK_SIZE frames ago
++
++ pts_stats_add(&trk->stats, src->pts);
++
++ t->n = n;
++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
++ av_frame_unref(t->props);
++ av_frame_copy_props(t->props, src); // NOTE(review): return value is not checked
++
++ // We now know what the previous interval was, rather than having to guess,
++ // so set it. There is a better than decent chance that this is before
++ // we use it.
++ if (t->interval != 0) {
++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
++ prev_t->interval = t->interval;
++ }
++
++ // In case deinterlace interpolates frames use every other usec
++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2}; // unpacked again by pts_track_get_frame
++}
++
++static void pts_track_uninit(pts_track_t * const trk)
++{ // Free every stored props frame and mark all slots unused
++ unsigned int i;
++ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++ trk->a[i].n = 0; // 0 = unused slot
++ av_frame_free(&trk->a[i].props);
++ }
++}
++
++static int pts_track_init(pts_track_t * const trk, void *logctx)
++{ // Allocate a props frame for every slot; returns 0 or AVERROR(ENOMEM)
++ unsigned int i;
++ trk->n = 1;
++ pts_stats_init(&trk->stats, logctx, "track"); // NOTE(review): trk->logctx itself is not set here - confirm the caller sets it
++ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++ trk->a[i].n = 0;
++ if ((trk->a[i].props = av_frame_alloc()) == NULL) {
++ pts_track_uninit(trk); // NOTE(review): also visits slots not yet assigned in this loop; assumes trk started zeroed
++ return AVERROR(ENOMEM);
++ }
++ }
++ return 0;
++}
++
++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
++{ // Query the device on ctx->fd and select the queue buffer types; returns 0 or a negative AVERROR code
++ struct v4l2_capability cap;
++ int ret;
++
++ memset(&cap, 0, sizeof(cap));
++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
++ if (ret < 0)
++ return AVERROR(errno); // ioctl() only returns -1 on failure; the actual reason is in errno
++
++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) // streaming I/O is required
++ return AVERROR(EINVAL);
++
++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M) { // single-planar M2M is preferred when offered
++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++
++ return 0;
++ }
++
++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) { // otherwise fall back to multi-planar M2M
++ ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++ ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++
++ return 0;
++ }
++
++ return AVERROR(EINVAL); // not a memory-to-memory device
++}
++
++static int deint_v4l2m2m_try_format(V4L2Queue *queue)
++{ // Check via VIDIOC_TRY_FMT that the queue accepts YUV420/NV12 at ctx size with the expected field order; 0 or AVERROR(EINVAL)
++ struct v4l2_format *fmt = &queue->format;
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ int ret, field;
++
++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
++ if (ret) // failure is only logged; the function carries on with whatever fmt holds
++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
++
++ if (V4L2_TYPE_IS_OUTPUT(fmt->type))
++ field = V4L2_FIELD_INTERLACED_TB; // the output queue is asked for interlaced, top field first
++ else
++ field = V4L2_FIELD_NONE; // the capture queue is asked for progressive
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
++ fmt->fmt.pix_mp.field = field;
++ fmt->fmt.pix_mp.width = ctx->width;
++ fmt->fmt.pix_mp.height = ctx->height;
++ } else {
++ fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
++ fmt->fmt.pix.field = field;
++ fmt->fmt.pix.width = ctx->width;
++ fmt->fmt.pix.height = ctx->height;
++ }
++
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height, // NOTE(review): reads the pix_mp member even for single-planar types
++ fmt->fmt.pix_mp.pixelformat,
++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
++
++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
++ if (ret)
++ return AVERROR(EINVAL);
++
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
++ fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
++ fmt->fmt.pix_mp.pixelformat,
++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) { // the driver may have adjusted the request: accept only YUV420 or NV12 with our field
++ if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
++ fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
++ fmt->fmt.pix_mp.field != field) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
++
++ return AVERROR(EINVAL);
++ }
++ } else { // same acceptance test for the single-planar layout
++ if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
++ fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
++ fmt->fmt.pix.field != field) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
++
++ return AVERROR(EINVAL);
++ }
++ }
++
++ return 0;
++}
++
++static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
++{
++    struct v4l2_format *fmt = &queue->format;
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    int ret;
++    // Applies the format with S_FMT, then sets the crop (output queue) or compose (capture queue) rectangle
++    struct v4l2_selection sel = {
++        .type = fmt->type,
++        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
++    };
++
++    // This works for most single object 4:2:0 types
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        fmt->fmt.pix_mp.pixelformat = pixelformat;
++        fmt->fmt.pix_mp.field = field;
++        fmt->fmt.pix_mp.width = width;
++        fmt->fmt.pix_mp.height = ysize / pitch;
++        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
++        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
++    } else {
++        fmt->fmt.pix.pixelformat = pixelformat;
++        fmt->fmt.pix.field = field;
++        fmt->fmt.pix.width = width;
++        fmt->fmt.pix.height = height;
++        fmt->fmt.pix.sizeimage = 0;
++        fmt->fmt.pix.bytesperline = 0;
++    }
++
++    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
++        return ret;
++    }
++
++    if (pixelformat != fmt->fmt.pix.pixelformat) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
++        return AVERROR(EINVAL);
++    }
++    // Selection failures below are only logged as warnings; the function still returns 0
++    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
++    }
++
++    sel.r.width = width;
++    sel.r.height = height;
++    sel.r.left = 0;
++    sel.r.top = 0;
++    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE; // was ',' (comma operator typo)
++    sel.flags = V4L2_SEL_FLAG_LE;
++
++    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
++    }
++
++    return 0;
++}
++
++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
++{
++ int ret;
++
++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
++ if (ctx->fd < 0)
++ return AVERROR(errno);
++
++ ret = deint_v4l2m2m_prepare_context(ctx);
++ if (ret)
++ goto fail;
++
++ ret = deint_v4l2m2m_try_format(&ctx->capture);
++ if (ret)
++ goto fail;
++
++ ret = deint_v4l2m2m_try_format(&ctx->output);
++ if (ret)
++ goto fail;
++
++ return 0;
++
++fail:
++ close(ctx->fd);
++ ctx->fd = -1;
++
++ return ret;
++}
++
++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
++{
++ int ret = AVERROR(EINVAL);
++ struct dirent *entry;
++ char node[PATH_MAX];
++ DIR *dirp;
++
++ dirp = opendir("/dev");
++ if (!dirp)
++ return AVERROR(errno);
++
++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
++
++ if (strncmp(entry->d_name, "video", 5))
++ continue;
++
++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
++ ret = deint_v4l2m2m_probe_device(ctx, node);
++ if (!ret)
++ break;
++ }
++
++ closedir(dirp);
++
++ if (ret) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
++ ctx->fd = -1;
++
++ return ret;
++ }
++
++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
++
++ return 0;
++}
++
++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
++{
++ int ret;
++
++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
++ if (ret < 0)
++ return AVERROR(errno);
++
++ buf->enqueued = 1;
++
++ return 0;
++}
++
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
++{
++ struct v4l2_exportbuffer expbuf;
++ int i, ret;
++ uint64_t mod = DRM_FORMAT_MOD_LINEAR;
++ uint32_t fmt = 0;
++
++ switch (pixelformat) {
++ case V4L2_PIX_FMT_NV12:
++ fmt = DRM_FORMAT_NV12;
++ break;
++ case V4L2_PIX_FMT_YUV420:
++ fmt = DRM_FORMAT_YUV420;
++ break;
++ default:
++ return AVERROR(EINVAL);
++ }
++
++ avbuf->drm_frame.layers[0].format = fmt;
++
++ for (i = 0; i < avbuf->num_planes; i++) {
++ memset(&expbuf, 0, sizeof(expbuf));
++
++ expbuf.index = avbuf->buffer.index;
++ expbuf.type = avbuf->buffer.type;
++ expbuf.plane = i;
++
++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
++ if (ret < 0)
++ return AVERROR(errno);
++
++ avbuf->fd = expbuf.fd;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
++ /* drm frame */
++ avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
++ avbuf->drm_frame.objects[i].fd = expbuf.fd;
++ avbuf->drm_frame.objects[i].format_modifier = mod;
++ } else {
++ /* drm frame */
++ avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
++ avbuf->drm_frame.objects[0].fd = expbuf.fd;
++ avbuf->drm_frame.objects[0].format_modifier = mod;
++ }
++ }
++
++ return 0;
++}
++
++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
++{
++    struct v4l2_format *fmt = &queue->format;
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_requestbuffers req;
++    int ret, i, j, multiplanar;
++    uint32_t memory;
++
++    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
++        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
++
++    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++
++    memset(&req, 0, sizeof(req));
++    req.count = queue->num_buffers;
++    req.memory = memory;
++    req.type = fmt->type;
++
++    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
++    if (ret < 0) {
++        ret = AVERROR(errno); // latch errno before av_log() has a chance to change it
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
++        return ret;
++    }
++
++    queue->num_buffers = req.count; // use the count the driver reported back
++    queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
++    if (!queue->buffers) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
++
++        return AVERROR(ENOMEM);
++    }
++
++    for (i = 0; i < queue->num_buffers; i++) {
++        V4L2Buffer *buf = &queue->buffers[i];
++
++        buf->enqueued = 0;
++        buf->fd = -1;
++        buf->q = queue;
++
++        buf->buffer.type = fmt->type;
++        buf->buffer.memory = memory;
++        buf->buffer.index = i;
++
++        if (multiplanar) {
++            buf->buffer.length = VIDEO_MAX_PLANES;
++            buf->buffer.m.planes = buf->planes;
++        }
++
++        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
++        if (ret < 0) {
++            ret = AVERROR(errno);
++
++            goto fail;
++        }
++
++        if (multiplanar)
++            buf->num_planes = buf->buffer.length;
++        else
++            buf->num_planes = 1;
++
++        for (j = 0; j < buf->num_planes; j++) {
++            V4L2PlaneInfo *info = &buf->plane_info[j];
++
++            if (multiplanar) {
++                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
++                info->length = buf->buffer.m.planes[j].length;
++            } else {
++                info->bytesperline = fmt->fmt.pix.bytesperline;
++                info->length = buf->buffer.length;
++            }
++        }
++
++        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
++            ret = deint_v4l2m2m_enqueue_buffer(buf);
++            if (ret)
++                goto fail;
++
++            ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
++            if (ret)
++                goto fail;
++        }
++    }
++
++    return 0;
++
++fail:
++    for (j = 0; j <= i; j++) // entries after i are still zero-filled: fd 0 there is not ours to close
++        if (queue->buffers[j].fd >= 0)
++            close(queue->buffers[j].fd);
++    av_free(queue->buffers);
++    queue->buffers = NULL;
++
++    return ret;
++}
++
++static int deint_v4l2m2m_streamon(V4L2Queue *queue)
++{
++ DeintV4L2M2MContextShared * const ctx = queue->ctx;
++ int type = queue->format.type;
++ int ret;
++
++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
++ if (ret < 0)
++ return AVERROR(errno);
++
++ return 0;
++}
++
++static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
++{
++ DeintV4L2M2MContextShared * const ctx = queue->ctx;
++ int type = queue->format.type;
++ int ret;
++
++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
++ if (ret < 0)
++ return AVERROR(errno);
++
++ return 0;
++}
++
++// timeout in ms; returns the dequeued buffer, or NULL on timeout, poll error or DQBUF failure
++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
++{
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_buffer buf = { 0 };
++    V4L2Buffer* avbuf = NULL;
++    struct pollfd pfd;
++    short events;
++    int ret;
++
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        events = POLLOUT | POLLWRNORM;
++    else
++        events = POLLIN | POLLRDNORM;
++
++    pfd.events = events;
++    pfd.fd = ctx->fd;
++    // Retry only when poll() itself failed with EINTR; errno is stale (not set) when poll() returns 0
++    for (;;) {
++        ret = poll(&pfd, 1, timeout);
++        if (ret > 0)
++            break;
++        if (ret < 0 && errno == EINTR)
++            continue;
++        return NULL;
++    }
++
++    if (pfd.revents & POLLERR)
++        return NULL;
++
++    if (pfd.revents & events) {
++        memset(&buf, 0, sizeof(buf));
++        buf.memory = V4L2_MEMORY_MMAP;
++        buf.type = queue->format.type;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            memset(planes, 0, sizeof(planes));
++            buf.length = VIDEO_MAX_PLANES;
++            buf.m.planes = planes;
++        }
++
++        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
++        if (ret) {
++            if (errno != EAGAIN)
++                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
++                       av_err2str(AVERROR(errno)));
++            return NULL;
++        }
++
++        avbuf = &queue->buffers[buf.index];
++        avbuf->enqueued = 0;
++        avbuf->buffer = buf;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            memcpy(avbuf->planes, planes, sizeof(planes)); // planes[] is on our stack: keep a copy in the buffer
++            avbuf->buffer.m.planes = avbuf->planes;
++        }
++        return avbuf;
++    }
++
++    return NULL;
++}
++
++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
++{
++ int i;
++ V4L2Buffer *buf = NULL;
++
++ for (i = 0; i < queue->num_buffers; i++)
++ if (!queue->buffers[i].enqueued) {
++ buf = &queue->buffers[i];
++ break;
++ }
++ return buf;
++}
++
++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
++{
++ int i;
++ V4L2Buffer *buf = NULL;
++
++ if (!queue || !queue->buffers)
++ return;
++ for (i = 0; i < queue->num_buffers; i++) {
++ buf = &queue->buffers[i];
++ if (queue->buffers[i].enqueued)
++ av_frame_unref(&buf->frame);
++ }
++}
++
++static void recycle_q(V4L2Queue * const queue)
++{
++ V4L2Buffer* avbuf;
++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
++ av_frame_unref(&avbuf->frame);
++ }
++}
++
++static int count_enqueued(V4L2Queue *queue)
++{
++ int i;
++ int n = 0;
++
++ if (queue->buffers == NULL)
++ return 0;
++
++ for (i = 0; i < queue->num_buffers; i++)
++ if (queue->buffers[i].enqueued)
++ ++n;
++ return n;
++}
++
++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
++{
++    DeintV4L2M2MContextShared *const ctx = queue->ctx;
++    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
++    V4L2Buffer *buf;
++    int i, ret;
++    // Reclaim output buffers the driver has finished with before looking for a free one
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        recycle_q(queue);
++
++    buf = deint_v4l2m2m_find_free_buf(queue);
++    if (!buf) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
++        return AVERROR(EAGAIN);
++    }
++    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
++        for (i = 0; i < drm_desc->nb_objects; i++)
++            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
++    else
++        buf->buffer.m.fd = drm_desc->objects[0].fd;
++
++    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
++        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
++        V4L2_FIELD_INTERLACED_BT;
++
++    if (ctx->field_order != buf->buffer.field) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
++        ctx->field_order = buf->buffer.field;
++    }
++
++    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
++
++    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
++    av_frame_move_ref(&buf->frame, frame); // buf->frame now holds the caller's refs
++    if ((ret = deint_v4l2m2m_enqueue_buffer(buf)) != 0)
++        av_frame_unref(&buf->frame); // QBUF failed: buffer stays free, so drop the refs or the next move_ref leaks them
++    return ret;
++}
++
++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
++{
++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
++ V4L2Queue *capture = &ctx->capture;
++ V4L2Queue *output = &ctx->output;
++ int i;
++
++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
++
++ if (ctx->fd >= 0) {
++ deint_v4l2m2m_streamoff(capture);
++ deint_v4l2m2m_streamoff(output);
++ }
++
++ if (capture->buffers)
++ for (i = 0; i < capture->num_buffers; i++) {
++ capture->buffers[i].q = NULL;
++ if (capture->buffers[i].fd >= 0)
++ close(capture->buffers[i].fd);
++ }
++
++ deint_v4l2m2m_unref_queued(output);
++
++ av_buffer_unref(&ctx->hw_frames_ctx);
++
++ if (capture->buffers)
++ av_free(capture->buffers);
++
++ if (output->buffers)
++ av_free(output->buffers);
++
++ if (ctx->fd >= 0) {
++ close(ctx->fd);
++ ctx->fd = -1;
++ }
++
++ av_free(ctx);
++ }
++}
++
++static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++{
++ V4L2Buffer *buf = opaque;
++ DeintV4L2M2MContextShared *ctx = buf->q->ctx;
++
++ if (!ctx->done)
++ deint_v4l2m2m_enqueue_buffer(buf);
++
++ deint_v4l2m2m_destroy_context(ctx);
++}
++
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
++{
++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor *layer;
++
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_objects = avbuf->num_planes;
++ drm_desc->nb_layers = 1;
++
++ layer = &drm_desc->layers[0];
++ layer->nb_planes = avbuf->num_planes;
++
++ for (int i = 0; i < avbuf->num_planes; i++) {
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = 0;
++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
++ }
++
++ switch (layer->format) {
++ case DRM_FORMAT_YUYV:
++ layer->nb_planes = 1;
++ break;
++
++ case DRM_FORMAT_NV12:
++ case DRM_FORMAT_NV21:
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 2;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++ break;
++
++ case DRM_FORMAT_YUV420:
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 3;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
++
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset +
++ ((avbuf->plane_info[0].bytesperline *
++ height) >> 2);
++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0;
++ break;
++ }
++
++ return (uint8_t *) drm_desc;
++}
++
++// timeout in ms
++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
++{
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ V4L2Buffer* avbuf;
++
++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
++ if (!avbuf) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
++ return AVERROR(EAGAIN);
++ }
++
++ // Fill in PTS and ancillary info from src frame
++ // we will want to overwrite some fields as only the pts/dts
++ // fields are updated with new timing in this fn
++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
++
++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
++ sizeof(avbuf->drm_frame), v4l2_free_buffer,
++ avbuf, AV_BUFFER_FLAG_READONLY);
++ if (!frame->buf[0]) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
++ return AVERROR(ENOMEM);
++ }
++
++ atomic_fetch_add(&ctx->refcount, 1);
++
++ frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ if (ctx->hw_frames_ctx)
++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
++ frame->height = ctx->height;
++ frame->width = ctx->width;
++
++ // Not interlaced now
++ frame->interlaced_frame = 0;
++ frame->top_field_first = 0;
++ // Pkt duration halved
++ frame->pkt_duration /= 2;
++
++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
++ }
++
++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
++ return 0;
++}
++
++static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
++{
++ AVFilterLink *inlink = outlink->src->inputs[0];
++ AVFilterContext *avctx = outlink->src;
++ DeintV4L2M2MContext *priv = avctx->priv;
++ DeintV4L2M2MContextShared *ctx = priv->shared;
++ int ret;
++
++ ctx->height = avctx->inputs[0]->h;
++ ctx->width = avctx->inputs[0]->w;
++
++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
++
++ outlink->time_base = inlink->time_base;
++ outlink->w = inlink->w;
++ outlink->h = inlink->h;
++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
++ outlink->format = inlink->format;
++ outlink->frame_rate = (AVRational) {1, 0}; // Deny knowledge of frame rate
++
++ ret = deint_v4l2m2m_find_device(ctx);
++ if (ret)
++ return ret;
++
++ if (inlink->hw_frames_ctx) {
++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
++ if (!ctx->hw_frames_ctx)
++ return AVERROR(ENOMEM);
++ }
++ return 0;
++}
++
++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
++{
++ static const enum AVPixelFormat pixel_formats[] = {
++ AV_PIX_FMT_DRM_PRIME,
++ AV_PIX_FMT_YUV420P,
++ AV_PIX_FMT_NONE,
++ };
++
++ return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats));
++}
++
++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
++{
++ const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
++ drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
++
++ switch (drm_desc->layers[0].format) {
++ case DRM_FORMAT_YUV420:
++ if (is_linear)
++ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
++ break;
++ case DRM_FORMAT_NV12:
++ if (is_linear)
++ return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
++ break;
++ default:
++ break;
++ }
++ return 0;
++}
++
++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
++{
++ AVFilterContext *avctx = link->dst;
++ DeintV4L2M2MContext *priv = avctx->priv;
++ DeintV4L2M2MContextShared *ctx = priv->shared;
++ V4L2Queue *capture = &ctx->capture;
++ V4L2Queue *output = &ctx->output;
++ int ret;
++
++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
++ __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
++
++ if (ctx->field_order == V4L2_FIELD_ANY) {
++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
++ const uint32_t pixelformat = desc_pixelformat(drm_desc);
++
++ if (pixelformat == 0) {
++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
++ av_fourcc2str(drm_desc->layers[0].format),
++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
++ return AVERROR(EINVAL);
++ }
++
++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
++
++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
++
++ ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
++ if (ret)
++ return ret;
++
++ ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
++ if (ret)
++ return ret;
++
++ ret = deint_v4l2m2m_allocate_buffers(capture);
++ if (ret)
++ return ret;
++
++ ret = deint_v4l2m2m_streamon(capture);
++ if (ret)
++ return ret;
++
++ ret = deint_v4l2m2m_allocate_buffers(output);
++ if (ret)
++ return ret;
++
++ ret = deint_v4l2m2m_streamon(output);
++ if (ret)
++ return ret;
++
++ if (in->top_field_first)
++ ctx->field_order = V4L2_FIELD_INTERLACED_TB;
++ else
++ ctx->field_order = V4L2_FIELD_INTERLACED_BT;
++
++ }
++
++ ret = deint_v4l2m2m_enqueue_frame(output, in);
++
++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
++ return ret;
++}
++
++static int deint_v4l2m2m_activate(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared *const s = priv->shared;
++    AVFilterLink * const outlink = avctx->outputs[0];
++    AVFilterLink * const inlink = avctx->inputs[0];
++    int n = 0;
++    int cn = 99;
++    int instatus = 0;
++    int64_t inpts = 0;
++    int did_something = 0;
++
++    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
++
++    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
++
++    if (!ff_outlink_frame_wanted(outlink)) {
++        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
++    }
++    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
++    {
++        AVFrame * frame;
++        int rv;
++
++again:
++        frame = av_frame_alloc(); // fresh frame on every pass: ff_filter_frame() below takes ownership of the previous one
++        recycle_q(&s->output);
++        n = count_enqueued(&s->output);
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
++            return AVERROR(ENOMEM);
++        }
++
++        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
++        if (rv != 0) {
++            av_frame_free(&frame);
++            if (rv != AVERROR(EAGAIN)) {
++                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
++                return rv;
++            }
++        }
++        else {
++            frame->interlaced_frame = 0;
++            // frame is always consumed by filter_frame - even on error despite
++            // a somewhat confusing comment in the header
++            rv = ff_filter_frame(outlink, frame);
++
++            if (instatus != 0) {
++                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
++                goto again;
++            }
++
++            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
++            did_something = 1;
++        }
++
++        cn = count_enqueued(&s->capture);
++    }
++
++    if (instatus != 0) {
++        ff_outlink_set_status(outlink, instatus, inpts);
++        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
++        return 0;
++    }
++
++    recycle_q(&s->output);
++    n = count_enqueued(&s->output);
++
++    while (n < 6) {
++        AVFrame * frame;
++        int rv;
++
++        if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
++            av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
++            return rv;
++        }
++
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
++            break;
++        }
++
++        rv = deint_v4l2m2m_filter_frame(inlink, frame);
++        av_frame_free(&frame);
++
++        if (rv != 0)
++            return rv;
++
++        av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
++        ++n;
++    }
++
++    if (n < 6) {
++        ff_inlink_request_frame(inlink);
++        did_something = 1;
++        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
++    }
++
++    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
++        ff_filter_set_ready(avctx, 1);
++        did_something = 1;
++        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
++    }
++
++    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
++    return did_something ? 0 : FFERROR_NOT_READY;
++}
++
++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
++
++    if (!ctx) {
++        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
++        return AVERROR(ENOMEM);
++    }
++    priv->shared = ctx;
++    ctx->logctx = priv;
++    ctx->fd = -1;
++    ctx->output.ctx = ctx;
++    ctx->output.num_buffers = 8;
++    ctx->capture.ctx = ctx;
++    ctx->capture.num_buffers = 12;
++    ctx->done = 0;
++    ctx->field_order = V4L2_FIELD_ANY;
++
++    atomic_init(&ctx->refcount, 1);
++
++    // Propagate a pts_track_init() failure instead of ignoring it. The refcount
++    // is set up first because deint_v4l2m2m_uninit() drops it when releasing ctx.
++    return pts_track_init(&ctx->track, priv);
++}
++
++static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext *priv = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++    if (ctx == NULL) return; // init failed before the shared context was allocated
++    ctx->done = 1;
++    ctx->logctx = NULL; // Log to NULL works, log to missing crashes
++    pts_track_uninit(&ctx->track);
++    deint_v4l2m2m_destroy_context(ctx);
++}
++
++static const AVOption deinterlace_v4l2m2m_options[] = {
++ { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
++
++static const AVFilterPad deint_v4l2m2m_inputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ },
++ { NULL }
++};
++
++static const AVFilterPad deint_v4l2m2m_outputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .config_props = deint_v4l2m2m_config_props,
++ },
++ { NULL }
++};
++
++AVFilter ff_vf_deinterlace_v4l2m2m = {
++ .name = "deinterlace_v4l2m2m",
++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
++ .priv_size = sizeof(DeintV4L2M2MContext),
++ .init = &deint_v4l2m2m_init,
++ .uninit = &deint_v4l2m2m_uninit,
++ .query_formats = &deint_v4l2m2m_query_formats,
++ .inputs = deint_v4l2m2m_inputs,
++ .outputs = deint_v4l2m2m_outputs,
++ .priv_class = &deinterlace_v4l2m2m_class,
++ .activate = deint_v4l2m2m_activate,
++};
+--- /dev/null
++++ b/libavfilter/vf_unsand.c
+@@ -0,0 +1,234 @@
++/*
++ * Copyright (c) 2007 Bobby Bingham
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * unsand video filter: convert sand pix fmts to yuv
++ */
++
++#include <string.h>
++
++#include "libavutil/internal.h"
++#include "libavutil/mem.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/opt.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct UnsandContext {
++ const AVClass *class;
++} UnsandContext;
++
++static av_cold void uninit(AVFilterContext *ctx)
++{
++// UnsandContext *s = ctx->priv;
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++// UnsandContext *s = ctx->priv;
++
++ return 0;
++}
++
++
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++ AVFilterLink * const outlink = link->dst->outputs[0];
++ AVFrame *out = NULL;
++ int rv = 0;
++
++ if (outlink->format == in->format) {
++ // If nothing to do then do nothing
++ out = in;
++ }
++ else
++ {
++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
++ {
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ if (av_rpi_sand_to_planar_frame(out, in) != 0)
++ {
++ rv = -1;
++ goto fail;
++ }
++
++ av_frame_free(&in);
++ }
++
++ return ff_filter_frame(outlink, out);
++
++fail:
++ av_frame_free(&out);
++ av_frame_free(&in);
++ return rv;
++}
++
++#if 0
++static void dump_fmts(const AVFilterFormats * fmts)
++{
++ int i;
++ if (fmts== NULL) {
++ printf("NULL\n");
++ return;
++ }
++ for (i = 0; i < fmts->nb_formats; ++i) {
++ printf(" %d", fmts->formats[i]);
++ }
++ printf("\n");
++}
++#endif
++
++static int query_formats(AVFilterContext *ctx)
++{
++//    UnsandContext *s = ctx->priv;
++    int ret;
++
++    // If we aren't connected at both ends then just do nothing
++    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
++        return 0;
++
++//    printf("Unsand: %s in: ", __func__);
++//    dump_fmts(ctx->inputs[0]->in_formats);
++//    printf("Unsand: %s out: ", __func__);
++//    dump_fmts(ctx->outputs[0]->out_formats);
++
++    // Our output formats depend on our input formats and we can't/don't
++    // want to convert between bit depths so we need to wait for the source
++    // to have an opinion before we do
++    if (ctx->inputs[0]->in_formats == NULL)
++        return AVERROR(EAGAIN);
++
++    // Accept anything
++    if (ctx->inputs[0]->out_formats == NULL &&
++        (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
++        return ret;
++
++    // Filter out sand formats
++
++    // Generate a container if we don't already have one
++    if (ctx->outputs[0]->in_formats == NULL)
++    {
++        // Somewhat rubbish way of ensuring we have a good structure
++        static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
++        AVFilterFormats *formats = ff_make_format_list(out_fmts);
++
++        if (formats == NULL)
++            return AVERROR(ENOMEM);
++        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
++            return ret;
++    }
++
++    // Replace old format list with new filtered list derived from what our
++    // input says it can do
++    {
++        const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
++        AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
++        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++        int i;
++        int n = 0;
++        int seen_420p = 0;
++        int seen_420p10 = 0;
++        if (dst_fmts == NULL) // allocation failed: leave dst_ff untouched
++            return AVERROR(ENOMEM);
++        for (i = 0; i < src_ff->nb_formats; ++i) {
++            const enum AVPixelFormat f = src_ff->formats[i];
++            switch (f){
++                case AV_PIX_FMT_YUV420P:
++                case AV_PIX_FMT_SAND128:
++                case AV_PIX_FMT_RPI4_8:
++                    if (!seen_420p) {
++                        seen_420p = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++                    }
++                    break;
++                case AV_PIX_FMT_SAND64_10:
++                case AV_PIX_FMT_YUV420P10:
++                case AV_PIX_FMT_RPI4_10:
++                    if (!seen_420p10) {
++                        seen_420p10 = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++                    }
++                    break;
++                default:
++                    dst_fmts[n++] = f;
++                    break;
++            }
++        }
++
++        av_freep(&dst_ff->formats);
++        dst_ff->formats = dst_fmts;
++        dst_ff->nb_formats = n;
++    }
++
++//    printf("Unsand: %s calc: ", __func__);
++//    dump_fmts(ctx->outputs[0]->in_formats);
++
++    return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x)
++static const AVOption unsand_options[] = {
++ { NULL }
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .filter_frame = filter_frame,
++ },
++ { NULL }
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO
++ },
++ { NULL }
++};
++
++AVFilter ff_vf_unsand = {
++ .name = "unsand",
++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++ .init = init,
++ .uninit = uninit,
++
++ .query_formats = query_formats,
++
++ .priv_size = sizeof(UnsandContext),
++ .priv_class = &unsand_class,
++
++ .inputs = avfilter_vf_unsand_inputs,
++ .outputs = avfilter_vf_unsand_outputs,
++};
++
+--- a/libavformat/utils.c
++++ b/libavformat/utils.c
+@@ -3051,6 +3051,40 @@ static int has_codec_parameters(AVStream
+ return 1;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// This should be quite general purpose but avoid possible conflicts
++// by limiting usage to cases where we know it works.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++ // Fallback lookup is only attempted for HEVC; any other codec id gives NULL
++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++ int err;
++
++ // No fallback found, or old_codec already is the fallback: report not-found
++ if (new_codec == NULL || new_codec == old_codec)
++ {
++ return AVERROR_DECODER_NOT_FOUND;
++ }
++
++ // NOTE: this may be dodgy - the header says not to use this fn (presumably
++ // avcodec_close()), especially if we are going to reopen the context...
++ // (but it does seem to work for our cases)
++ if (avcodec_is_open(avctx)) {
++ avcodec_close(avctx);
++ }
++
++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
++ {
++ return err;
++ }
++
++ return 0;
++}
++#else
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+ const AVPacket *avpkt, AVDictionary **options)
+@@ -3085,7 +3119,11 @@ static int try_decode_frame(AVFormatCont
+ av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
+ if (s->codec_whitelist)
+ av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++ {
++ // Try fallback if it looks worth a try
++ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++ }
+ if (!options)
+ av_dict_free(&thread_opt);
+ if (ret < 0) {
+@@ -3116,6 +3154,14 @@ static int try_decode_frame(AVFormatCont
+ if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+ avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+ ret = avcodec_send_packet(avctx, &pkt);
++
++ // If we are going to want to fall back we should know here
++ if (ret == AVERROR_DECODER_NOT_FOUND) {
++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++ break;
++ continue;
++ }
++
+ if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+ break;
+ if (ret >= 0)
+@@ -3726,9 +3772,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ // Try to just open decoders, in case this is enough to get parameters.
+ if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
+ if (codec && !avctx->codec)
+- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+- av_log(ic, AV_LOG_WARNING,
+- "Failed to open codec in %s\n",__FUNCTION__);
++ {
++ int err;
++
++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++ {
++ if (err == AVERROR_DECODER_NOT_FOUND) {
++ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
++ }
++ if (err < 0) {
++ av_log(ic, AV_LOG_WARNING,
++ "Failed to open codec in %s\n",__FUNCTION__);
++ }
++ }
++ }
+ }
+ if (!options)
+ av_dict_free(&thread_opt);
+--- a/libavutil/Makefile
++++ b/libavutil/Makefile
+@@ -68,6 +68,7 @@ HEADERS = adler32.h
+ rational.h \
+ replaygain.h \
+ ripemd.h \
++ rpi_sand_fns.h \
+ samplefmt.h \
+ sha.h \
+ sha512.h \
+@@ -86,6 +87,7 @@ HEADERS = adler32.h
+ tx.h \
+
+ HEADERS-$(CONFIG_LZO) += lzo.h
++HEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h
+
+ ARCH_HEADERS = bswap.h \
+ intmath.h \
+@@ -180,6 +182,7 @@ OBJS-$(CONFIG_LZO)
+ OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o
+ OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
+ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
+--- a/libavutil/aarch64/Makefile
++++ b/libavutil/aarch64/Makefile
+@@ -1,4 +1,6 @@
+ OBJS += aarch64/cpu.o \
+ aarch64/float_dsp_init.o \
+
+-NEON-OBJS += aarch64/float_dsp_neon.o
++NEON-OBJS += aarch64/float_dsp_neon.o \
++ aarch64/rpi_sand_neon.o \
++
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -0,0 +1,781 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#include "asm.S"
++
++// void ff_rpi_sand8_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ // w15 = h, the number of rows to process (loaded from the stack)
++ ldr w15, [sp, #0]
++
++ // w8 will contain the number of full 128-byte blocks copied per row
++ // w8 = w1 >> 7; NOTE(review): w1 is dst_stride, not _w (w7) - confirm intent
++ // stride1 is assumed to always be 128
++ mov w8, w1
++ lsr w8, w8, #7
++
++ // in case the width of the image is not a multiple of 128, there will
++ // be an incomplete block at the end of every row
++ // w9 contains the number of pixels stored within this block
++ // w9 = _w - w8 * 128 (zero or negative means no partial block is copied)
++ lsl w9, w8, #7
++ sub w9, w7, w9
++
++ // this is the value we have to add to the src pointer after reading a complete block
++ // it will move the address to the start of the next block
++ // w10 = stride2 * stride1 - stride1 (with stride1 == 128)
++ mov w10, w4
++ lsl w10, w10, #7
++ sub w10, w10, #128
++
++ // w11 is the row offset: the byte offset from src to the first block of the current row
++ // this will be increased with stride1 within every iteration of the row_loop
++ eor w11, w11, w11
++
++ // w12 = 0, processed row count
++ eor w12, w12, w12
++row_loop:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x2
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ // skip the full-block loop entirely when there are no full blocks
++ cmp w8, #0
++ beq no_main_y8
++
++block_loop:
++ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
++ // v0-v7 are not callee-saved, so they do not need to be backed up
++ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64
++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64
++
++ // write these registers back to the destination vector and increase the dst address by 128
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64
++
++ // move the source register to the beginning of the next block (x13 = src + block offset)
++ add x13, x13, x10
++ // increase the block counter
++ add w14, w14, #1
++
++ // continue with the block_loop if we haven't copied all full blocks yet
++ cmp w8, w14
++ bgt block_loop
++
++ // handle the last block at the end of each row
++ // at most 127 byte values copied from src to dst, one byte per iteration
++no_main_y8:
++ eor w5, w5, w5 // i = 0 (w5 held _x, which this function never reads)
++incomplete_block_loop_y8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_y8
++
++ ldrb w6, [x13] // w6 is scratch here; the incoming y argument is never read
++ strb w6, [x0]
++ add x13, x13, #1
++ add x0, x0, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_y8
++incomplete_block_loop_end_y8:
++
++
++ // increase the row offset by 128 (stride1); x0 is not adjusted by dst_stride here
++ add w11, w11, #128
++ // increment the row counter
++ add w12, w12, #1
++
++ // process the next row if we haven't finished yet
++ cmp w15, w12
++ bgt row_loop
++
++ ret
++endfunc
++
++
++
++// void ff_rpi_sand8_lines_to_planar_c8(
++// uint8_t * dst_u, : x0
++// unsigned int dst_stride_u, : w1 == width
++// uint8_t * dst_v, : x2
++// unsigned int dst_stride_v, : w3 == width
++// const uint8_t * src, : x4
++// unsigned int stride1, : w5 == 128
++// unsigned int stride2, : w6
++// unsigned int _x, : w7
++// unsigned int y, : [sp, #0]
++// unsigned int _w, : [sp, #8]
++// unsigned int h); : [sp, #16]
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ // w7 = _w, loaded from the stack (overwrites _x; y at [sp, #0] is never read)
++ ldr w7, [sp, #8]
++
++ // w15 contains the number of rows we need to process
++ // counts down
++ ldr w15, [sp, #16]
++
++ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
++ mov w8, w7
++ lsr w8, w8, #6
++
++ // number of U/V pairs in the partial block at the end of every row
++ // w9 = _w - (w8 * 64)
++ lsl w9, w8, #6
++ sub w9, w7, w9
++
++ // Skip at the end of the line to account for stride (dst_stride_u is used for both planes)
++ sub w12, w1, w7
++
++ // address delta to the beginning of the next block
++ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
++ lsl w10, w6, #7
++ sub w10, w10, #128
++
++ // w11 = row address start offset = 0
++ eor w11, w11, w11
++
++row_loop_c8:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x4
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ // skip the full-block loop entirely when there are no full blocks
++ cmp w8, #0
++ beq no_main_c8
++
++block_loop_c8:
++ // load the full block -> 128 bytes = 64 interleaved U/V pairs; ld2 de-interleaves them
++ ld2 { v0.16b, v1.16b }, [x13], #32
++ ld2 { v2.16b, v3.16b }, [x13], #32
++ ld2 { v4.16b, v5.16b }, [x13], #32
++ ld2 { v6.16b, v7.16b }, [x13], #32
++
++ // regroup so v0-v3 hold the U bytes and v4-v7 the V bytes, one st1 per plane
++ mov v16.16b, v1.16b
++ mov v17.16b, v3.16b
++ mov v18.16b, v5.16b
++ mov v1.16b, v2.16b
++ mov v2.16b, v4.16b
++ mov v3.16b, v6.16b
++ mov v4.16b, v16.16b
++ mov v5.16b, v17.16b
++ mov v6.16b, v18.16b
++
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64
++
++ // increment the block counter and move src to the beginning of the next block
++ add w14, w14, #1
++ add x13, x13, x10
++
++ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
++ cmp w8, w14
++ bgt block_loop_c8
++
++no_main_c8:
++ // handle incomplete block at the end of every row
++ eor w5, w5, w5 // w5 = 0, counter of U/V pairs copied from the partial block
++incomplete_block_loop_c8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_c8
++
++ ldrb w1, [x13] // w1 is scratch from here on; w12 was already derived from it
++ strb w1, [x0]
++ add x13, x13, #1
++
++ ldrb w1, [x13]
++ strb w1, [x2]
++ add x13, x13, #1
++
++ add x0, x0, #1
++ add x2, x2, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_c8
++incomplete_block_loop_end_c8:
++
++ // increase row_offset by stride1 and skip the dst padding at the end of the row
++ add w11, w11, #128
++ add x0, x0, w12, sxtw
++ add x2, x2, w12, sxtw
++
++ // loop back to row_loop_c8 while rows remain (w15 counts down to zero)
++ subs w15, w15, #1
++ bgt row_loop_c8
++
++ ret
++endfunc
++
++//void ff_rpi_sand30_lines_to_planar_c16(
++// uint8_t * dst_u, // [x0]
++// unsigned int dst_stride_u, // [w1] == _w*2
++// uint8_t * dst_v, // [x2]
++// unsigned int dst_stride_v, // [w3] == _w*2
++// const uint8_t * src, // [x4]
++// unsigned int stride1, // [w5] == 128
++// unsigned int stride2, // [w6]
++// unsigned int _x, // [w7] == 0
++// unsigned int y, // [sp, #0] == 0
++// unsigned int _w, // [sp, #8] -> w3
++// unsigned int h); // [sp, #16] -> w7
++
++.macro rpi_sand30_lines_to_planar_c16_block_half
++ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
++
++ xtn v4.4h, v0.4s
++ ushr v0.4s, v0.4s, #10
++ xtn v5.4h, v0.4s
++ ushr v0.4s, v0.4s, #10
++ xtn v6.4h, v0.4s
++ xtn2 v4.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v5.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v6.8h, v1.4s
++ and v4.16b, v4.16b, v16.16b
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
++
++ xtn v4.4h, v2.4s
++ ushr v2.4s, v2.4s, #10
++ xtn v5.4h, v2.4s
++ ushr v2.4s, v2.4s, #10
++ xtn v6.4h, v2.4s
++ xtn2 v4.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v5.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v6.8h, v3.4s
++ and v4.16b, v4.16b, v16.16b
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp]
++ sub sp, sp, #48
++.endm
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ stp x19, x20, [sp, #-48]! // save the callee-saved registers used below
++ stp x21, x22, [sp, #16]
++ stp x23, x24, [sp, #32]
++
++ ldr w3, [sp, #48+8] // w3 = width
++ ldr w7, [sp, #48+16] // w7 = height
++
++ // reserve space on the stack for intermediate results
++ sub sp, sp, #256
++
++ // number of full 128-byte blocks per row (48 U/V pairs each), w8 = width / 48
++ mov w9, #48
++ udiv w8, w3, w9
++
++ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
++ mul w9, w8, w9
++ sub w9, w3, w9
++
++ // row offset, the beginning of the next row to process
++ eor w10, w10, w10
++
++ // offset to the beginning of the next block, w11 = stride2 * 128 - 128
++ lsl w11, w6, #7
++ sub w11, w11, #128
++
++ // decrease the height by one and in case of remaining pixels increase the block count by one
++ sub w7, w7, #1
++ cmp w9, #0
++ cset w19, ne // w19 == 1 iff remaining pixels != 0
++ add w8, w8, w19
++
++ // bytes we have to move dst back by at the end of every row
++ mov w21, #48*2
++ mul w21, w21, w8
++ sub w21, w1, w21
++
++ mov w20, #0 // w20 = flag, last row processed
++
++ mov x12, #0x03ff03ff03ff03ff // 10-bit mask replicated into every halfword
++ dup v16.2d, x12
++
++ // iterate through rows, row counter = w12 = 0
++ eor w12, w12, w12
++row_loop_c16:
++ cmp w12, w7
++ bge row_loop_c16_fin
++
++ // address of row data = src + row_offset
++ mov x13, x4
++ add x13, x13, x10
++
++ eor w14, w14, w14
++block_loop_c16:
++ cmp w14, w8
++ bge block_loop_c16_fin
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ add x13, x13, x11 // offset to next block
++ add w14, w14, #1
++ b block_loop_c16
++block_loop_c16_fin:
++
++ add w10, w10, #128
++ add w12, w12, #1
++ add x0, x0, w21, sxtw // move dst pointers back by x21
++ add x2, x2, w21, sxtw
++ b row_loop_c16
++row_loop_c16_fin:
++
++ cmp w20, #1
++ beq row_loop_c16_fin2
++ mov w20, #1
++ sub w8, w8, w19 // decrease block count by w19
++ add w7, w7, #1 // increase height
++ b row_loop_c16
++
++row_loop_c16_fin2:
++ sub x0, x0, w21, sxtw // undo the x21 adjustment made after the last row
++ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
++
++ // last incomplete block to be finished
++ // read operations are fine, stride2 is more than large enough even if rem_pix is 0
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp], #32
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #160
++
++ mov x4, sp
++ eor w20, w20, w20
++rem_pix_c16_loop:
++ cmp w20, w9
++ bge rem_pix_c16_fin
++
++ ldr w22, [x4], #4 // one U/V pair: U in bits 0-15, V in bits 16-31
++ strh w22, [x0], #2 // halfword store: a word store would write 2 bytes past the row end
++ lsr w22, w22, #16
++ strh w22, [x2], #2 // halfword store, as for U
++
++ add w20, w20, #1
++ b rem_pix_c16_loop
++rem_pix_c16_fin:
++
++ add sp, sp, #256
++
++ ldp x23, x24, [sp, #32]
++ ldp x21, x22, [sp, #16]
++ ldp x19, x20, [sp], #48
++ ret
++endfunc
++
++
++
++//void ff_rpi_sand30_lines_to_planar_p010(
++// uint8_t * dest,
++// unsigned int dst_stride,
++// const uint8_t * src,
++// unsigned int src_stride1,
++// unsigned int src_stride2,
++// unsigned int _x,
++// unsigned int y,
++// unsigned int _w,
++// unsigned int h);
++
++// void ff_rpi_sand30_lines_to_planar_y16(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64
++ sub w1, w1, w7, lsl #1
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7
++ ldr w6, [sp, #0]
++
++10:
++ mov x2, x8
++ mov w5, w7
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #14
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #10
++
++ shrn2 v18.8h, v1.4s, #14
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #10
++
++ ushr v18.8h, v18.8h, #6
++ bic v16.8h, #0xfc, lsl #8
++ bic v17.8h, #0xfc, lsl #8
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #14
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #10
++
++ shrn2 v21.8h, v3.4s, #14
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #10
++
++ ushr v21.8h, v21.8h, #6
++ bic v19.8h, #0xfc, lsl #8
++ bic v20.8h, #0xfc, lsl #8
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #14
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #10
++
++ shrn2 v24.8h, v5.4s, #14
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #10
++
++ ushr v24.8h, v24.8h, #6
++ bic v22.8h, #0xfc, lsl #8
++ bic v23.8h, #0xfc, lsl #8
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #14
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #10
++
++ shrn2 v27.8h, v7.4s, #14
++ xtn2 v25.8h, v7.4s
++ shrn2 v26.8h, v7.4s, #10
++
++ ushr v27.8h, v27.8h, #6
++ bic v25.8h, #0xfc, lsl #8
++ bic v26.8h, #0xfc, lsl #8
++
++ blt 2f
++
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48
++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v22.16b
++ mov v17.16b, v23.16b
++ sub w5, w5, #48
++ mov v18.16b, v24.16b
++ mov v19.16b, v25.16b
++ mov v20.16b, v26.16b
++ mov v21.16b, v27.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v19.16b
++ mov v17.16b, v20.16b
++ sub w5, w5, #24
++ mov v18.16b, v21.16b
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #12
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #6
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #3
++ mov v17.4h[0], v17.4h[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.h, v17.h}[0], [x0], #4
++ b 11b
++1:
++ st1 {v16.h}[0], [x0], #2
++ b 11b
++
++endfunc
++
++// void ff_rpi_sand30_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64
++ sub w1, w1, w7
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7
++ ldr w6, [sp, #0]
++
++10:
++ mov x2, x8
++ mov w5, w7
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #16
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #12
++
++ shrn2 v18.8h, v1.4s, #16
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #12
++
++ shrn v18.8b, v18.8h, #6
++ shrn v16.8b, v16.8h, #2
++ xtn v17.8b, v17.8h
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #16
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #12
++
++ shrn2 v21.8h, v3.4s, #16
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #12
++
++ shrn2 v18.16b, v21.8h, #6
++ shrn2 v16.16b, v19.8h, #2
++ xtn2 v17.16b, v20.8h
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #16
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #12
++
++ shrn2 v24.8h, v5.4s, #16
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #12
++
++ shrn v21.8b, v24.8h, #6
++ shrn v19.8b, v22.8h, #2
++ xtn v20.8b, v23.8h
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #16
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #12
++
++ shrn2 v27.8h, v7.4s, #16
++ xtn2 v25.8h, v7.4s
++ shrn2 v26.8h, v7.4s, #12
++
++ shrn2 v21.16b, v27.8h, #6
++ shrn2 v19.16b, v25.8h, #2
++ xtn2 v20.16b, v26.8h
++
++ blt 2f
++
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ beq 11b
++ mov v16.16b, v22.16b
++ mov v17.16b, v23.16b
++ sub w5, w5, #48
++ mov v18.16b, v24.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #24
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #12
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #6
++ mov v17.4h[0], v17.4h[1]
++ mov v18.4h[0], v18.4h[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ beq 11b
++ mov v16.8b[0], v16.8b[1]
++ sub w5, w5, #3
++ mov v17.8b[0], v17.8b[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.b, v17.b}[0], [x0], #2
++ b 11b
++1:
++ st1 {v16.b}[0], [x0], #1
++ b 11b
++
++endfunc
++
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.h
+@@ -0,0 +1,59 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#pragma once
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
++ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
++ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++#ifdef __cplusplus
++}
++#endif
++
+--- a/libavutil/arm/Makefile
++++ b/libavutil/arm/Makefile
+@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o
+
+ NEON-OBJS += arm/float_dsp_init_neon.o \
+ arm/float_dsp_neon.o \
++ arm/rpi_sand_neon.o \
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.S
+@@ -0,0 +1,925 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "libavutil/arm/asm.S"
++
++
++@ General notes:
++@ Having done some timing on this in sand8->y8 (Pi4)
++@ vst1 (680fps) is a bit faster than vstm (660fps)
++@ vldm (680fps) is noticeably faster than vld1 (480fps)
++@ (or it might be that a mix is what is required)
++@
++@ At least on a Pi4 it is no more expensive to have a single auto-inc register
++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
++@ the latter was better)
++@
++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
++@ the memory is uncached.
++@ As these are Sand -> planar we can assume that src is going to be aligned but
++@ it is possible that dest isn't (converting to .yuv or other packed format).
++@ Luckily vst1 is faster than vstm :-) so all is well
++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
++@ .8 stores would let us do non-word aligned stores into uncached but it
++@ probably isn't worth it.
++
++
++
++
++@ void ff_rpi_sand128b_stripe_to_8_10(
++@ uint8_t * dest, // [r0]
++@ const uint8_t * src1, // [r1]
++@ const uint8_t * src2, // [r2]
++@ unsigned int lines); // [r3]
++
++.macro stripe2_to_8, bit_depth
++ vpush {q4-q7}
++1:
++ vldm r1!, {q0-q7}
++ subs r3, #1
++ vldm r2!, {q8-q15}
++ vqrshrn.u16 d0, q0, #\bit_depth - 8
++ vqrshrn.u16 d1, q1, #\bit_depth - 8
++ vqrshrn.u16 d2, q2, #\bit_depth - 8
++ vqrshrn.u16 d3, q3, #\bit_depth - 8
++ vqrshrn.u16 d4, q4, #\bit_depth - 8
++ vqrshrn.u16 d5, q5, #\bit_depth - 8
++ vqrshrn.u16 d6, q6, #\bit_depth - 8
++ vqrshrn.u16 d7, q7, #\bit_depth - 8
++ vqrshrn.u16 d8, q8, #\bit_depth - 8
++ vqrshrn.u16 d9, q9, #\bit_depth - 8
++ vqrshrn.u16 d10, q10, #\bit_depth - 8
++ vqrshrn.u16 d11, q11, #\bit_depth - 8
++ vqrshrn.u16 d12, q12, #\bit_depth - 8
++ vqrshrn.u16 d13, q13, #\bit_depth - 8
++ vqrshrn.u16 d14, q14, #\bit_depth - 8
++ vqrshrn.u16 d15, q15, #\bit_depth - 8
++ vstm r0!, {q0-q7}
++ bne 1b
++ vpop {q4-q7}
++ bx lr
++.endm
++
++function ff_rpi_sand128b_stripe_to_8_10, export=1
++ stripe2_to_8 10
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24 L
++ ldr r3, [sp, #24] @ src_stride2 (stack args now start at sp+24)
++ ldr r6, [sp, #36] @ _w
++ ldr r7, [sp, #32] @ y
++ lsl r3, #7 @ r3 = stride2 * 128: step added to src after each 128-byte block
++ sub r1, r6 @ r1 = dst_stride - w: added to dest at the end of each row
++ add r8, r2, r7, lsl #7 @ r8 = src + y * 128: start of the current row
++ ldr r7, [sp, #40] @ h: row counter
++
++10: @ per-row setup
++ mov r2, r8
++ add r4, r0, #24 @ NOTE(review): r4 is not read anywhere else in this function
++ mov r5, r6 @ r5 = bytes still to write in this row
++ mov lr, #0 @ NOTE(review): lr is not read anywhere else in this function
++1: @ copy 128 bytes per iteration
++ vldm r2, {q8-q15}
++ add r2, r3
++ subs r5, #128 @ negative => fewer than 128 bytes left, take the partial path
++ blt 2f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++ vst1.8 {d28, d29, d30, d31}, [r0]!
++ bne 1b @ flags still from subs above: zero => row finished exactly
++11: @ end of row
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #64-128 @ r5 = remaining - 128 here, so this tests remaining >= 64
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ beq 11b @ exactly 64 remained: done
++ vmov q8, q12 @ shift the unwritten data down to q8-q11
++ vmov q9, q13
++ sub r5, #64
++ vmov q10, q14
++ vmov q11, q15
++1:
++ cmp r5, #32-128 @ remaining >= 32 ?
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ beq 11b
++ vmov q8, q10
++ sub r5, #32
++ vmov q9, q11
++1:
++ cmp r5, #16-128 @ remaining >= 16 ?
++ blt 1f
++ vst1.8 {d16, d17}, [r0]!
++ beq 11b
++ sub r5, #16
++ vmov q8, q9
++1:
++ cmp r5, #8-128 @ remaining >= 8 ?
++ blt 1f
++ vst1.8 {d16}, [r0]!
++ beq 11b
++ sub r5, #8
++ vmov d16, d17
++1:
++ cmp r5, #4-128 @ remaining >= 4 ?
++ blt 1f
++ vst1.32 {d16[0]}, [r0]!
++ beq 11b
++ sub r5, #4
++ vshr.u64 d16, #32 @ bring the next 4 bytes down to lane 0
++1:
++ cmp r5, #2-128 @ remaining >= 2 ?
++ blt 1f
++ vst1.16 {d16[0]}, [r0]!
++ beq 11b
++ vst1.8 {d16[2]}, [r0]! @ remaining was 3: write the third byte
++ b 11b
++1:
++ vst1.8 {d16[0]}, [r0]! @ remaining was 1
++ b 11b
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_c8(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r12, r6
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ push {r4-r8, lr} @ +24
++
++ ldr r5, [sp, #24] @ src
++ ldr r8, [sp, #32] @ stride2
++ ldr r7, [sp, #40] @ y
++ ldr r6, [sp, #44] @ _w
++ lsl r8, #7 @ r8 = stride2 * 128: step added to src after each 128-byte block
++ add r5, r5, r7, lsl #7 @ r5 = src + y * 128: start of the current row
++ sub r1, r1, r6 @ r1 = dst_stride_u - w: added to dst_u at the end of each row
++ sub r3, r3, r6 @ r3 = dst_stride_v - w: added to dst_v at the end of each row
++ ldr r7, [sp, #48] @ h: row counter
++ vpush {q4-q7} @ after the stack-argument loads, so the offsets above stay valid
++
++10: @ per-row setup
++ mov r4, r5
++ mov r12, r6 @ r12 = bytes still to write to each of dst_u and dst_v
++1: @ up to 2 x 128 source bytes -> up to 128 bytes for each destination
++ subs r12, #64
++ vldm r4, {q0-q7}
++ add r4, r8
++ it gt
++ vldmgt r4, {q8-q15} @ second block is only loaded when more than 64 outputs remain
++ add r4, r8
++
++ vuzp.8 q0, q1 @ de-interleave: even bytes to the first reg, odd bytes to the second
++ vuzp.8 q2, q3
++ vuzp.8 q4, q5
++ vuzp.8 q6, q7
++
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vuzp.8 q12, q13
++ vuzp.8 q14, q15
++ subs r12, #64 @ negative => partial tail; these flags also drive the bne below
++
++ @ Rearrange regs so we can use vst1 with 4 regs
++ vswp q1, q2
++ vswp q5, q6
++ vswp q9, q10
++ vswp q13, q14
++ blt 2f
++
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ vst1.8 {d20, d21, d22, d23}, [r2]!
++ vst1.8 {d28, d29, d30, d31}, [r2]!
++ bne 1b
++11: @ end of row
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++ vpop {q4-q7}
++ pop {r4-r8,pc}
++
++2: @ partial final write: r12 = remaining - 128
++ cmp r12, #64-128 @ remaining >= 64 ?
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ beq 11b
++ sub r12, #64
++ vmov q0, q8 @ shift the data from the second block down to q0-q7
++ vmov q1, q9
++ vmov q2, q10
++ vmov q3, q11
++ vmov q4, q12
++ vmov q5, q13
++ vmov q6, q14
++ vmov q7, q15
++1:
++ cmp r12, #32-128 @ remaining >= 32 ?
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ beq 11b
++ sub r12, #32
++ vmov q0, q4
++ vmov q1, q5
++ vmov q2, q6
++ vmov q3, q7
++1:
++ cmp r12, #16-128 @ remaining >= 16 ?
++ blt 1f
++ vst1.8 {d0, d1 }, [r0]!
++ vst1.8 {d4, d5 }, [r2]!
++ beq 11b
++ sub r12, #16
++ vmov q0, q1
++ vmov q2, q3
++1:
++ cmp r12, #8-128 @ remaining >= 8 ?
++ blt 1f
++ vst1.8 {d0}, [r0]!
++ vst1.8 {d4}, [r2]!
++ beq 11b
++ sub r12, #8
++ vmov d0, d1
++ vmov d4, d5
++1:
++ cmp r12, #4-128 @ remaining >= 4 ?
++ blt 1f
++ vst1.32 {d0[0]}, [r0]!
++ vst1.32 {d4[0]}, [r2]!
++ beq 11b
++ sub r12, #4
++ vmov s0, s1 @ bring the next 4 bytes down to lane 0
++ vmov s8, s9
++1:
++ cmp r12, #2-128 @ remaining >= 2 ?
++ blt 1f
++ vst1.16 {d0[0]}, [r0]!
++ vst1.16 {d4[0]}, [r2]!
++ beq 11b
++ vst1.8 {d0[2]}, [r0]! @ remaining was 3: write the third byte
++ vst1.8 {d4[2]}, [r2]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]! @ remaining was 1
++ vst1.8 {d4[0]}, [r2]!
++ b 11b
++endfunc
++
++
++
++@ void ff_rpi_sand30_lines_to_planar_y16(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24] @ src_stride2
++ ldr r6, [sp, #36] @ _w (in pixels)
++ ldr r7, [sp, #32] @ y
++ mov r12, #48 @ post-increment used by the paired r0/r4 stores
++ sub r3, #1
++ lsl r3, #7 @ r3 = (stride2 - 1) * 128: extra step once 128 bytes have been read
++ sub r1, r1, r6, lsl #1 @ r1 = dst_stride - 2 * w: added to dest at the end of each row
++ add r8, r2, r7, lsl #7 @ r8 = src + y * 128: start of the current row
++ ldr r7, [sp, #40] @ h: row counter
++
++10: @ per-row setup
++ mov r2, r8
++ add r4, r0, #24 @ second store pointer, 24 bytes ahead of r0
++ mov r5, r6 @ r5 = pixels still to write in this row
++ mov lr, #0 @ lr counts source bytes consumed, modulo 128
++1: @ 64 source bytes = 16 words x 3 samples = 48 pixels per iteration
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20!
++ ands lr, #127 @ zero after every second 64-byte load
++ vshrn.u32 d2, q10, #10 @ sample at bits 19:10 (upper bits cleared by vbic below)
++ vmovn.u32 d0, q10 @ sample at bits 9:0 (upper bits cleared by vbic below)
++
++ vshrn.u32 d5, q11, #14
++ it eq
++ addeq r2, r3 @ 128 bytes consumed: step src by the remaining (stride2 - 1) * 128
++ vshrn.u32 d3, q11, #10
++ vmovn.u32 d1, q11
++
++ subs r5, #48 @ negative => partial tail; these flags also drive the bne below
++ vshr.u16 q2, #6 @ completes the >> 20 for the sample at bits 29:20
++ vbic.u16 q0, #0xfc00 @ keep the low 10 bits
++ vbic.u16 q1, #0xfc00
++
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #10
++ vmovn.u32 d16, q12
++
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #10
++ vmovn.u32 d17, q13
++
++ vshr.u16 q10, #6
++ vbic.u16 q8, #0xfc00
++ vbic.u16 q9 , #0xfc00
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12 @ 24 bytes at r0, then r0 += 48
++ vst3.16 {d1, d3, d5}, [r4], r12 @ 24 bytes at r4, then r4 += 48
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11: @ end of row
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48 @ r5 = remaining - 48 here, so this tests remaining >= 24
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12 @ r0 += 48 covers both 24-byte stores
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8 @ shift the second group of 24 pixels down
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48 @ remaining >= 12 ?
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48 @ remaining >= 6 ?
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12 @ neither store used writeback: advance past both (2 x 6 bytes)
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48 @ remaining >= 3 ?
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16 @ bring lane 1 down to lane 0
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48 @ remaining >= 2 ?
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]! @ remaining was 1
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_c16(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r6, r9
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ push {r4-r10, lr} @ +32
++ ldr r5, [sp, #32] @ src
++ ldr r8, [sp, #40] @ stride2
++ ldr r7, [sp, #48] @ y
++ ldr r9, [sp, #52] @ _w (in samples per destination plane)
++ mov r12, #48 @ post-increment used by the paired stores
++ sub r8, #1
++ lsl r8, #7 @ r8 = (stride2 - 1) * 128: extra step once 128 bytes have been read
++ add r5, r5, r7, lsl #7 @ r5 = src + y * 128: start of the current row
++ sub r1, r1, r9, lsl #1 @ r1 = dst_stride_u - 2 * w: added to dst_u at the end of each row
++ sub r3, r3, r9, lsl #1 @ r3 = dst_stride_v - 2 * w: added to dst_v at the end of each row
++ ldr r7, [sp, #56] @ h: row counter
++10: @ per-row setup
++ mov lr, #0 @ lr counts source bytes consumed, modulo 128
++ mov r4, r5
++ mov r6, r9 @ r6 = samples still to write to each plane in this row
++1: @ 64 source bytes = 48 samples = 24 for each destination per iteration
++ vldm r4!, {q0-q3}
++ add lr, #64
++
++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
++ vshrn.u32 d20, q0, #14
++ vmovn.u32 d18, q0
++ vshrn.u32 d0, q0, #10
++ ands lr, #127 @ zero after every second 64-byte load
++
++ vshrn.u32 d21, q1, #14
++ vmovn.u32 d19, q1
++ vshrn.u32 d1, q1, #10
++
++ vshrn.u32 d22, q2, #10
++ vmovn.u32 d2, q2
++ vshrn.u32 d4, q2, #14
++
++ add r10, r0, #24 @ second dst_u store pointer, 24 bytes ahead of r0
++ vshrn.u32 d23, q3, #10
++ vmovn.u32 d3, q3
++ vshrn.u32 d5, q3, #14
++
++ it eq
++ addeq r4, r8 @ 128 bytes consumed: step src by the remaining (stride2 - 1) * 128
++ vuzp.16 q0, q11
++ vuzp.16 q9, q1
++ vuzp.16 q10, q2
++
++ @ q0 V0, V3,..
++ @ q9 U0, U3...
++ @ q10 U1, U4...
++ @ q11 U2, U5,..
++ @ q1 V1, V4,
++ @ q2 V2, V5,..
++
++ subs r6, #24 @ negative => partial tail; these flags also drive the bne below
++ vbic.u16 q11, #0xfc00 @ keep the low 10 bits
++ vbic.u16 q9, #0xfc00
++ vshr.u16 q10, #6 @ completes the >> 20 for values taken with vshrn #14
++ vshr.u16 q2, #6
++ vbic.u16 q0, #0xfc00
++ vbic.u16 q1, #0xfc00
++
++ blt 2f
++
++ vst3.16 {d18, d20, d22}, [r0], r12 @ 24 bytes at r0, then r0 += 48
++ vst3.16 {d19, d21, d23}, [r10]
++ add r10, r2, #24 @ reuse r10 as the second dst_v store pointer
++ vst3.16 {d0, d2, d4}, [r2], r12
++ vst3.16 {d1, d3, d5}, [r10]
++
++ bne 1b
++
++11: @ end of row
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++
++ pop {r4-r10, pc}
++
++@ Partial final write
++2:
++ cmp r6, #-12 @ r6 = remaining - 24 here, so this tests remaining >= 12
++ blt 1f
++ vst3.16 {d18, d20, d22}, [r0]!
++ vst3.16 {d0, d2, d4}, [r2]!
++ beq 11b
++ vmov d18, d19 @ shift the second group of 12 samples down
++ vmov d20, d21
++ vmov d22, d23
++ sub r6, #12
++ vmov d0, d1
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r6, #-18 @ remaining >= 6 ? (flags are consumed by the blt after the vzips)
++ @ Rezip here as it makes the remaining tail handling easier
++ vzip.16 d0, d18 @ after this odd lanes come from the dst_u regs, even lanes from the dst_v regs
++ vzip.16 d2, d20
++ vzip.16 d4, d22
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]!
++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]!
++ beq 11b
++ vmov d0, d18
++ vmov d2, d20
++ sub r6, #6
++ vmov d4, d22
++1:
++ cmp r6, #-21 @ remaining >= 3 ?
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ beq 11b
++ vmov s4, s5
++ sub r6, #3
++ vmov s0, s1
++1:
++ cmp r6, #-22 @ remaining >= 2 ?
++ blt 1f
++ vst2.16 {d0[1], d2[1]}, [r0]!
++ vst2.16 {d0[0], d2[0]}, [r2]!
++ b 11b
++1:
++ vst1.16 {d0[1]}, [r0]! @ remaining was 1
++ vst1.16 {d0[0]}, [r2]!
++ b 11b
++
++endfunc
++
++@ void ff_rpi_sand30_lines_to_planar_p010(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_p010, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24] @ src_stride2
++ ldr r6, [sp, #36] @ _w (in pixels)
++ ldr r7, [sp, #32] @ y
++ mov r12, #48 @ post-increment used by the paired r0/r4 stores
++ vmov.u16 q15, #0xffc0 @ mask keeping bits 15:6 of each 16-bit output
++ sub r3, #1
++ lsl r3, #7 @ r3 = (stride2 - 1) * 128: extra step once 128 bytes have been read
++ sub r1, r1, r6, lsl #1 @ r1 = dst_stride - 2 * w: added to dest at the end of each row
++ add r8, r2, r7, lsl #7 @ r8 = src + y * 128: start of the current row
++ ldr r7, [sp, #40] @ h: row counter
++
++10: @ per-row setup
++ mov r2, r8
++ add r4, r0, #24 @ second store pointer, 24 bytes ahead of r0
++ mov r5, r6 @ r5 = pixels still to write in this row
++ mov lr, #0 @ lr counts source bytes consumed, modulo 128
++1: @ 64 source bytes = 16 words x 3 samples = 48 pixels per iteration
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshl.u32 q14, q10, #6 @ sample at bits 9:0 moved up to bits 15:6
++ ands lr, #127 @ zero after every second 64-byte load
++ vshrn.u32 d4, q10, #14 @ sample at bits 29:20 lands in bits 15:6
++ vshrn.u32 d2, q10, #4 @ sample at bits 19:10 lands in bits 15:6
++ vmovn.u32 d0, q14
++
++ vshl.u32 q14, q11, #6
++ it eq
++ addeq r2, r3 @ 128 bytes consumed: step src by the remaining (stride2 - 1) * 128
++ vshrn.u32 d5, q11, #14
++ vshrn.u32 d3, q11, #4
++ vmovn.u32 d1, q14
++
++ subs r5, #48 @ negative => partial tail; these flags also drive the bne below
++ vand q2, q15 @ clear the low 6 bits left over from neighbouring samples
++ vand q1, q15
++ vand q0, q15
++
++ vshl.u32 q14, q12, #6
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #4
++ vmovn.u32 d16, q14
++
++ vshl.u32 q14, q13, #6
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #4
++ vmovn.u32 d17, q14
++
++ vand q10, q15
++ vand q9, q15
++ vand q8, q15
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12 @ 24 bytes at r0, then r0 += 48
++ vst3.16 {d1, d3, d5}, [r4], r12 @ 24 bytes at r4, then r4 += 48
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11: @ end of row
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48 @ r5 = remaining - 48 here, so this tests remaining >= 24
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12 @ r0 += 48 covers both 24-byte stores
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8 @ shift the second group of 24 pixels down
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48 @ remaining >= 12 ?
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48 @ remaining >= 6 ?
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12 @ neither store used writeback: advance past both (2 x 6 bytes)
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48 @ remaining >= 3 ?
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16 @ bring lane 1 down to lane 0
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48 @ remaining >= 2 ?
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]! @ remaining was 1
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24] @ src_stride2
++ ldr r6, [sp, #36] @ _w (in pixels = output bytes)
++ ldr r7, [sp, #32] @ y
++ mov r12, #48 @ post-increment used by the paired r0/r4 stores
++ lsl r3, #7 @ r3 = stride2 * 128: step added to src after each 128-byte block
++ sub r1, r1, r6 @ r1 = dst_stride - w: added to dest at the end of each row
++ add r8, r2, r7, lsl #7 @ r8 = src + y * 128: start of the current row
++ ldr r7, [sp, #40] @ h: row counter
++
++10: @ per-row setup
++ mov r2, r8
++ add r4, r0, #24 @ second store pointer, 24 bytes ahead of r0
++ mov r5, r6 @ r5 = pixels still to write in this row
++1: @ 128 source bytes = 32 words x 3 samples = 96 pixels per iteration
++ vldm r2, {q8-q15}
++
++ subs r5, #96 @ negative => partial tail; these flags also drive the bne below
++
++ vmovn.u32 d0, q8 @ word bits 15:0
++ vshrn.u32 d2, q8, #12 @ word bits 27:12
++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
++
++ add r2, r3
++
++ vmovn.u32 d1, q9
++ vshrn.u32 d3, q9, #12
++ vshrn.u32 d5, q9, #16
++
++ pld [r2, #0]
++
++ vshrn.u16 d0, q0, #2 @ word bits 9:2 = top 8 bits of the first sample
++ vmovn.u16 d1, q1 @ word bits 19:12 = top 8 bits of the second sample
++ vshrn.u16 d2, q2, #6 @ word bits 29:22 = top 8 bits of the third sample
++
++ vmovn.u32 d16, q10
++ vshrn.u32 d18, q10, #12
++ vshrn.u32 d20, q10, #16
++
++ vmovn.u32 d17, q11
++ vshrn.u32 d19, q11, #12
++ vshrn.u32 d21, q11, #16
++
++ pld [r2, #64]
++
++ vshrn.u16 d4, q8, #2
++ vmovn.u16 d5, q9
++ vshrn.u16 d6, q10, #6
++
++ vmovn.u32 d16, q12
++ vshrn.u32 d18, q12, #12
++ vshrn.u32 d20, q12, #16
++
++ vmovn.u32 d17, q13
++ vshrn.u32 d19, q13, #12
++ vshrn.u32 d21, q13, #16
++
++ vshrn.u16 d16, q8, #2
++ vmovn.u16 d17, q9
++ vshrn.u16 d18, q10, #6
++
++ vmovn.u32 d20, q14
++ vshrn.u32 d22, q14, #12
++ vshrn.u32 d24, q14, #16
++
++ vmovn.u32 d21, q15
++ vshrn.u32 d23, q15, #12
++ vshrn.u32 d25, q15, #16
++
++ vshrn.u16 d20, q10, #2
++ vmovn.u16 d21, q11
++ vshrn.u16 d22, q12, #6
++
++ blt 2f
++
++ vst3.8 {d0, d1, d2}, [r0], r12 @ 24 bytes at r0, then r0 += 48
++ vst3.8 {d4, d5, d6}, [r4], r12 @ 24 bytes at r4, then r4 += 48
++ vst3.8 {d16, d17, d18}, [r0], r12
++ vst3.8 {d20, d21, d22}, [r4], r12
++
++ bne 1b
++
++11: @ end of row: r0 must have advanced by exactly w bytes here
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #48-96 @ r5 = remaining - 96 here, so this tests remaining >= 48
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0], r12 @ r0 += 48 covers both 24-byte stores
++ vst3.8 {d4, d5, d6}, [r4], r12
++ beq 11b
++ vmov q0, q8 @ shift the third and fourth groups of 24 pixels down
++ vmov q2, q10
++ sub r5, #48
++ vmov d2, d18
++ vmov d6, d22
++1:
++ cmp r5, #24-96 @ remaining >= 24 ?
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0]!
++ beq 11b
++ vmov q0, q2
++ sub r5, #24
++ vmov d2, d6
++1:
++ cmp r5, #12-96 @ remaining >= 12 ?
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]! @ each single-lane store writes 3 bytes
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]!
++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]!
++ beq 11b
++ vmov s0, s1
++ sub r5, #12
++ vmov s2, s3
++ vmov s4, s5
++1:
++ cmp r5, #6-96 @ remaining >= 6 ?
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ @ no extra pointer fix-up: the two writebacks above already advanced r0 by 6
++ beq 11b
++ vshr.u32 d0, #16 @ bring lane 2 down to lane 0
++ sub r5, #6
++ vshr.u32 d1, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #3-96 @ remaining >= 3 ?
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #8 @ bring lane 1 down to lane 0
++ vshr.u32 d1, #8
++1:
++ cmp r5, #2-96 @ remaining >= 2 ?
++ blt 1f
++ vst2.8 {d0[0], d1[0]}, [r0]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]! @ remaining was 1
++ b 11b
++
++endfunc
++
++
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.h
+@@ -0,0 +1,110 @@
++/*
++Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_ARM_SAND_NEON_H
++#define AVUTIL_ARM_SAND_NEON_H
++
++void ff_rpi_sand128b_stripe_to_8_10(
++ uint8_t * dest, // [r0]
++ const uint8_t * src1, // [r1]
++ const uint8_t * src2, // [r2]
++ unsigned int lines); // [r3]
++
++void ff_rpi_sand8_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand8_lines_to_planar_c8(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r12, r6
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y16(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_c16(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r6, r9
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_p010(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++#endif // AVUTIL_ARM_SAND_NEON_H
++
+--- a/libavutil/frame.c
++++ b/libavutil/frame.c
+@@ -16,6 +16,8 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -26,6 +28,9 @@
+ #include "mem.h"
+ #include "samplefmt.h"
+ #include "hwcontext.h"
++#if CONFIG_SAND
++#include "rpi_sand_fns.h"
++#endif
+
+ #if FF_API_FRAME_GET_SET
+ MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
+@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *fra
+ (frame->crop_top + frame->crop_bottom) >= frame->height)
+ return AVERROR(ERANGE);
+
++#if CONFIG_SAND
++ // Sand cannot be cropped - do not try
++ if (av_rpi_is_sand_format(frame->format))
++ return 0;
++#endif
++
+ desc = av_pix_fmt_desc_get(frame->format);
+ if (!desc)
+ return AVERROR_BUG;
+--- a/libavutil/frame.h
++++ b/libavutil/frame.h
+@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *fra
+ */
+ const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
++// Frame width / height with the crop_left + crop_right / crop_top + crop_bottom margins subtracted
++static inline int av_frame_cropped_width(const AVFrame * const frame)
++{
++ return frame->width - (frame->crop_left + frame->crop_right);
++}
++static inline int av_frame_cropped_height(const AVFrame * const frame)
++{
++ return frame->height - (frame->crop_top + frame->crop_bottom);
++}
++
+ /**
+ * @}
+ */
+--- a/libavutil/hwcontext_drm.c
++++ b/libavutil/hwcontext_drm.c
+@@ -19,8 +19,10 @@
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+
+ #include <drm.h>
++#include <libdrm/drm_fourcc.h>
+ #include <xf86drm.h>
+
+ #include "avassert.h"
+@@ -28,6 +30,11 @@
+ #include "hwcontext_drm.h"
+ #include "hwcontext_internal.h"
+ #include "imgutils.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
+
+
+ static void drm_device_free(AVHWDeviceContext *hwdev)
+@@ -43,6 +50,11 @@ static int drm_device_create(AVHWDeviceC
+ AVDRMDeviceContext *hwctx = hwdev->hwctx;
+ drmVersionPtr version;
+
++ if (device == NULL) {
++ hwctx->fd = -1;
++ return 0;
++ }
++
+ hwctx->fd = open(device, O_RDWR);
+ if (hwctx->fd < 0)
+ return AVERROR(errno);
+@@ -85,18 +97,37 @@ static int drm_get_buffer(AVHWFramesCont
+ typedef struct DRMMapping {
+ // Address and length of each mmap()ed region.
+ int nb_regions;
++ unsigned int dmaflags;
+ void *address[AV_DRM_MAX_PLANES];
+ size_t length[AV_DRM_MAX_PLANES];
++ int fds[AV_DRM_MAX_PLANES];
+ } DRMMapping;
+
++/* Issue DMA_BUF_IOCTL_SYNC with the given flags on fd, retrying while the ioctl fails with EINTR. */
++/* Returns 0 on success, or the negated errno after logging a warning on any other failure. */
++static int dmasync(const int fd, const unsigned int flags)
++{
++ struct dma_buf_sync sync = { .flags = flags };
++ while (ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
++ const int err = errno; /* captured once so later calls cannot disturb it */
++ if (err == EINTR)
++ continue;
++ av_log(NULL, AV_LOG_WARNING, "%s: ioctl failed: flags=%#x, errno=%d\n", __func__, flags, err);
++ return -err;
++ }
++ return 0;
++}
++
+ static void drm_unmap_frame(AVHWFramesContext *hwfc,
+ HWMapDescriptor *hwmap)
+ {
+ DRMMapping *map = hwmap->priv;
+ int i;
+
+- for (i = 0; i < map->nb_regions; i++)
++ for (i = 0; i < map->nb_regions; i++) {
+ munmap(map->address[i], map->length[i]);
++ dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
++ }
+
+ av_free(map);
+ }
+@@ -114,15 +145,28 @@ static int drm_map_frame(AVHWFramesConte
+ if (!map)
+ return AVERROR(ENOMEM);
+
++ for (i = 0; i < AV_DRM_MAX_PLANES; i++)
++ map->fds[i] = -1;
++
+ mmap_prot = 0;
+- if (flags & AV_HWFRAME_MAP_READ)
++ if (flags & AV_HWFRAME_MAP_READ) {
++ map->dmaflags |= DMA_BUF_SYNC_READ;
+ mmap_prot |= PROT_READ;
+- if (flags & AV_HWFRAME_MAP_WRITE)
++ }
++ if (flags & AV_HWFRAME_MAP_WRITE) {
++ map->dmaflags |= DMA_BUF_SYNC_WRITE;
+ mmap_prot |= PROT_WRITE;
++ }
++
++ if (dst->format == AV_PIX_FMT_NONE)
++ dst->format = hwfc->sw_format;
+
+ av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES);
+ for (i = 0; i < desc->nb_objects; i++) {
+- addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED,
++ dmasync(desc->objects[i].fd, DMA_BUF_SYNC_START | map->dmaflags);
++ map->fds[i] = desc->objects[i].fd;
++
++ addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED | MAP_POPULATE,
+ desc->objects[i].fd, 0);
+ if (addr == MAP_FAILED) {
+ err = AVERROR(errno);
+@@ -151,6 +195,23 @@ static int drm_map_frame(AVHWFramesConte
+
+ dst->width = src->width;
+ dst->height = src->height;
++ dst->crop_top = src->crop_top;
++ dst->crop_bottom = src->crop_bottom;
++ dst->crop_left = src->crop_left;
++ dst->crop_right = src->crop_right;
++
++#if CONFIG_SAND
++ // Rework for sand frames
++ if (av_rpi_is_sand_frame(dst)) {
++ // As it stands the sand formats hold stride2 in linesize[3]
++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do
++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
++ dst->linesize[0] = 128;
++ dst->linesize[1] = 128;
++ // *** Are we sure src->height is actually what we want ???
++ }
++#endif
+
+ err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
+ &drm_unmap_frame, map);
+@@ -160,7 +221,9 @@ static int drm_map_frame(AVHWFramesConte
+ return 0;
+
+ fail:
+- for (i = 0; i < desc->nb_objects; i++) {
++ for (i = 0; i < AV_DRM_MAX_PLANES; i++) {
++ if (map->fds[i] != -1)
++ dmasync(map->fds[i], DMA_BUF_SYNC_END | map->dmaflags);
+ if (map->address[i])
+ munmap(map->address[i], map->length[i]);
+ }
+@@ -172,16 +235,29 @@ static int drm_transfer_get_formats(AVHW
+ enum AVHWFrameTransferDirection dir,
+ enum AVPixelFormat **formats)
+ {
+- enum AVPixelFormat *pix_fmts;
++ enum AVPixelFormat *p;
+
+- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
+- if (!pix_fmts)
++ p = *formats = av_malloc_array(3, sizeof(*p));
++ if (!p)
+ return AVERROR(ENOMEM);
+
+- pix_fmts[0] = ctx->sw_format;
+- pix_fmts[1] = AV_PIX_FMT_NONE;
++ // **** Offer native sand too ????
++ *p++ =
++#if CONFIG_SAND
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
++ AV_PIX_FMT_YUV420P :
++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
++ AV_PIX_FMT_YUV420P10LE :
++#endif
++ ctx->sw_format;
++
++#if CONFIG_SAND
++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
++ *p++ = AV_PIX_FMT_NV12;
++#endif
+
+- *formats = pix_fmts;
++ *p = AV_PIX_FMT_NONE;
+ return 0;
+ }
+
+@@ -197,18 +273,63 @@ static int drm_transfer_data_from(AVHWFr
+ map = av_frame_alloc();
+ if (!map)
+ return AVERROR(ENOMEM);
+- map->format = dst->format;
+
++ // Map to default
++ map->format = AV_PIX_FMT_NONE;
+ err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
+ if (err)
+ goto fail;
+
+- map->width = dst->width;
+- map->height = dst->height;
++#if 0
++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
++ map->hwfc_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
++ map->width, map->height,
++ map->linesize[0],
++ map->linesize[1],
++ map->linesize[2],
++ map->linesize[3],
++ dst->width, dst->height,
++ dst->linesize[0],
++ dst->linesize[1],
++ dst->linesize[2]);
++#endif
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(map)) {
++ // Preserve crop - later ffmpeg code assumes that we have in that it
++ // overwrites any crop that we create with the old values
++ unsigned int stride2 = map->linesize[3];
++ const unsigned int w = FFMIN(dst->width, map->width);
++ const unsigned int h = FFMIN(dst->height, map->height);
++
++ map->crop_top = 0;
++ map->crop_bottom = 0;
++ map->crop_left = 0;
++ map->crop_right = 0;
++
++ if (av_rpi_sand_to_planar_frame(dst, map) != 0)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
++ err = AVERROR(EINVAL);
++ goto fail;
++ }
++
++ dst->width = w;
++ dst->height = h;
++ }
++ else
++#endif
++ {
++ // Kludge mapped h/w s.t. frame_copy works
++ map->width = dst->width;
++ map->height = dst->height;
++ err = av_frame_copy(dst, map);
++ }
+
+- err = av_frame_copy(dst, map);
+ if (err)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
+ goto fail;
++ }
+
+ err = 0;
+ fail:
+@@ -223,7 +344,10 @@ static int drm_transfer_data_to(AVHWFram
+ int err;
+
+ if (src->width > hwfc->width || src->height > hwfc->height)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height); /* log the values the check above rejected */
+ return AVERROR(EINVAL);
++ }
+
+ map = av_frame_alloc();
+ if (!map)
+--- a/libavutil/pixdesc.c
++++ b/libavutil/pixdesc.c
+@@ -2371,6 +2371,50 @@ static const AVPixFmtDescriptor av_pix_f
+ .name = "vulkan",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
++ [AV_PIX_FMT_SAND128] = {
++ .name = "sand128",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_10] = {
++ .name = "sand64_10",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
++ { 1, 4, 0, 0, 10, 3, 9, 1 }, /* U */
++ { 1, 4, 2, 0, 10, 3, 9, 3 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_16] = {
++ .name = "sand64_16",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 16, 0, 15, 1 }, /* Y */
++ { 1, 4, 0, 0, 16, 3, 15, 1 }, /* U */
++ { 1, 4, 2, 0, 16, 3, 15, 3 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_RPI4_8] = {
++ .name = "rpi4_8",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
++ [AV_PIX_FMT_RPI4_10] = {
++ .name = "rpi4_10",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
+ };
+ #if FF_API_PLUS1_MINUS1
+ FF_ENABLE_DEPRECATION_WARNINGS
+--- a/libavutil/pixfmt.h
++++ b/libavutil/pixfmt.h
+@@ -357,6 +357,12 @@ enum AVPixelFormat {
+
+ AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
+ AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
++// RPI - not on ifdef so can be got at by calling progs
++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_RPI4_8,
++ AV_PIX_FMT_RPI4_10,
+
+ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+ };
+--- /dev/null
++++ b/libavutil/rpi_sand_fn_pw.h
+@@ -0,0 +1,227 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++// * Included twice from rpi_sand_fns.c with different PW (1 => 8-bit fns, 2 => 16-bit fns)
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++// Copies a w x h byte patch from striped (sand) src to linear dst - offscreen fixup not done here
++// w <= stride1 per the original note, though the C two+ stripe path below also loops over whole stripes
++// unclipped; x & w in bytes, strides in bytes; the mask maths assumes stride1 is a power of 2
++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x;
++ const unsigned int w = _w;
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) { // Asm helper is only used for patches that start at x == 0
++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe - each row is a single contiguous copy
++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; // Stripe n starts n * stride1 * stride2 bytes into src
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
++ memcpy(dst, p, w);
++ }
++ }
++ else
++ {
++ // Two+ stripe - each row is split into a leading part (w1), whole stripes (w2) and a trailing part (w3)
++ const unsigned int sstride = stride1 * stride2; // Bytes from a row in one stripe to the same row in the next
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask); // Start of the same row in the following stripe
++ const unsigned int w1 = stride1 - (x & mask); // Bytes left in the first stripe row
++ const unsigned int w3 = (x + w) & mask; // Bytes wanted from the final stripe row (may be 0)
++ const unsigned int w2 = w - (w1 + w3); // Bytes in the whole stripes between
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const uint8_t * p = p2;
++ uint8_t * d = dst;
++ memcpy(d, p1, w1);
++ d += w1;
++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
++ memcpy(d, p, stride1);
++ }
++ memcpy(d, p, w3);
++ }
++ }
++}
++
++// Splits interleaved U&V from sand src into separate dst_u / dst_v; x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
++
++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2; // Byte offset within the interleaved src
++ const unsigned int w = _w * 2; // Byte width within the interleaved src
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) { // Asm helper is only used for patches that start at x == 0
++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ const pixel * p = (const pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) { // One U/V pair (2 * PW bytes of src) per iteration
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe - leading part (w1), whole stripes (w2), trailing part (w3)
++ const unsigned int sstride = stride1 * stride2; // Bytes from a row in one stripe to the same row in the next
++ const unsigned int sstride_p = (sstride - stride1) / PW; // End of a stripe row to the start of that row in the next stripe, in pixels
++
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask); // Start of the same row in the following stripe
++ const unsigned int w1 = stride1 - (x & mask); // Bytes left in the first stripe row
++ const unsigned int w3 = (x + w) & mask; // Bytes wanted from the final stripe row (may be 0)
++ const unsigned int w2 = w - (w1 + w3); // Bytes in the whole stripes between
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * p = (const pixel *)p1;
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) { // p was left at the start of the final stripe row by the loop above
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++}
++
++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{ // Interleaves separate src_u / src_v rows into the striped (sand) U&V buffer dst_c
++ const unsigned int x = _x * 2; // Byte offset within the interleaved dst
++ const unsigned int w = _w * 2; // Byte width within the interleaved dst
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) { // One U/V pair (2 * PW bytes of dst) per iteration
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe - leading part (w1), whole stripes (w2), trailing part (w3)
++ const unsigned int sstride = stride1 * stride2; // Bytes from a row in one stripe to the same row in the next
++ const unsigned int sstride_p = (sstride - stride1) / PW; // End of a stripe row to the start of that row in the next stripe, in pixels
++
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; // Write pointer: not const, matching the single stripe path
++ uint8_t * p2 = p1 + sstride - (x & mask); // Start of the same row in the following stripe
++ const unsigned int w1 = stride1 - (x & mask); // Bytes left in the first stripe row
++ const unsigned int w3 = (x + w) & mask; // Bytes wanted in the final stripe row (may be 0)
++ const unsigned int w2 = w - (w1 + w3); // Bytes in the whole stripes between
++
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) { // p was left at the start of the final stripe row by the loop above
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++}
++
++
++#undef pixel
++#undef STRCAT
++#undef FUNC
++
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.c
+@@ -0,0 +1,445 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "config.h"
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++#include "frame.h"
++
++#if ARCH_ARM && HAVE_NEON
++#include "arm/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#elif ARCH_AARCH64 && HAVE_NEON
++#include "aarch64/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#else
++#define HAVE_SAND_ASM 0
++#endif
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if 1
++// Simple round: narrows n 16-bit samples to 8 bits, adding half an output LSB before shifting right by shr
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ const unsigned int rnd = (1 << shr) >> 1; // Half of the output LSB (0 when shr == 0)
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ *dst++ = (*src++ + rnd) >> shr; // No clamp: the result is truncated to 8 bits on store
++ }
++}
++#else
++// Dithered variation (compiled out by the #if 1 above): carries the discarded low bits of each sample into the next
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ unsigned int rnd = (1 << shr) >> 1; // Initial carry is half of the output LSB
++ const unsigned int mask = ((1 << shr) - 1); // The bits that get shifted out
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ rnd = *src++ + (rnd & mask); // Add the remainder left over from the previous sample
++ *dst++ = rnd >> shr;
++ }
++}
++#endif
++
++// Unpacks a patch of 10-bit samples, packed 3 per 32-bit word, into 16-bit linear dst - offscreen fixup not done here
++// w <= stride1
++// unclipped
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; // Samples to skip in the first word (_x % 3)
++ const unsigned int x1 = ((_x + _w) / 3) * 4; // Byte offset of the word holding the end position
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // Samples wanted from that last word ((_x + _w) % 3)
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) { // Asm helper is only used for patches that start at x == 0
++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer - not implemented: returns without writing anything to dst
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * d = (uint16_t *)dst;
++
++ if (xskip0 != 0) { // Leading partial word
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1) // Middle sample is only wanted when exactly one is skipped
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ while (x != x1) { // Whole words: 3 samples each, lowest bits first
++ const uint32_t p3 = *p++;
++ *d++ = p3 & 0x3ff;
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) { // Trailing partial word
++ const uint32_t p3 = *p;
++
++ *d++ = p3 & 0x3ff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 10) & 0x3ff;
++ }
++ }
++}
++
++
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{ // Unpacks interleaved 10-bit U&V (3 U/V pairs per two 32-bit words) into separate 16-bit dst_u / dst_v
++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word pair holding the first U/V pair
++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; // U/V pairs to skip in the first word pair (_x % 3)
++ const unsigned int x1 = ((_x + _w) / 3) * 8; // Byte offset of the word pair holding the end position
++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; // U/V pairs wanted from that last word pair ((_x + _w) % 3)
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) { // Asm helper is only used for patches that start at x == 0
++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer - not implemented: returns without writing anything to dst_u / dst_v
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * du = (uint16_t *)dst_u;
++ uint16_t * dv = (uint16_t *)dst_v;
++
++ if (xskip0 != 0) { // Leading partial word pair
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ if (xskip0 == 1) // Middle pair straddles the two words
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = (p3b >> 0) & 0x3ff;
++ }
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ while (x != x1) { // Whole word pairs: U0 V0 U1 | V1 U2 V2, lowest bits first
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) { // Trailing partial word pair
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++; // Read even when xrem1 == 1, where it is not used
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ if (xrem1 == 2)
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ }
++ }
++ }
++}
++
++// Unpacks a patch of 10-bit samples, packed 3 per 32-bit word, into 8-bit linear dst - offscreen fixup not done here
++// w <= stride1
++// Each sample is simply truncated to 8 bits by dropping its bottom 2 bits (no rounding)
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; // Samples to skip in the first word (_x % 3)
++ const unsigned int x1 = ((_x + _w) / 3) * 4; // Byte offset of the word holding the end position
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; // Samples wanted from that last word ((_x + _w) % 3)
++ const unsigned int mask = stride1 - 1; // Selects the byte offset within a stripe row
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) { // Asm helper is only used for patches that start at x == 0
++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer - not implemented: returns without writing anything to dst
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint8_t * d = dst;
++
++ if (xskip0 != 0) { // Leading partial word
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1) // Middle sample is only wanted when exactly one is skipped
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ while (x != x1) { // Whole words: top 8 bits of each of the 3 samples
++ const uint32_t p3 = *p++;
++ *d++ = (p3 >> 2) & 0xff;
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0) // Stepped off the end of a stripe row
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) { // Trailing partial word
++ const uint32_t p3 = *p;
++
++ *d++ = (p3 >> 2) & 0xff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 12) & 0xff;
++ }
++ }
++}
++
++
++
++// Narrows a 16-bit sand plane to an 8-bit sand plane, shifting each sample right by shr; w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr)
++{
++ const unsigned int n = dst_stride1 / 2; // Samples in one src stripe row == half a dest stripe row
++ unsigned int j;
++
++ // This is true for our current layouts
++ av_assert0(dst_stride1 == src_stride1);
++
++ // As we have the same stride1 for src & dest and src is wider than dest
++ // then if we loop on src we can always write contiguously to dest
++ // We make no effort to copy an exact width - round up to nearest src stripe
++ // as we will always have storage in dest for that
++
++#if ARCH_ARM && HAVE_NEON
++ if (shr == 3 && src_stride1 == 128) { // Asm helper is only used for this shift / stripe width combination
++ for (j = 0; j + n < w; j += dst_stride1) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
++ }
++ }
++ else
++#endif
++ {
++ for (j = 0; j + n < w; j += dst_stride1) { // One dest stripe per pass, filled from two adjacent src stripes
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2; // Src stripe for the left half of the dest stripe
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; // Next src stripe, for the right half
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ cpy16_to_8(d + n, s2, n, shr);
++ }
++ }
++ }
++
++ // Fix up a trailing dest half stripe (j is left over from whichever loop ran above)
++ if (j < w) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ }
++ }
++}
++
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
++{ // Converts the cropped area of sand frame src into planar dst; returns -1 if the format pair is unsupported
++ const int w = av_frame_cropped_width(src);
++ const int h = av_frame_cropped_height(src);
++ const int x = src->crop_left;
++ const int y = src->crop_top;
++
++ // We will crop as part of the conversion
++ dst->crop_top = 0;
++ dst->crop_left = 0;
++ dst->crop_bottom = 0;
++ dst->crop_right = 0;
++
++ switch (src->format){
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1], // Interleaved U&V copied as plain bytes with the y fn
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w, h/2); // NOTE(review): width is w bytes but the offset is x/2 bytes - looks inconsistent when crop_left != 0; confirm
++ break;
++ default:
++ return -1; // Unsupported dst format for an 8-bit sand src
++ }
++ break;
++ case AV_PIX_FMT_SAND64_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x*2, y, w*2, h); // x & w are passed in bytes, 2 bytes per sample
++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y/2, w, h/2); // (x/2 chroma pels) * 2 bytes == x, likewise w
++ break;
++ default:
++ return -1; // Unsupported dst format for a SAND64_10 src
++ }
++ break;
++ case AV_PIX_FMT_RPI4_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1], // Interleaved U&V unpacked as plain samples with the y fn
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w, h/2); // NOTE(review): width is w samples but the offset is x/2 samples - looks inconsistent when crop_left != 0; confirm
++ break;
++ default:
++ return -1; // Unsupported dst format for an RPI4_10 src
++ }
++ break;
++ default:
++ return -1; // src is not a sand format handled here (SAND64_16 has no case)
++ }
++
++ return av_frame_copy_props(dst, src);
++}
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.h
+@@ -0,0 +1,188 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_RPI_SAND_FNS
++#define AVUTIL_RPI_SAND_FNS
++
++#include "libavutil/frame.h"
++
++// For all these fns _x & _w are measured as coord * PW
++// For the C fns coords are in chroma pels (so luma / 2)
++// Strides are in bytes
++
++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr);
++
++
++// dst must contain required pixel format & allocated data buffers
++// Cropping on the src buffer will be honoured and dst crop will be set to zero
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++#ifdef RPI_ZC_SAND128_ONLY
++ // If we are sure we only support 128 byte sand formats replace the
++ // var with a constant which should allow for better optimisation
++ return 128;
++#else
++ return frame->linesize[0]; // stride1 is carried in linesize[0]
++#endif
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3]; // stride2 is carried in linesize[3]
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); // Range test: relies on the sand enum values being contiguous
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format); // Non-zero for any of the sand / rpi4 pixel formats
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); // The two 8-bit layouts
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); // Range test: relies on the SAND64 enum values being adjacent
++}
++
++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_RPI4_10); // The only format treated as sand30 here
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1; // Left shift applied to a pixel x by the off_y / off_c fns; NOTE(review): also 1 for sand30 frames - confirm that is intended
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out from the frame format
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); // Pixel x -> byte x
++ const unsigned int x1 = x & (stride1 - 1); // Byte offset within the stripe row
++ const unsigned int x2 = x ^ x1; // x with the in-stripe bits cleared
++
++ return x1 + stride1 * y + stride2 * x2; // Byte offset from the start of the plane
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); // Extra shift of 1 compared with off_y
++ const unsigned int x1 = x & (stride1 - 1); // Byte offset within the stripe row
++ const unsigned int x2 = x ^ x1; // x with the in-stripe bits cleared
++
++ return x1 + stride1 * y_c + stride2 * x2; // Byte offset from the start of the plane
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); // Address within data[0] of (x, y)
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); // Address within data[1] of (x, y)
++}
++
++#endif
++
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,59 @@
++Building Pi FFmpeg
++==================
++
++Currently only building on a Pi is supported.
++This builds ffmpeg the way I've tested it
++
++Get all dependencies - the current package dependencies are good enough
++
++$ sudo apt-get build-dep ffmpeg
++
++Configure using the pi-util/conf_native.sh script
++-------------------------------------------------
++
++This sets the normal release options and creates an output dir to build into
++The directory name will depend on system and options but will be under out/
++
++There are a few choices here
++ --mmal build including the legacy mmal-based decoders and zero-copy code
++ this requires appropriate libraries which currently will exist for
++ armv7 but not arm64
++ --noshared
++ Build a static image rather than a shared library one. Static is
++ easier for testing as there is no need to worry about library
++ paths being confused and therefore running the wrong code. Shared
++ is what is needed, in most cases, when building for use by other
++ programs.
++
++So for a static build
++---------------------
++
++$ pi-util/conf_native.sh --noshared
++
++$ make -j8 -C out/<wherever the script said it was building to>
++
++You can now run ffmpeg directly from where it was built
++
++For a shared build
++------------------
++
++$ pi-util/conf_native.sh
++
++You will normally want an install target if shared. Note that the script has
++set this up to be generated in out/<builddir>/install, you don't have to worry
++about overwriting your system libs.
++
++$ make -j8 -C out/<builddir> install
++
++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
++built or install the image on the system - you have to be careful to get rid
++of all other ffmpeg libs or confusion may result. There is a little script
++that wipes all other versions - obviously use with care!
++
++$ sudo pi-util/clean_usr_libs.sh
++
++Then simply copying from the install to /usr works
++
++$ sudo cp -r out/<builddir>/install/* /usr
++
++
+--- /dev/null
++++ b/pi-util/NOTES.txt
+@@ -0,0 +1,69 @@
++Notes on the hevc_rpi decoder & associated support code
++-------------------------------------------------------
++
++There are 3 main parts to the existing code:
++
++1) The decoder - this is all in libavcodec as rpi_hevc*.
++
++2) A few filters to deal with Sand frames and a small patch to
++automatically select the sand->i420 converter when required.
++
++3) A kludge in ffmpeg.c to display the decoded video. This could & should
++be converted into a proper ffmpeg display module.
++
++
++Decoder
++-------
++
++The decoder is a modified version of the existing ffmpeg hevc decoder.
++Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
++More complex bitstreams can be up to ~200% faster but particularly easy
++streams can cut its advantage down to ~50%. This means that a Pi3+ can
++display nearly all 8-bit 1080p30 streams and with some overclocking it can
++display most lower bitrate 10-bit 1080p30 streams - this latter case is
++not helped by the requirement to downsample to 8-bit before display on a
++Pi.
++
++It has had co-processor offload added for inter-pred and large block
++residual transform. Various parts have had optimized ARM NEON assembler
++added and the existing ARM asm sections have been profiled and
++re-optimized for A53. The main C code has been substantially reworked at
++its lower levels in an attempt to optimize it and minimize memory
++bandwidth. To some extent code paths that deal with frame types that it
++doesn't support have been pruned.
++
++It outputs frames in Broadcom Sand format. This is a somewhat annoying
++layout that doesn't fit into ffmpeg's standard frame descriptions. It has
++vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
++the stripe followed by interleaved U & V, that is then followed by the Y
++for the next stripe, etc. The final stripe is always padded to
++stripe-width. This is used in an attempt to help with cache locality and
++cut down on the number of dram bank switches. It is annoying to use for
++inter-pred with conventional processing but the way the Pi QPU (which is
++used for inter-pred) works means that it has negligible downsides here and
++the improved memory performance exceeds the overhead of the increased
++complexity in the rest of the code.
++
++Frames must be allocated out of GPU memory (as otherwise they can't be
++accessed by the co-processors). Utility functions (in rpi_zc.c) have been
++written to make this easier. As the frames are already in GPU memory they
++can be displayed by the Pi h/w without any further copying.
++
++
++Known non-features
++------------------
++
++Frame allocation should probably be done in some other way in order to fit
++into the standard framework better.
++
++Sand frames are currently declared as software frames; there is an
++argument that they should be hardware frames but they aren't really.
++
++There must be a better way of auto-selecting the hevc_rpi decoder over the
++normal s/w hevc decoder, but I became confused by the existing h/w
++acceleration framework and what I wanted to do didn't seem to fit in
++neatly.
++
++Display should be a proper device rather than a kludge in ffmpeg.c
++
++
+--- /dev/null
++++ b/pi-util/TESTMESA.txt
+@@ -0,0 +1,82 @@
++# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
++
++# These assume that the drm_mmal test for Sand8 has been built on this Pi
++# as build relies on many of the same files
++
++# 1st get everything required to build ffmpeg
++# If sources aren't already enabled on your Pi then enable them
++sudo su
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
++mv /tmp/sources.list /etc/apt/
++mv /tmp/raspi.list /etc/apt/sources.list.d/
++apt update
++
++# Get dependencies
++sudo apt build-dep ffmpeg
++
++sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
++
++# Enable H265 V4L2 request decoder
++sudo su
++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
++# You may also want to add more CMA if you are going to try 4k videos
++# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
++# dtoverlay=vc4-fkms-v3d,cma-512
++reboot
++# Check it has turned up
++ls -la /dev/video*
++# This should include video19
++# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19
++
++# Currently on the Pi the linux headers from the debian distro don't match
++# the kernel that we ship and we need to update them - hopefully this step
++# will be unneeded in the future
++sudo apt install git bc bison flex libssl-dev make
++git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
++cd linux
++KERNEL=kernel7l
++make bcm2711_defconfig
++make headers_install
++sudo cp -r usr/include/linux /usr/include
++cd ..
++
++# Config - this builds a statically linked ffmpeg which is easier for testing
++pi-util/conf_native.sh --noshared
++
++# Build (this is a bit dull)
++# If you want to poke the source then libavdevice/egl_vout.c contains the
++# output code -
++cd out/armv7-static-rel
++
++# Check that you have actually configured V4L2 request
++grep HEVC_V4L2REQUEST config.h
++# You are hoping for
++# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
++# if you get 0 then the config has failed
++
++make -j6
++
++# Grab test streams
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
++
++# Test i420 output (works currently)
++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
++
++# Test Sand8 output - doesn't currently work but should once you have
++# Sand8 working in drm_mmal. I can't guarantee that this will work as
++# I can't test this path with a known working format, but the debug looks
++# good. If this doesn't work & drm_mmal does with sand8 then come back to me
++# The "show_all 1" forces vout to display every frame otherwise it drops any
++# frame that would cause it to block
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
++
++# Test Sand30 - doesn't currently work
++# (Beware that when FFmpeg errors out it often leaves your terminal window
++# in a state where you need to reset it)
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
++
++
++
+--- /dev/null
++++ b/pi-util/clean_usr_libs.sh
+@@ -0,0 +1,26 @@
++set -e
++LIBDIR=/usr/lib/arm-linux-gnueabihf
++rm -f "$LIBDIR"/libavcodec.*
++rm -f "$LIBDIR"/libavdevice.*
++rm -f "$LIBDIR"/libavfilter.*
++rm -f "$LIBDIR"/libavformat.*
++rm -f "$LIBDIR"/libavutil.*
++rm -f "$LIBDIR"/libswresample.*
++rm -f "$LIBDIR"/libswscale.*
++LIBDIR=/usr/lib/arm-linux-gnueabihf/neon/vfp
++rm -f "$LIBDIR"/libavcodec.*
++rm -f "$LIBDIR"/libavdevice.*
++rm -f "$LIBDIR"/libavfilter.*
++rm -f "$LIBDIR"/libavformat.*
++rm -f "$LIBDIR"/libavutil.*
++rm -f "$LIBDIR"/libswresample.*
++rm -f "$LIBDIR"/libswscale.*
++LIBDIR=/usr/lib/aarch64-linux-gnu
++rm -f "$LIBDIR"/libavcodec.*
++rm -f "$LIBDIR"/libavdevice.*
++rm -f "$LIBDIR"/libavfilter.*
++rm -f "$LIBDIR"/libavformat.*
++rm -f "$LIBDIR"/libavutil.*
++rm -f "$LIBDIR"/libswresample.*
++rm -f "$LIBDIR"/libswscale.*
++
+--- /dev/null
++++ b/pi-util/conf_arm64_native.sh
+@@ -0,0 +1,45 @@
++echo "Configure for ARM64 native build"
++
++#RPI_KEEPS="-save-temps=obj"
++
++SHARED_LIBS="--enable-shared"
++if [ "$1" == "--noshared" ]; then
++ SHARED_LIBS="--disable-shared"
++ echo Static libs
++ OUT=out/arm64-static-rel
++else
++ echo Shared libs
++ OUT=out/arm64-shared-rel
++fi
++
++mkdir -p $OUT
++cd $OUT
++
++A=aarch64-linux-gnu
++USR_PREFIX=`pwd`/install
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++../../configure \
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ --disable-stripping\
++ --disable-thumb\
++ --disable-mmal\
++ --enable-sand\
++ --enable-v4l2-request\
++ --enable-libdrm\
++ --enable-epoxy\
++ --enable-libudev\
++ --enable-vout-drm\
++ --enable-vout-egl\
++ $SHARED_LIBS\
++ --extra-cflags="-ggdb"
++
++# --enable-decoder=hevc_rpi\
++# --enable-extra-warnings\
++# --arch=armv71\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+--- /dev/null
++++ b/pi-util/conf_h265.2016.csv
+@@ -0,0 +1,195 @@
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
+--- /dev/null
++++ b/pi-util/conf_h265.2016_HEVC_v1.csv
+@@ -0,0 +1,147 @@
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+--- /dev/null
++++ b/pi-util/conf_h265.csv
+@@ -0,0 +1,144 @@
++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+--- /dev/null
++++ b/pi-util/conf_native.sh
+@@ -0,0 +1,106 @@
++# Configure a native FFmpeg build in a per-configuration directory under ./out.
++# Usage: conf_native.sh [--noshared] [--mmal]
++#   --noshared  configure with --disable-shared instead of --enable-shared
++#   --mmal      enable MMAL/rpi support using headers and libs under /opt/vc
++# Run from the top of the source tree: the current directory is taken as
++# FFSRC, and ./RELEASE and ./configure are read from it.
++echo "Configure for native build"
++
++FFSRC=`pwd`
++MC=`dpkg --print-architecture`
++BUILDBASE=$FFSRC/out
++
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++NOSHARED=
++MMAL=
++
++while [ "$1" != "" ] ; do
++    case $1 in
++        --noshared)
++            NOSHARED=1
++            ;;
++        --mmal)
++            MMAL=1
++            ;;
++        *)
++            echo "Usage $0: [--noshared] [--mmal]"
++            exit 1
++            ;;
++    esac
++    shift
++done
++
++
++MCOPTS=
++RPI_INCLUDES=
++RPI_LIBDIRS=
++RPI_DEFINES=
++RPI_EXTRALIBS=
++
++# A = triplet used for the lib/ and include/ subdirectories,
++# B = short architecture name used in the output directory name.
++# Plain "=" is used in the tests: "==" is a bashism and this script has no
++# shebang, so it may be interpreted by a POSIX sh such as dash.
++if [ "$MC" = "arm64" ]; then
++    echo "M/C aarch64"
++    A=aarch64-linux-gnu
++    B=arm64
++elif [ "$MC" = "armhf" ]; then
++    echo "M/C armv7"
++    A=arm-linux-gnueabihf
++    B=armv7
++    MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
++    RPI_DEFINES=-mfpu=neon-vfpv4
++else
++    echo Unexpected architecture $MC
++    exit 1
++fi
++
++if [ $MMAL ]; then
++    RPI_OPT_VC=/opt/vc
++    RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++    RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
++    RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
++    RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
++    RPIOPTS="--enable-mmal --enable-rpi"
++else
++    RPIOPTS="--disable-mmal --enable-sand"
++fi
++
++# C = distribution codename, V = FFmpeg version; both go into the output dir name
++C=`lsb_release -sc`
++V=`cat RELEASE`
++
++SHARED_LIBS="--enable-shared"
++if [ $NOSHARED ]; then
++    SHARED_LIBS="--disable-shared"
++    OUT=$BUILDBASE/$B-$C-$V-static-rel
++    echo Static libs
++else
++    echo Shared libs
++    OUT=$BUILDBASE/$B-$C-$V-shared-rel
++fi
++
++USR_PREFIX=$OUT/install
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++echo Destination directory: $OUT
++mkdir -p "$OUT" || exit 1
++# Nothing under here need worry git - including this .gitignore!
++echo "**" > "$BUILDBASE/.gitignore"
++# Never run configure from the wrong directory if the cd fails
++cd "$OUT" || exit 1
++
++# $MCOPTS, $SHARED_LIBS and $RPIOPTS are deliberately unquoted: each may hold
++# several options that must be split into separate arguments.
++"$FFSRC/configure" \
++    --prefix="$USR_PREFIX" \
++    --libdir="$LIB_PREFIX" \
++    --incdir="$INC_PREFIX" \
++    $MCOPTS \
++    --disable-stripping \
++    --disable-thumb \
++    --enable-v4l2-request \
++    --enable-libdrm \
++    --enable-vout-egl \
++    --enable-vout-drm \
++    $SHARED_LIBS \
++    $RPIOPTS \
++    --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES" \
++    --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES" \
++    --extra-ldflags="$RPI_LIBDIRS" \
++    --extra-libs="$RPI_EXTRALIBS" \
++    --extra-version="rpi"
++
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+--- /dev/null
++++ b/pi-util/ffconf.py
+@@ -0,0 +1,215 @@
++#!/usr/bin/env python3
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++CODEC_HEVC_RPI = 1
++HWACCEL_RPI = 2
++HWACCEL_DRM = 3
++HWACCEL_VAAPI = 4
++
++def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
++    """Decode one conformance stream and compare its MD5 with the reference.
++
++    fileroot    -- directory holding es_file and md5_file
++    srcname     -- test name; leading "dir/" components are mirrored under /tmp
++    es_file     -- elementary stream to decode, relative to fileroot
++    md5_file    -- reference MD5 file, relative to fileroot
++    pix         -- "8", "10" or "12" selects a yuv420p output format on the
++                   hwaccel path; any other value adds no -pix_fmt option
++    dectype     -- HWACCEL_RPI / HWACCEL_DRM / HWACCEL_VAAPI select a hwaccel;
++                   anything else decodes with `vcodec` and no hwaccel
++    vcodec      -- decoder name used when no hwaccel is selected
++    ffmpeg_exec -- path of the ffmpeg executable
++
++    Returns 0 on match, 1 on mismatch, 2 if no decoded MD5 could be read and
++    3 if no reference MD5 could be read.  ffmpeg's output and the verdict are
++    written to <name>.log in the /tmp directory used for this test.
++    """
++    hwaccel = {HWACCEL_RPI: "rpi", HWACCEL_DRM: "drm", HWACCEL_VAAPI: "vaapi"}.get(dectype, "")
++
++    fmt = {"8": "yuv420p", "10": "yuv420p10le", "12": "yuv420p12le"}.get(pix)
++    pix_fmt = ["-pix_fmt", fmt] if fmt else []
++
++    # All but the last component of srcname become directories under /tmp
++    names = srcname.split('/')
++    tmp_root = os.path.join("/tmp", *names[:-1])
++    name = names[-1]
++    os.makedirs(tmp_root, exist_ok=True)
++
++    # Remove any stale result so that a failed decode cannot be mistaken
++    # for a fresh one; a missing file is the normal case.
++    dec_file = os.path.join(tmp_root, name + ".dec.md5")
++    try:
++        os.remove(dec_file)
++    except OSError:
++        pass
++
++    src_path = os.path.join(fileroot, es_file)
++    # Unaligned needed for cropping conformance
++    if hwaccel:
++        ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", src_path] + pix_fmt + ["-f", "md5", dec_file]
++    else:
++        ffargs = [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", src_path, "-f", "md5", dec_file]
++
++    # The with-block closes the log even if ffmpeg cannot be started
++    with open(os.path.join(tmp_root, name + ".log"), "wt") as flog:
++        subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
++
++        m1 = None
++        m2 = None
++        try:
++            # Reference: first 32-hex-digit run found on any line
++            with open(os.path.join(fileroot, md5_file)) as f:
++                for line in f:
++                    m1 = re.search("[0-9a-f]{32}", line.lower())
++                    if m1:
++                        break
++
++            # Decoded: only the first line of ffmpeg's md5 output is examined
++            with open(dec_file) as f:
++                m2 = re.search("[0-9a-f]{32}", f.readline())
++        except (OSError, UnicodeDecodeError):
++            # An unreadable file is reported below as a missing MD5
++            pass
++
++        if m1 and m2 and m1.group() == m2.group():
++            print("Match: " + m1.group(), file=flog)
++            rv = 0
++        elif not m1:
++            print("****** Cannot find m1", file=flog)
++            rv = 3
++        elif not m2:
++            print("****** Cannot find m2", file=flog)
++            rv = 2
++        else:
++            print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
++            rv = 1
++    return rv
++
++def scandir(root):
++    """Build a test list from the subdirectories of `root`.
++
++    Returns one (1, dirname, es_file, md5_file) tuple per subdirectory,
++    ordered case-insensitively by name.  es_file is the last *.bit / *.bin
++    file seen; md5_file is the first MD5-looking file seen unless a later
++    one has a base name ending in "yuv".  "?" marks a file that was not found.
++    """
++    entries = []
++    for name in sorted(os.listdir(root), key=str.lower):
++        test_path = os.path.join(root, name)
++        if not S_ISDIR(os.stat(test_path).st_mode):
++            continue
++
++        es_file = "?"
++        md5_file = "?"
++        for fname in os.listdir(test_path):
++            base, ext = os.path.splitext(fname)
++            # Hidden files never take part
++            if base.startswith('.'):
++                continue
++            if ext in (".bit", ".bin"):
++                es_file = fname
++            elif ext == ".md5" or (ext == ".txt" and (base.endswith("_md5") or base.endswith("md5sum"))):
++                # Keep the first candidate, but let a "...yuv" one replace it
++                if md5_file == "?" or base.endswith("yuv"):
++                    md5_file = fname
++        entries.append((1, name, es_file, md5_file))
++    return entries
++
++def runtest(name, tests):
++    """Return True if test `name` is selected by the filter list `tests`.
++
++    An empty filter selects everything.  Otherwise a filter matches when
++    `name` starts with it, or when it follows a "/" anywhere in `name`.
++    """
++    if not tests:
++        return True
++    return any(name.startswith(t) or ("/" + t) in name for t in tests)
++
++def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
++    """Run every selected test in `csva` and print a summary.
++
++    csva -- rows of [expectation, name, es_file, md5_file[, pix]]
++            expectation 0 skips the row; 2 means a mismatch is expected and
++            3 means a missing decoded MD5 is expected.  The fifth column is
++            optional: rows written by --csvgen have only four.
++    tests -- name filters, see runtest(); empty runs everything
++    test_root, vcodec, dectype, ffmpeg_exec -- passed through to testone()
++
++    Prints one line per test, then either the lists of unexpected
++    failures/successes or the ok/failed totals.
++    """
++    unx_failures = []
++    unx_success = []
++    failures = 0
++    successes = 0
++    for a in csva:
++        # csv.reader yields [] for a blank line; there is nothing to run
++        if not a:
++            continue
++        exp_test = int(a[0])
++        if not (exp_test and runtest(a[1], tests)):
++            continue
++
++        name = a[1]
++        # Missing pix column: testone() then adds no -pix_fmt option
++        pix = a[4] if len(a) > 4 else ""
++        print ("==== ", name, end="")
++        sys.stdout.flush()
++
++        rv = testone(os.path.join(test_root, name), name, a[2], a[3], pix, dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
++        if (rv == 0):
++            successes += 1
++        else:
++            failures += 1
++
++        if (rv == 0):
++            if exp_test == 2:
++                print(": * OK *")
++                unx_success.append(name)
++            else:
++                print(": ok")
++        elif exp_test == 2 and rv == 1:
++            print(": fail")
++        elif exp_test == 3 and rv == 2:
++            # Call an expected "crash" an abort
++            print(": abort")
++        else:
++            unx_failures.append(name)
++            if rv == 1:
++                print(": * FAIL *")
++            elif (rv == 2) :
++                print(": * CRASH *")
++            elif (rv == 3) :
++                print(": * MD5 MISSING *")
++            else :
++                print(": * BANG *")
++
++    if unx_failures or unx_success:
++        print("Unexpected Failures:", unx_failures)
++        print("Unexpected Success: ", unx_success)
++    else:
++        print("All tests normal:", successes, "ok,", failures, "failed")
++
++
++# csv dialect handed to csv.reader when the conformance test list is loaded.
++class ConfCSVDialect(csv.Dialect):
++ delimiter = ','
++ doublequote = True
++ lineterminator = '\n'
++ quotechar='"'
++ quoting = csv.QUOTE_MINIMAL
++ # Whitespace directly after a delimiter is ignored, so "a, b" reads as ["a", "b"]
++ skipinitialspace = True
++ # Malformed CSV input raises csv.Error instead of being accepted
++ strict = True
++
++if __name__ == '__main__':
++    # Command-line entry point: either print a generated test list for
++    # --test_root (--csvgen) or run the tests listed in --csv.
++
++    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
++    argp.add_argument("tests", nargs='*')
++    argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
++    argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
++    argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
++    argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
++    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
++    argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++    argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
++    argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
++    args = argp.parse_args()
++
++    if args.csvgen:
++        csv.writer(sys.stdout).writerows(scandir(args.test_root))
++        # sys.exit rather than the site-provided exit() helper
++        sys.exit(0)
++
++    # newline='' lets the csv module do its own line-ending handling
++    with open(args.csv, 'rt', newline='') as csvfile:
++        csva = list(csv.reader(csvfile, ConfCSVDialect()))
++
++    # Auto-detect the decoder from the device nodes / modules present ...
++    dectype = CODEC_HEVC_RPI
++    if os.path.exists("/dev/rpivid-hevcmem"):
++        dectype = HWACCEL_RPI
++    if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
++        dectype = HWACCEL_DRM
++
++    # ... then let an explicit command-line switch override the detection
++    if args.pi4:
++        dectype = HWACCEL_RPI
++    elif args.drm:
++        dectype = HWACCEL_DRM
++    elif args.vaapi:
++        dectype = HWACCEL_VAAPI
++
++    doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
++
+--- /dev/null
++++ b/pi-util/ffperf.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++    """Timing record for one stream: elapsed, user and system seconds."""
++
++    # Relative elapsed-time difference below which two records count as equal
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        # Without a dict the attributes are left unset; time_file() fills them in
++        if stats_dict is not None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        """Format elapsed time, CPU time and CPU time as a percentage of elapsed."""
++        ctime = self.sys + self.user
++        # A zero elapsed time (e.g. from a hand-edited CSV) reports 0% rather than dividing by zero
++        pcent = (ctime * 100.0) / self.elapsed if self.elapsed else 0.0
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, pcent)
++
++    def dict(self):
++        """Return the record as a dict keyed like the CSV columns."""
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        """True if the elapsed times differ by less than close_threshold, relative to self."""
++        if not self.elapsed:
++            return not other.elapsed
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    # staticmethod so the call works on an instance as well as on the class
++    @staticmethod
++    def time_file(name, prefix, ffmpeg="./ffmpeg"):
++        """Run ffmpeg on prefix+name and return a tstats with the measured times.
++
++        ffmpeg's stdout/stderr go to the module-level `flog`, which the
++        caller must have opened beforehand.
++        """
++        stats = tstats()
++        stats.name = name
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
++        cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
++                                  "-vcodec", "hevc_rpi",
++                                  "-t", "30", "-i", prefix + name,
++                                  "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog)
++        # wait4 rather than Popen.wait: it also returns the child's rusage
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):
++    """Return the longest string that both s1 and s2 start with.
++
++    Returns "" when either argument is empty; an index-based loop would
++    leave its index unbound in that case.
++    """
++    for i, (c1, c2) in enumerate(zip(s1, s2)):
++        if c1 != c2:
++            return s1[:i]
++    # No mismatch within the shorter string: all of it is the common prefix
++    return s1[:min(len(s1), len(s2))]
++
++def main():
++    """Time each stream --repeat times and write the fastest run to --csv_out.
++
++    Streams come from the command line or, if none are given, from the names
++    in --csv_in.  When --csv_in holds an entry for a stream, that earlier
++    result is printed next to the new one.  Returns the process exit status.
++    """
++    global flog
++
++    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
++To blank the screen before starting use "xdg-screensaver activate"
++(For some reason this doesn't seem to work from within python).
++""")
++
++    argp.add_argument("streams", nargs='*')
++    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++    argp.add_argument("--csv_in", help="CSV input filename")
++    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
++    argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
++    argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
++
++    args = argp.parse_args()
++
++    # Read the earlier results BEFORE opening the output file: opening it
++    # truncates it, which would wipe the input if both name the same file.
++    stats_in = {}
++    if args.csv_in is not None:
++        with open(args.csv_in, 'r', newline='') as f_in:
++            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++    streams = args.streams
++    if not streams:
++        if not stats_in:
++            print ("No source streams specified")
++            return 1
++        prefix = "" if args.prefix is None else args.prefix
++        streams = [k for k in stats_in]
++    elif args.prefix is not None:
++        prefix = args.prefix
++    else:
++        # Derive the prefix: the shared leading directory of all streams
++        prefix = streams[0]
++        for f in streams[1:]:
++            prefix = common_prefix(prefix, f)
++        pp = prefix.rpartition(os.sep)
++        prefix = pp[0] + pp[1]
++        streams = [s[len(prefix):] for s in streams]
++
++    # flog is module-level because tstats.time_file() writes ffmpeg output to it
++    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
++    try:
++        with open(args.csv_out, 'w', newline='') as f_out:
++            csv_out = csv.DictWriter(f_out, ["name", "elapsed", "user", "sys"])
++            csv_out.writeheader()
++
++            # "~" per path separator sorts deeper paths after shallower ones
++            for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++                print ("====", f)
++
++                # Start from a sentinel that any real run will beat
++                t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++                for i in range(args.repeat):
++                    t = tstats.time_file(f, prefix, args.ffmpeg)
++                    print ("...", t.times_str())
++                    if t0 > t:
++                        t0 = t
++
++                if t0.name in stats_in:
++                    pstat = stats_in[t0.name]
++                    print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++                csv_out.writerow(t0.dict())
++
++                print ()
++    finally:
++        flog.close()
++
++    return 0
++
++
++if __name__ == '__main__':
++    sys.exit(main())
++
+--- /dev/null
++++ b/pi-util/genpatch.sh
+@@ -0,0 +1,35 @@
++# Write ../ffmpeg-<version>-<patch_tag>.patch as the diff against tag n<version>,
++# where <version> is the content of ./RELEASE.
++# Usage: genpatch.sh [--notag] <patch_tag>
++# Without --notag the script stops if "git diff" reports changes, and tags
++# the current commit as pi/<version>/<patch_tag> before generating the patch.
++set -e
++
++# Plain "=" is used in the tests below: "==" is a bashism and this script
++# has no shebang, so it may be interpreted by a POSIX sh such as dash.
++NOPATCH=
++if [ "$1" = "--notag" ]; then
++    shift
++    NOPATCH=1
++fi
++
++if [ "$1" = "" ]; then
++    echo Usage: $0 [--notag] \<patch_tag\>
++    echo e.g.: $0 mmal_4
++    exit 1
++fi
++
++VERSION=`cat RELEASE`
++if [ "$VERSION" = "" ]; then
++    echo Can\'t find version RELEASE
++    exit 1
++fi
++
++PATCHFILE=../ffmpeg-$VERSION-$1.patch
++
++if [ $NOPATCH ]; then
++    echo Not tagged
++else
++    # Only continue if we are all committed
++    # (--exit-code makes git diff fail on differences; set -e then aborts)
++    git diff --name-status --exit-code
++
++    PATCHTAG=pi/$VERSION/$1
++    echo Tagging: $PATCHTAG
++
++    git tag "$PATCHTAG"
++fi
++echo Generating patch: $PATCHFILE
++git diff "n$VERSION" -- > "$PATCHFILE"
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,23 @@
++#!/usr/bin/env python
++
++# Usage
++# make_array file.bin
++# Produces file.h with array of bytes.
++#
++# Runs under both Python 2 and Python 3.
++from __future__ import print_function
++
++import os
++import sys
++
++for file in sys.argv[1:]:
++    # splitext instead of split('.'): a path with more than one dot, such as
++    # "./file.bin", would otherwise fail to unpack into two parts
++    prefix, suffix = os.path.splitext(file)
++    assert suffix == '.bin'
++    # The array is named after the file, without directory or extension
++    name = os.path.basename(prefix)
++    print('Converting', file)
++    # Text mode: the generated header is C source
++    with open(prefix + '.h', 'w') as out:
++        print('static const unsigned char', name, '[] = {', file=out)
++        with open(file, 'rb') as fd:
++            i = 0
++            # bytearray yields integers on both Python 2 and Python 3
++            for byte in bytearray(fd.read()):
++                print('0x%02x, ' % byte, end=' ', file=out)
++                i = i + 1
++                # Eight bytes per row, each row tagged with its starting offset
++                if i % 8 == 0:
++                    print(' // %04x' % (i - 8), file=out)
++        print('};', file=out)
++
+--- /dev/null
++++ b/pi-util/mkinst.sh
+@@ -0,0 +1,5 @@
++# Run "make install", then copy everything under ./install into the sysroot
++# directory below.  set -e stops the script at the first failing command.
++set -e
++
++make install
++
++# NOTE(review): hard-coded relative destination; presumably specific to one
++# developer's directory layout - confirm before reuse.
++cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
+--- /dev/null
++++ b/pi-util/patkodi.sh
+@@ -0,0 +1,9 @@
++# Build here, write the diff against xbmc/release/4.3-kodi into the Kodi tree
++# as pfcd_hevc_optimisations.patch, then run make in Kodi's ffmpeg depends
++# directory and "make install" in its build directory.
++# set -e stops the script at the first failing command.
++set -e
++# NOTE(review): absolute path into one user's home directory - adjust locally
++KODIBASE=/home/jc/rpi/kodi/xbmc
++JOBS=-j20
++make $JOBS
++git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
++make -C $KODIBASE/build install
++
++
+--- /dev/null
++++ b/pi-util/perfcmp.py
+@@ -0,0 +1,101 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++    """Timing record for one stream: elapsed, user and system seconds."""
++
++    # Relative elapsed-time difference below which two records count as equal
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        # Without a dict the attributes are left unset; time_file() fills them in
++        if stats_dict is not None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        """Format elapsed time, CPU time and CPU time as a percentage of elapsed."""
++        ctime = self.sys + self.user
++        # A zero elapsed time (e.g. from a hand-edited CSV) reports 0% rather than dividing by zero
++        pcent = (ctime * 100.0) / self.elapsed if self.elapsed else 0.0
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, pcent)
++
++    def dict(self):
++        """Return the record as a dict keyed like the CSV columns."""
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        """True if the elapsed times differ by less than close_threshold, relative to self."""
++        if not self.elapsed:
++            return not other.elapsed
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    # staticmethod so the call works on an instance as well as on the class
++    @staticmethod
++    def time_file(name, prefix):
++        """Run ./ffmpeg on prefix+name and return a tstats with the measured times."""
++        stats = tstats()
++        stats.name = name
++        # Nothing in this file defines `flog`; use it if some importer has
++        # set it at module level, otherwise discard ffmpeg's output.
++        log = globals().get("flog", subprocess.DEVNULL)
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC)
++        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
++                                  "-f", "null", os.devnull], bufsize=-1, stdout=log, stderr=log)
++        # wait4 rather than Popen.wait: it also returns the child's rusage
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC)
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):
++    """Return the longest string that both s1 and s2 start with.
++
++    Returns "" when either argument is empty; an index-based loop would
++    leave its index unbound in that case.
++    """
++    for i, (c1, c2) in enumerate(zip(s1, s2)):
++        if c1 != c2:
++            return s1[:i]
++    # No mismatch within the shorter string: all of it is the common prefix
++    return s1[:min(len(s1), len(s2))]
++
++def main():
++    """Compare the elapsed times recorded in two ffperf CSV files.
++
++    Prints one line per stream name found in either file: both elapsed
++    times, a "<"/">" bar whose length grows with the percentage difference,
++    and that percentage.  Names missing from either file are flagged "XX".
++    Returns 0.
++    """
++    parser = argparse.ArgumentParser(description="FFmpeg performance compare")
++
++    parser.add_argument("stream0", help="CSV to compare")
++    parser.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
++
++    opts = parser.parse_args()
++
++    def load(path):
++        # name -> tstats for every row of one CSV file
++        with open(path, 'r', newline='') as f_in:
++            return {row["name"]:tstats(row) for row in csv.DictReader(f_in)}
++
++    stats0 = load(opts.stream0)
++    stats1 = load(opts.stream1)
++
++    print (opts.stream0, "<<-->>", opts.stream1)
++    print ()
++
++    # One bar character per `thresh` percent, at most tc - 1 characters
++    thresh = 0.3
++    tc = 6
++
++    # "~" per path separator sorts deeper paths after shallower ones
++    all_names = sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower())
++    for f in all_names:
++        if f not in stats0 or f not in stats1:
++            print (" XX :", f)
++            continue
++
++        s0 = stats0[f]
++        s1 = stats1[f]
++
++        pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
++        nchar = min(tc - 1, int(abs(pcent) / thresh))
++
++        if nchar == 0:
++            cc = " -- "
++        elif pcent < 0:
++            cc = "<" * nchar + " " * (tc - nchar)
++        else:
++            cc = " " * (tc - nchar) + ">" * nchar
++
++        print ("%6.2f %s%6.2f (%+5.2f) : %s" %
++               (s0.elapsed, cc, s1.elapsed, pcent, f))
++
++    return 0
++
++
++if __name__ == '__main__':
++ exit(main())
++
+--- /dev/null
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
++# Copy the shader command header into TARGET_DIR and run qasm.py twice on the
++# HEVC shader source, writing its output to TARGET_DIR/shader.c and shader.h.
++# NOTE(review): TARGET_DIR and the qasm.py location are hard-coded relative
++# paths; presumably specific to one developer's checkout - confirm.
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ ../local/bin/qasm.py
++SRC_FILE=libavcodec/rpi_hevc_shader.qasm
++DST_BASE=shader
++
++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+--- /dev/null
++++ b/pi-util/v3dusage.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++# Parse the log file `logname` and print per-unit busy time, idle time and,
++# where present in the log, QPU cycle / TMU stall / L2 cache totals.
++# NOTE(review): uses Python 2 print statements, so this does not parse under
++# Python 3.
++def do_logparse(logname):
++
++ # Operation record: "<time>: [done ]<vpu0|vpu1|qpu1> <OPTYPE> cb:<hex> "
++ # group 1 = time, 2 = "done " when the op has finished, 3 = unit,
++ # 7 = op type, 8 = cb value
++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++ # v3d counter records; the trailing number is the counter value
++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++
++ # ttotal: accumulated busy time per unit, plus the 'idle' total
++ # tstart: start time of the operation currently open on each unit
++ ttotal = {'idle':0.0}
++ tstart = {}
++ qctotal = {}
++ qtstotal = {}
++ l2hits = {}
++ l2total = {}
++ time0 = None
++ idle_start = None
++ qpu_op_no = 0
++ op_count = 0
++
++ with open(logname, "rt") as infile:
++ for line in infile:
++ match = rmatch.match(line)
++ if match:
++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++ time = float(match.group(1))
++ unit = match.group(3)
++ # A record without the "done " marker is the start of an operation
++ opstart = not match.group(2)
++ optype = match.group(7)
++ hascb = match.group(8) != "0"
++
++ # qpu1 operations are accounted separately per sequence number:
++ # the unit becomes "qpu1.<n>"; n restarts at 0 after an op with a
++ # non-zero cb or an EXECUTE_SYNC, and otherwise counts up
++ if unit == 'qpu1':
++ unit = unit + "." + str(qpu_op_no)
++ if not opstart:
++ if hascb or optype == 'EXECUTE_SYNC':
++ qpu_op_no = 0
++ else:
++ qpu_op_no += 1
++
++ # Ignore sync type
++ if optype == 'EXECUTE_SYNC':
++ continue
++
++ # NOTE(review): a first timestamp of exactly 0.0 is falsy and would be
++ # treated as "not yet set" here and in the check after the loop
++ if not time0:
++ time0 = time
++
++ if opstart:
++ tstart[unit] = time;
++ elif unit in tstart:
++ op_count += 1
++ if not unit in ttotal:
++ ttotal[unit] = 0.0
++ ttotal[unit] += time - tstart[unit]
++ del tstart[unit]
++
++ # Idle time runs while no unit has an open operation
++ if not idle_start and not tstart:
++ idle_start = time
++ elif idle_start and tstart:
++ ttotal['idle'] += time - idle_start
++ idle_start = None
++
++ # Counter records are credited to the current "qpu1.<n>" unit
++ match = rqcycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qctotal:
++ qctotal[unit] = 0
++ qctotal[unit] += int(match.group(2))
++
++ match = rqtscycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qtstotal:
++ qtstotal[unit] = 0
++ qtstotal[unit] += int(match.group(2))
++
++ match = rl2hits.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in l2total:
++ l2total[unit] = 0
++ l2hits[unit] = 0
++ # Every L2 record adds to the total; only "hits" records add to hits
++ l2total[unit] += int(match.group(3))
++ if match.group(2) == "hits":
++ l2hits[unit] += int(match.group(3))
++
++
++ if not time0:
++ print "No v3d profile records found"
++ else:
++ # Logged span: first counted record to the last operation record parsed
++ tlogged = time - time0
++
++ print "Logged time:", tlogged, " Op count:", op_count
++ for unit in sorted(ttotal):
++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++ print
++ for unit in sorted(qctotal):
++ if not unit in qtstotal:
++ qtstotal[unit] = 0;
++ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++ if unit in l2total:
++ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++# Command-line entry point: takes one positional argument, the log file,
++# and passes it to do_logparse().
++if __name__ == '__main__':
++ # RawDescriptionHelpFormatter keeps the line breaks of the epilog below
++ argp = argparse.ArgumentParser(
++ formatter_class=argparse.RawDescriptionHelpFormatter,
++ description="QPU/VPU perf summary from VC logging",
++ epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++ vcgencmd set_logging level=0xc0
++ <command to profile>
++ sudo vcdbg log msg >& t.log
++ v3dusage.py t.log
++""")
++
++ argp.add_argument("logfile")
++ args = argp.parse_args()
++
++ do_logparse(args.logfile)
++
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)
+ AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
+ AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
+ AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
++AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDENCDSP) += llviddspenc.o
++AVCODECOBJS-$(CONFIG_VC1DSP) += vc1dsp.o
+ AVCODECOBJS-$(CONFIG_VP8DSP) += vp8dsp.o
+ AVCODECOBJS-$(CONFIG_VIDEODSP) += videodsp.o
+
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -121,6 +121,9 @@ static const struct {
+ #if CONFIG_HUFFYUV_DECODER
+ { "huffyuvdsp", checkasm_check_huffyuvdsp },
+ #endif
++ #if CONFIG_IDCTDSP
++ { "idctdsp", checkasm_check_idctdsp },
++ #endif
+ #if CONFIG_JPEG2000_DECODER
+ { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
+ #endif
+@@ -145,6 +148,9 @@ static const struct {
+ #if CONFIG_V210_ENCODER
+ { "v210enc", checkasm_check_v210enc },
+ #endif
++ #if CONFIG_VC1DSP
++ { "vc1dsp", checkasm_check_vc1dsp },
++ #endif
+ #if CONFIG_VP8DSP
+ { "vp8dsp", checkasm_check_vp8dsp },
+ #endif
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -60,6 +60,7 @@ void checkasm_check_hevc_add_res(void);
+ void checkasm_check_hevc_idct(void);
+ void checkasm_check_hevc_sao(void);
+ void checkasm_check_huffyuvdsp(void);
++void checkasm_check_idctdsp(void);
+ void checkasm_check_jpeg2000dsp(void);
+ void checkasm_check_llviddsp(void);
+ void checkasm_check_llviddspenc(void);
+@@ -73,6 +74,7 @@ void checkasm_check_sw_scale(void);
+ void checkasm_check_utvideodsp(void);
+ void checkasm_check_v210dec(void);
+ void checkasm_check_v210enc(void);
++void checkasm_check_vc1dsp(void);
+ void checkasm_check_vf_eq(void);
+ void checkasm_check_vf_gblur(void);
+ void checkasm_check_vf_hflip(void);
+--- /dev/null
++++ b/tests/checkasm/idctdsp.c
+@@ -0,0 +1,98 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/idctdsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
++
++typedef struct {
++ const char *name;
++ size_t offset;
++} test;
++
++#define RANDOMIZE_BUFFER16(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint16_t r = rnd() % 0x201 - 0x100; \
++ AV_WN16A(name##0 + i, r); \
++ AV_WN16A(name##1 + i, r); \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint8_t r = rnd(); \
++ name##0[i] = r; \
++ name##1[i] = r; \
++ } \
++ } while (0)
++
++static void check_add_put_clamped(void)
++{
++ /* Checks each clamped add/put routine: reference and new versions must leave identical destination buffers. Source buffers are only as big as needed, since any over-read won't affect results */
++ LOCAL_ALIGNED_16(int16_t, src0, [64]);
++ LOCAL_ALIGNED_16(int16_t, src1, [64]);
++ /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
++ LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
++ LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
++
++ AVCodecContext avctx = { 0 };
++ IDCTDSPContext h;
++
++ const test tests[] = {
++ IDCTDSP_TEST(add_pixels_clamped)
++ IDCTDSP_TEST(put_pixels_clamped)
++ IDCTDSP_TEST(put_signed_pixels_clamped)
++ };
++
++ ff_idctdsp_init(&h, &avctx);
++
++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++ void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset); /* same three-argument signature as declare_func_emms below */
++ if (check_func(func, "idctdsp.%s", tests[t].name)) {
++ declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
++ RANDOMIZE_BUFFER16(src, 64);
++ RANDOMIZE_BUFFER8(dst, 10 * 24);
++ call_ref(src0, dst0 + 24 + 8, 24); /* skip the one-row, 8-column border */
++ call_new(src1, dst1 + 24 + 8, 24);
++ if (memcmp(dst0, dst1, 10 * 24)) /* compares the borders too, so stray writes are detected */
++ fail();
++ bench_new(src1, dst1 + 24 + 8, 24);
++ }
++ }
++}
++
++void checkasm_check_idctdsp(void)
++{
++ check_add_put_clamped();
++ report("idctdsp");
++}
+--- /dev/null
++++ b/tests/checkasm/vc1dsp.c
+@@ -0,0 +1,452 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/vc1dsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
++
++typedef struct {
++ const char *name;
++ size_t offset;
++ int width;
++ int height;
++} test;
++
++typedef struct matrix {
++ size_t width;
++ size_t height;
++ float d[];
++} matrix;
++
++static const matrix T8 = { 8, 8, {
++ 12, 12, 12, 12, 12, 12, 12, 12,
++ 16, 15, 9, 4, -4, -9, -15, -16,
++ 16, 6, -6, -16, -16, -6, 6, 16,
++ 15, -4, -16, -9, 9, 16, 4, -15,
++ 12, -12, -12, 12, 12, -12, -12, 12,
++ 9, -16, 4, 15, -15, -4, 16, -9,
++ 6, -16, 16, -6, -6, 16, -16, 6,
++ 4, -9, 15, -16, 16, -15, 9, -4
++} };
++
++static const matrix T4 = { 4, 4, {
++ 17, 17, 17, 17,
++ 22, 10, -10, -22,
++ 17, -17, -17, 17,
++ 10, -22, 22, -10
++} };
++
++static const matrix T8t = { 8, 8, {
++ 12, 16, 16, 15, 12, 9, 6, 4,
++ 12, 15, 6, -4, -12, -16, -16, -9,
++ 12, 9, -6, -16, -12, 4, 16, 15,
++ 12, 4, -16, -9, 12, 15, -6, -16,
++ 12, -4, -16, 9, 12, -15, -6, 16,
++ 12, -9, -6, 16, -12, -4, 16, -15,
++ 12, -15, 6, 4, -12, 16, -16, 9,
++ 12, -16, 16, -15, 12, -9, 6, -4
++} };
++
++static const matrix T4t = { 4, 4, {
++ 17, 22, 17, 10,
++ 17, 10, -17, -22,
++ 17, -10, -17, 22,
++ 17, -22, 17, -10
++} };
++
++static matrix *new_matrix(size_t width, size_t height)
++{
++ matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
++ if (out == NULL) {
++ fprintf(stderr, "Memory allocation failure\n");
++ exit(EXIT_FAILURE);
++ }
++ out->width = width;
++ out->height = height;
++ return out;
++}
++
++static matrix *multiply(const matrix *a, const matrix *b)
++{
++ matrix *out;
++ if (a->width != b->height) {
++ fprintf(stderr, "Incompatible multiplication\n");
++ exit(EXIT_FAILURE);
++ }
++ out = new_matrix(b->width, a->height);
++ for (int j = 0; j < out->height; ++j)
++ for (int i = 0; i < out->width; ++i) {
++ float sum = 0;
++ for (int k = 0; k < a->width; ++k)
++ sum += a->d[j * a->width + k] * b->d[k * b->width + i];
++ out->d[j * out->width + i] = sum;
++ }
++ return out;
++}
++
++static void normalise(matrix *a)
++{
++ for (int j = 0; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p *= 64;
++ if (a->height == 4)
++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
++ else
++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
++ if (a->width == 4)
++ *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
++ else
++ *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
++ }
++}
++
++static void divide_and_round_nearest(matrix *a, float by)
++{
++ for (int j = 0; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p = rintf(*p / by);
++ }
++}
++
++static void tweak(matrix *a)
++{
++ for (int j = 4; j < a->height; ++j)
++ for (int i = 0; i < a->width; ++i) {
++ float *p = a->d + j * a->width + i;
++ *p += 1;
++ }
++}
++
++/* The VC-1 spec places restrictions on the values permitted at three
++ * different stages:
++ * - D: the input coefficients in frequency domain
++ * - E: the intermediate coefficients, inverse-transformed only horizontally
++ * - R: the fully inverse-transformed coefficients
++ *
++ * To fully cater for the ranges specified requires various intermediate
++ * values to be held to 17-bit precision; yet these conditions do not appear
++ * to be utilised in real-world streams. At least some assembly
++ * implementations have chosen to restrict these values to 16-bit precision,
++ * to accelerate the decoding of real-world streams at the cost of strict
++ * adherence to the spec. To avoid our test marking these as failures,
++ * reduce our random inputs.
++ */
++#define ATTENUATION 4
++
++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
++{
++ matrix *raw, *tmp, *D, *E, *R;
++ raw = new_matrix(width, height);
++ for (int i = 0; i < width * height; ++i)
++ raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
++ tmp = multiply(height == 8 ? &T8 : &T4, raw);
++ D = multiply(tmp, width == 8 ? &T8t : &T4t);
++ normalise(D);
++ divide_and_round_nearest(D, 1);
++ for (int i = 0; i < width * height; ++i) {
++ if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ }
++ E = multiply(D, width == 8 ? &T8 : &T4);
++ divide_and_round_nearest(E, 8);
++ for (int i = 0; i < width * height; ++i)
++ if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ av_free(E);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ R = multiply(height == 8 ? &T8t : &T4t, E);
++ tweak(R);
++ divide_and_round_nearest(R, 128);
++ for (int i = 0; i < width * height; ++i)
++ if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
++ /* Rare, so simply try again */
++ av_free(raw);
++ av_free(tmp);
++ av_free(D);
++ av_free(E);
++ av_free(R);
++ return generate_inverse_quantized_transform_coefficients(width, height);
++ }
++ av_free(raw);
++ av_free(tmp);
++ av_free(E);
++ av_free(R);
++ return D;
++}
++
++#define RANDOMIZE_BUFFER16(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint16_t r = rnd(); \
++ AV_WN16A(name##0 + i, r); \
++ AV_WN16A(name##1 + i, r); \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size) \
++ do { \
++ int i; \
++ for (i = 0; i < size; ++i) { \
++ uint8_t r = rnd(); \
++ name##0[i] = r; \
++ name##1[i] = r; \
++ } \
++ } while (0)
++
++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
++ do { \
++ uint8_t *p##0 = name##0, *p##1 = name##1; \
++ int i = (size); \
++ while (i-- > 0) { \
++ int x = 0x80 | (rnd() & 0x7F); \
++ x >>= rnd() % 9; \
++ if (rnd() & 1) \
++ x = -x; \
++ *p##1++ = *p##0++ = 0x80 + x; \
++ } \
++ } while (0)
++
++static void check_inv_trans_inplace(void)
++{
++ /* Inverse transform input coefficients are stored in a 16-bit buffer
++ * with row stride of 8 coefficients irrespective of transform size.
++ * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
++ * are stored in column-major order, and the outputs are written back
++ * to the input buffer, so we oversize it slightly to catch overruns. */
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
++
++ VC1DSPContext h;
++
++ ff_vc1dsp_init(&h);
++
++ if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
++ matrix *coeffs;
++ declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
++ RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
++ coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
++ for (int j = 0; j < 8; ++j)
++ for (int i = 0; i < 8; ++i) {
++ int idx = 8 + i * 8 + j;
++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
++ }
++ call_ref(inv_trans_in0 + 8);
++ call_new(inv_trans_in1 + 8);
++ if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
++ fail();
++ bench_new(inv_trans_in1 + 8);
++ av_free(coeffs);
++ }
++}
++
++static void check_inv_trans_adding(void)
++{
++ /* Inverse transform input coefficients are stored in a 16-bit buffer
++ * with row stride of 8 coefficients irrespective of transform size. */
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
++ LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
++
++ /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
++ * added with saturation to an array of unsigned 8-bit values. Oversize
++ * this by 8 samples left and right and one row above and below. */
++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
++ LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
++
++ VC1DSPContext h;
++
++ const test tests[] = {
++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
++ VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
++ };
++
++ ff_vc1dsp_init(&h);
++
++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++ void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
++ if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++ matrix *coeffs;
++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
++ RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
++ RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
++ coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
++ for (int j = 0; j < tests[t].height; ++j)
++ for (int i = 0; i < tests[t].width; ++i) {
++ int idx = j * 8 + i;
++ inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
++ }
++ call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
++ call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
++ if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
++ fail();
++ bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); /* input has no padding row here, so no +8 offset */
++ av_free(coeffs);
++ }
++ }
++}
++
++static void check_loop_filter(void)
++{
++ /* Deblocking filter buffers are big enough to hold a 16x16 block,
++ * plus 16 columns left and 4 rows above to hold filter inputs
++ * (depending on whether v or h neighbouring block edge, oversized
++ * horizontally to maintain 16-byte alignment) plus 16 columns and
++ * 4 rows below to catch write overflows */
++ LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
++ LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
++
++ VC1DSPContext h;
++
++ const test tests[] = {
++ VC1DSP_TEST(vc1_v_loop_filter4)
++ VC1DSP_TEST(vc1_h_loop_filter4)
++ VC1DSP_TEST(vc1_v_loop_filter8)
++ VC1DSP_TEST(vc1_h_loop_filter8)
++ VC1DSP_TEST(vc1_v_loop_filter16)
++ VC1DSP_TEST(vc1_h_loop_filter16)
++ };
++
++ ff_vc1dsp_init(&h);
++
++ for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++ void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
++ declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
++ if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++ for (int count = 1000; count > 0; --count) {
++ int pq = rnd() % 31 + 1;
++ RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
++ call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
++ call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
++ if (memcmp(filter_buf0, filter_buf1, 24 * 48))
++ fail();
++ }
++ }
++ for (int j = 0; j < 24; ++j)
++ for (int i = 0; i < 48; ++i)
++ filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
++ if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
++ if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
++ bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
++ }
++}
++
++#define TEST_UNESCAPE \
++ do { \
++ for (int count = 100; count > 0; --count) { \
++ escaped_offset = rnd() & 7; \
++ unescaped_offset = rnd() & 7; \
++ escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \
++ RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \
++ len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
++ len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
++ if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \
++ fail(); \
++ } \
++ } while (0)
++
++static void check_unescape(void)
++{
++ /* This appears to be a typical length of buffer in use */
++#define LOG2_UNESCAPE_BUF_SIZE 17
++#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
++ LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
++ LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
++
++ VC1DSPContext h;
++
++ ff_vc1dsp_init(&h);
++
++ if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
++ int len0, len1, escaped_offset, unescaped_offset, escaped_len;
++ declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
++
++ /* Test data which consists of escape sequences packed as tightly as possible */
++ for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
++ escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
++ TEST_UNESCAPE;
++
++ /* Test random data */
++ RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
++ TEST_UNESCAPE;
++
++ /* Test data with escape sequences at random intervals */
++ for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
++ int gap, gap_msb;
++ escaped1[x+0] = escaped0[x+0] = 0;
++ escaped1[x+1] = escaped0[x+1] = 0;
++ escaped1[x+2] = escaped0[x+2] = 3;
++ escaped1[x+3] = escaped0[x+3] = rnd() & 3;
++ gap_msb = 2u << (rnd() % 8);
++ gap = (rnd() &~ -gap_msb) | gap_msb;
++ x += gap;
++ }
++ TEST_UNESCAPE;
++
++ /* Test data which is known to contain no escape sequences */
++ memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
++ memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
++ TEST_UNESCAPE;
++
++ /* Benchmark the no-escape-sequences case */
++ bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
++ }
++}
++
++void checkasm_check_vc1dsp(void)
++{
++ check_inv_trans_inplace();
++ check_inv_trans_adding();
++ report("inv_trans");
++
++ check_loop_filter();
++ report("loop_filter");
++
++ check_unescape();
++ report("unescape_buffer");
++}
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -16,6 +16,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+ fate-checkasm-hevc_add_res \
+ fate-checkasm-hevc_idct \
+ fate-checkasm-hevc_sao \
++ fate-checkasm-idctdsp \
+ fate-checkasm-jpeg2000dsp \
+ fate-checkasm-llviddsp \
+ fate-checkasm-llviddspenc \
+@@ -27,6 +28,7 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp
+ fate-checkasm-sw_scale \
+ fate-checkasm-v210dec \
+ fate-checkasm-v210enc \
++ fate-checkasm-vc1dsp \
+ fate-checkasm-vf_blend \
+ fate-checkasm-vf_colorspace \
+ fate-checkasm-vf_eq \
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff b/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff
new file mode 100644
index 0000000..ab6f139
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/0005-fix-flags.diff
@@ -0,0 +1,22 @@
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+--- a/configure
++++ b/configure
+@@ -6471,11 +6471,9 @@ enabled mbedtls && { check_pkg
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+ ( enabled rpi ||
+- enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+- { ! enabled cross_compile &&
+- add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+- add_ldflags -L/opt/vc/lib/ &&
+- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
++ enabled mmal ) && { { add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
++ add_ldflags -L/opt/vc/lib/ &&
++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcsm -lvchostif -lvchiq_arm -lvcos; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+ enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch
new file mode 100644
index 0000000..f153827
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2001-configure-setup-for-OE-core-usage.patch
@@ -0,0 +1,82 @@
+From 01e738a8f1414acd0102e432bbc15b4e603fd956 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:34:20 -0600
+Subject: [PATCH] configure: setup for OE-core usage
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Add global CFLAGS and LDFLAGS so that when
+./configure runs its tests it is able to locate the
+proper headers and libs in a cross-compile environment.
+
+Add a new check for opengl. None of the above headers
+exist and we should also be using GLESv2.
+
+Update where compiler finds OMX_Core.h
+
+Only check that sdl2 version greater than 2.0.1
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ configure | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/configure b/configure
+index 723b81f1..0c7f2654 100755
+--- a/configure
++++ b/configure
+@@ -5746,6 +5746,9 @@ enable_weak_pic() {
+ }
+
+ enabled pic && enable_weak_pic
++# Set CFLAGS and LDFLAGS globally
++add_cflags -I${sysroot}/usr/include/ -I${sysroot}/usr/include/IL -I${sysroot}/usr/include/drm
++add_ldflags -L${sysroot}/usr/lib/
+
+ test_cc <<EOF || die "Symbol mangling check failed."
+ int ff_extern;
+@@ -6471,8 +6474,7 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt
+ die "ERROR: mbedTLS not found"; }
+ enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+ ( enabled rpi ||
+- enabled mmal ) && { { add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+- add_ldflags -L/opt/vc/lib/ &&
++ enabled mmal ) && { { add_cflags -I${sysroot}/usr/include/interface/vmcs_host/linux -I${sysroot}/usr/include/interface/vcos/pthreads -fgnu89-inline &&
+ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcsm -lvchostif -lvchiq_arm -lvcos; } ||
+ die "ERROR: mmal not found" &&
+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+@@ -6492,15 +6494,15 @@ enabled opengl && { check_lib opengl GL/glx.h glXGetProcAddress "-lGL
+ check_lib opengl windows.h wglGetProcAddress "-lopengl32 -lgdi32" ||
+ check_lib opengl OpenGL/gl3.h glGetError "-Wl,-framework,OpenGL" ||
+ check_lib opengl ES2/gl.h glGetError "-isysroot=${sysroot} -Wl,-framework,OpenGLES" ||
++ check_lib opengl GLES2/gl2.h glGetError "-lGLESv2" ||
+ die "ERROR: opengl not found."
+ }
+-enabled omx_rpi && { test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame ||
++enabled omx_rpi && { test_code cc IL/OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame ||
+ { ! enabled cross_compile &&
+- add_cflags -isystem/opt/vc/include/IL &&
+- test_code cc OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame; } ||
++ test_code cc IL/OMX_Core.h OMX_IndexConfigBrcmVideoRequestIFrame; } ||
+ die "ERROR: OpenMAX IL headers from raspberrypi/firmware not found"; } &&
+ enable omx
+-enabled omx && require_headers OMX_Core.h
++enabled omx && require_headers IL/OMX_Core.h
+ enabled openssl && { check_pkg_config openssl openssl openssl/ssl.h OPENSSL_init_ssl ||
+ check_pkg_config openssl openssl openssl/ssl.h SSL_library_init ||
+ check_lib openssl openssl/ssl.h OPENSSL_init_ssl -lssl -lcrypto ||
+@@ -6540,7 +6542,7 @@ fi
+
+ if enabled sdl2; then
+ SDL2_CONFIG="${cross_prefix}sdl2-config"
+- test_pkg_config sdl2 "sdl2 >= 2.0.1 sdl2 < 2.1.0" SDL_events.h SDL_PollEvent
++ test_pkg_config sdl2 "sdl2 >= 2.0.1" SDL_events.h SDL_PollEvent
+ if disabled sdl2 && "${SDL2_CONFIG}" --version > /dev/null 2>&1; then
+ sdl2_cflags=$("${SDL2_CONFIG}" --cflags)
+ sdl2_extralibs=$("${SDL2_CONFIG}" --libs)
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
new file mode 100644
index 0000000..43a9191
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch
@@ -0,0 +1,111 @@
+From be426ad76c3e486f1364dd292cf8e1c633c80e91 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:39:47 -0600
+Subject: [PATCH] libavdevice: opengl_enc.c update dynamic function loader
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+For meta-raspberrypi ffmpeg builds, do_compile fails
+when opengl is enabled. The reason is that
+glGetProcAddress is defined in neither GLES2/gl2.h
+nor GLES2/gl2ext.h.
+
+Define SelectedGetProcAddress as SDL_GL_GetProcAddress
+if sdl2 is included. If it is not, assign the function
+pointers at compile time rather than at runtime.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavdevice/opengl_enc.c | 44 ++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 40 insertions(+), 4 deletions(-)
+
+diff --git a/libavdevice/opengl_enc.c b/libavdevice/opengl_enc.c
+index 2bdb8da7..eabc1bf8 100644
+--- a/libavdevice/opengl_enc.c
++++ b/libavdevice/opengl_enc.c
+@@ -37,12 +37,13 @@
+ #include <OpenGL/gl3.h>
+ #elif HAVE_ES2_GL_H
+ #include <ES2/gl.h>
+-#else
+-#include <GL/gl.h>
+-#include <GL/glext.h>
+ #endif
+ #if HAVE_GLXGETPROCADDRESS
+ #include <GL/glx.h>
++#else
++#define GL_GLEXT_PROTOTYPES
++#include <GLES2/gl2.h>
++#include <GLES2/gl2ext.h>
+ #endif
+
+ #if CONFIG_SDL2
+@@ -493,8 +494,14 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+
+ #if HAVE_GLXGETPROCADDRESS
+ #define SelectedGetProcAddress glXGetProcAddress
++#define CAN_DYNAMIC_LOAD 1
+ #elif HAVE_WGLGETPROCADDRESS
+ #define SelectedGetProcAddress wglGetProcAddress
++#elif CONFIG_SDL2
++#define SelectedGetProcAddress SDL_GL_GetProcAddress
++#define CAN_DYNAMIC_LOAD 1
++#else
++#define CAN_DYNAMIC_LOAD 0
+ #endif
+
+ #define LOAD_OPENGL_FUN(name, type) \
+@@ -504,7 +511,8 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ return AVERROR(ENOSYS); \
+ }
+
+-#if CONFIG_SDL2
++#if CAN_DYNAMIC_LOAD
++#if CONFIG_SDL2
+ if (!opengl->no_window)
+ return opengl_sdl_load_procedures(opengl);
+ #endif
+@@ -534,9 +542,37 @@ static int av_cold opengl_load_procedures(OpenGLContext *opengl)
+ LOAD_OPENGL_FUN(glGetShaderInfoLog, FF_PFNGLGETSHADERINFOLOGPROC)
+ LOAD_OPENGL_FUN(glEnableVertexAttribArray, FF_PFNGLENABLEVERTEXATTRIBARRAYPROC)
+ LOAD_OPENGL_FUN(glVertexAttribPointer, FF_PFNGLVERTEXATTRIBPOINTERPROC)
++#else
++ procs->glActiveTexture = glActiveTexture;
++ procs->glGenBuffers = glGenBuffers;
++ procs->glDeleteBuffers = glDeleteBuffers;
++ procs->glBufferData = glBufferData;
++ procs->glBindBuffer = glBindBuffer;
++ procs->glGetAttribLocation = glGetAttribLocation;
++ procs->glGetUniformLocation = glGetUniformLocation;
++ procs->glUniform1f = glUniform1f;
++ procs->glUniform1i = glUniform1i;
++ procs->glUniformMatrix4fv = glUniformMatrix4fv;
++ procs->glCreateProgram = glCreateProgram;
++ procs->glDeleteProgram = glDeleteProgram;
++ procs->glUseProgram = glUseProgram;
++ procs->glLinkProgram = glLinkProgram;
++ procs->glGetProgramiv = glGetProgramiv;
++ procs->glGetProgramInfoLog = glGetProgramInfoLog;
++ procs->glAttachShader = glAttachShader;
++ procs->glCreateShader = glCreateShader;
++ procs->glDeleteShader = glDeleteShader;
++ procs->glCompileShader = glCompileShader;
++ procs->glShaderSource = glShaderSource;
++ procs->glGetShaderiv = glGetShaderiv;
++ procs->glGetShaderInfoLog = glGetShaderInfoLog;
++ procs->glEnableVertexAttribArray = glEnableVertexAttribArray;
++ procs->glVertexAttribPointer = (FF_PFNGLVERTEXATTRIBPOINTERPROC) glVertexAttribPointer;
++#endif
+
+ return 0;
+
++#undef CAN_DYNAMIC_LOAD
+ #undef SelectedGetProcAddress
+ #undef LOAD_OPENGL_FUN
+ }
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
new file mode 100644
index 0000000..2232c48
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2003-libavcodec-fix-v4l2_req_devscan.patch
@@ -0,0 +1,45 @@
+From 62c2f041890a6e20770350721a0a2138d0b38634 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Sat, 3 Dec 2022 23:35:51 -0600
+Subject: [PATCH] libavcodec: fix v4l2_req_devscan.h
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Fixes minor differences between v4l2_req_devscan.c
+and v4l2_req_devscan.h after all patches have been
+applied.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavcodec/v4l2_req_devscan.h | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+index 0baef365..cd9c49ac 100644
+--- a/libavcodec/v4l2_req_devscan.h
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -1,6 +1,8 @@
+ #ifndef _DEVSCAN_H_
+ #define _DEVSCAN_H_
+
++#include <stdint.h>
++
+ struct devscan;
+ struct decdev;
+ enum v4l2_buf_type;
+@@ -13,7 +15,8 @@ const char *decdev_video_path(const struct decdev *const dev);
+ enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
+ uint32_t decdev_src_pixelformat(const struct decdev *const dev);
+
+-const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++const struct decdev *devscan_find(struct devscan *const scan,
++ const uint32_t src_fmt_v4l2);
+
+ int devscan_build(void * const dc, struct devscan **pscan);
+ void devscan_delete(struct devscan **const pScan);
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
new file mode 100644
index 0000000..02c07de
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/files/2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch
@@ -0,0 +1,35 @@
+From 0dfb56e12fa709794525cda1471091f6699905d5 Mon Sep 17 00:00:00 2001
+From: Vincent Davis Jr <vince@underview.tech>
+Date: Thu, 8 Dec 2022 10:49:03 -0600
+Subject: [PATCH] libavcodec: omx replace /opt/vc path with /usr/lib
+
+Upstream-Status: Inappropriate
+
+RPI-Distro repo clones original ffmpeg and applies patches to enable
+Raspberry Pi support.
+
+Configure omx.c for OE usage, as libbcm_host.so
+and libopenmaxil.so are installed in a different
+location.
+
+Signed-off-by: Vincent Davis Jr <vince@underview.tech>
+---
+ libavcodec/omx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/libavcodec/omx.c b/libavcodec/omx.c
+index 0a6a3083..8c6e9193 100644
+--- a/libavcodec/omx.c
++++ b/libavcodec/omx.c
+@@ -141,7 +141,7 @@ static av_cold OMXContext *omx_init(void *logctx, const char *libname, const cha
+ {
+ static const char * const libnames[] = {
+ #if CONFIG_OMX_RPI
+- "/opt/vc/lib/libopenmaxil.so", "/opt/vc/lib/libbcm_host.so",
++ "/usr/lib/libopenmaxil.so", "/usr/lib/libbcm_host.so",
+ #else
+ "libOMX_Core.so", NULL,
+ "libOmxCore.so", NULL,
+--
+2.38.1
+
diff --git a/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
new file mode 100644
index 0000000..5a8ff8f
--- /dev/null
+++ b/recipes-multimedia/rpidistro-ffmpeg/rpidistro-ffmpeg_4.3.4.bb
@@ -0,0 +1,198 @@
+SUMMARY = "A complete, cross-platform solution to record, convert and stream audio and video."
+DESCRIPTION = "FFmpeg is the leading multimedia framework, able to decode, encode, transcode, \
+ mux, demux, stream, filter and play pretty much anything that humans and machines \
+ have created. It supports the most obscure ancient formats up to the cutting edge."
+HOMEPAGE = "https://www.ffmpeg.org/"
+SECTION = "libs"
+
+LICENSE = "GPL-2.0-or-later & LGPL-2.1-or-later & ISC & MIT & BSD-2-Clause & BSD-3-Clause & IJG"
+LICENSE:${PN} = "GPL-2.0-or-later"
+LICENSE:libavcodec = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavdevice = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavfilter = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavformat = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libavutil = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libpostproc = "GPL-2.0-or-later"
+LICENSE:libswresample = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE:libswscale = "${@bb.utils.contains('PACKAGECONFIG', 'gpl', 'GPL-2.0-or-later', 'LGPL-2.1-or-later', d)}"
+LICENSE_FLAGS = "commercial"
+
+LIC_FILES_CHKSUM = "file://COPYING.GPLv2;md5=b234ee4d69f5fce4486a80fdaf4a4263 \
+ file://COPYING.GPLv3;md5=d32239bcb673463ab874e80d47fae504 \
+ file://COPYING.LGPLv2.1;md5=bd7a443320af8c812e4c18d1b79df004 \
+ file://COPYING.LGPLv3;md5=e6a600fd5e1d9cbde2d983680233ad02"
+
+# Build fails when thumb is enabled: https://bugzilla.yoctoproject.org/show_bug.cgi?id=7717
+ARM_INSTRUCTION_SET:armv4 = "arm"
+ARM_INSTRUCTION_SET:armv5 = "arm"
+ARM_INSTRUCTION_SET:armv6 = "arm"
+# Should be API compatible with libav (which was a fork of ffmpeg)
+# libpostproc was previously packaged from a separate recipe
+PROVIDES = "ffmpeg libav libpostproc"
+RPROVIDES:${PN} = "${PROVIDES}"
+DEPENDS = "nasm-native"
+
+inherit autotools pkgconfig
+PACKAGECONFIG ??= "avdevice avfilter avcodec avformat swresample swscale postproc avresample ffplay \
+ v4l2 drm udev alsa bzlib lzma pic pthreads shared theora zlib libvorbis x264 gpl \
+ ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', 'mmal rpi sand vout-drm', d)} \
+ ${@bb.utils.contains('AVAILTUNES', 'mips32r2', 'mips32r2', '', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'opengl', 'opengl', '', d)} \
+ ${@bb.utils.contains('DISTRO_FEATURES', 'x11', 'xv xcb vout-egl epoxy', '', d)}"
+
+SRC_URI = "\
+ git://git@github.com/RPi-Distro/ffmpeg;protocol=https;branch=pios/bullseye \
+ file://0001-avcodec-arm-sbcenc-avoid-callee-preserved-vfp-regist.patch \
+ file://0002-Fix-build-on-powerpc-and-ppc64.patch \
+ file://0003-avcodec-pngenc-remove-monowhite-from-apng-formats.patch \
+ file://0004-ffmpeg-4.3.4-rpi_14.patch \
+ file://0005-fix-flags.diff \
+ file://2001-configure-setup-for-OE-core-usage.patch \
+ file://2002-libavdevice-opengl_enc-update-dynamic-function-loader.patch \
+ file://2003-libavcodec-fix-v4l2_req_devscan.patch \
+ file://2004-libavcodec-omx-replace-opt-vc-path-with-usr-lib.patch \
+ "
+
+SRCREV = "246e1a55a0eca931537d8706acd8b133c07beb05"
+
+S = "${WORKDIR}/git"
+
+# libraries to build in addition to avutil
+PACKAGECONFIG[avdevice] = "--enable-avdevice,--disable-avdevice"
+PACKAGECONFIG[avfilter] = "--enable-avfilter,--disable-avfilter"
+PACKAGECONFIG[avcodec] = "--enable-avcodec,--disable-avcodec"
+PACKAGECONFIG[avformat] = "--enable-avformat,--disable-avformat"
+PACKAGECONFIG[swresample] = "--enable-swresample,--disable-swresample"
+PACKAGECONFIG[swscale] = "--enable-swscale,--disable-swscale"
+PACKAGECONFIG[postproc] = "--enable-postproc,--disable-postproc"
+PACKAGECONFIG[avresample] = "--enable-avresample,--disable-avresample"
+
+# features to support
+PACKAGECONFIG[ffplay] = "--enable-ffplay,--disable-ffplay"
+PACKAGECONFIG[alsa] = "--enable-alsa,--disable-alsa,alsa-lib"
+PACKAGECONFIG[altivec] = "--enable-altivec,--disable-altivec,"
+PACKAGECONFIG[bzlib] = "--enable-bzlib,--disable-bzlib,bzip2"
+PACKAGECONFIG[fdk-aac] = "--enable-libfdk-aac --enable-nonfree,--disable-libfdk-aac,fdk-aac"
+PACKAGECONFIG[gpl] = "--enable-gpl,--disable-gpl"
+PACKAGECONFIG[opengl] = "--enable-opengl,--disable-opengl,virtual/libgles2"
+PACKAGECONFIG[gsm] = "--enable-libgsm,--disable-libgsm,libgsm"
+PACKAGECONFIG[jack] = "--enable-indev=jack,--disable-indev=jack,jack"
+PACKAGECONFIG[libvorbis] = "--enable-libvorbis,--disable-libvorbis,libvorbis"
+PACKAGECONFIG[libopus] = "--enable-libopus,--disable-libopus,libopus"
+PACKAGECONFIG[lzma] = "--enable-lzma,--disable-lzma,xz"
+PACKAGECONFIG[mfx] = "--enable-libmfx,--disable-libmfx,intel-mediasdk"
+PACKAGECONFIG[mp3lame] = "--enable-libmp3lame,--disable-libmp3lame,lame"
+PACKAGECONFIG[openssl] = "--enable-openssl,--disable-openssl,openssl"
+PACKAGECONFIG[sdl2] = "--enable-sdl2,--disable-sdl2,virtual/libsdl2"
+PACKAGECONFIG[speex] = "--enable-libspeex,--disable-libspeex,speex"
+PACKAGECONFIG[srt] = "--enable-libsrt,--disable-libsrt,srt"
+PACKAGECONFIG[theora] = "--enable-libtheora,--disable-libtheora,libtheora libogg"
+PACKAGECONFIG[vaapi] = "--enable-vaapi,--disable-vaapi,libva"
+PACKAGECONFIG[vdpau] = "--enable-vdpau,--disable-vdpau,libvdpau"
+PACKAGECONFIG[vpx] = "--enable-libvpx,--disable-libvpx,libvpx"
+PACKAGECONFIG[x264] = "--enable-libx264,--disable-libx264,x264"
+PACKAGECONFIG[xcb] = "--enable-libxcb,--disable-libxcb,libxcb"
+PACKAGECONFIG[xv] = "--enable-outdev=xv,--disable-outdev=xv,libxv"
+PACKAGECONFIG[zlib] = "--enable-zlib,--disable-zlib,zlib"
+PACKAGECONFIG[snappy] = "--enable-libsnappy,--disable-libsnappy,snappy"
+PACKAGECONFIG[udev] = "--enable-libudev,--disable-libudev,udev"
+PACKAGECONFIG[drm] = "--enable-libdrm,--disable-libdrm,libdrm"
+PACKAGECONFIG[epoxy] = "--enable-epoxy,--disable-epoxy,libepoxy"
+PACKAGECONFIG[v4l2] = "--enable-libv4l2 --enable-v4l2-m2m,,v4l-utils"
+PACKAGECONFIG[mmal] = "--enable-omx --enable-omx-rpi --enable-mmal,,userland"
+PACKAGECONFIG[sand] = "--enable-sand,,"
+PACKAGECONFIG[rpi] = "--enable-rpi,,"
+PACKAGECONFIG[vout-drm] = "--enable-vout-drm,,libdrm"
+PACKAGECONFIG[vout-egl] = "--enable-vout-egl,,virtual/egl"
+
+# other configuration options
+PACKAGECONFIG[mips32r2] = ",--disable-mipsdsp --disable-mipsdspr2"
+PACKAGECONFIG[pic] = "--enable-pic"
+PACKAGECONFIG[pthreads] = "--enable-pthreads,--disable-pthreads"
+PACKAGECONFIG[shared] = "--enable-shared"
+PACKAGECONFIG[strip] = ",--disable-stripping"
+
+# Check codecs that require --enable-nonfree
+USE_NONFREE = "${@bb.utils.contains_any('PACKAGECONFIG', [ 'openssl' ], 'yes', '', d)}"
+
+def cpu(d):
+ for arg in (d.getVar('TUNE_CCARGS') or '').split():
+ if arg.startswith('-mcpu='):
+ return arg[6:]
+ return 'generic'
+
+EXTRA_OECONF = " \
+ ${@bb.utils.contains('USE_NONFREE', 'yes', '--enable-nonfree', '', d)} \
+ \
+ --cross-prefix=${TARGET_PREFIX} \
+ \
+ --ld="${CCLD}" \
+ --cc="${CC}" \
+ --cxx="${CXX}" \
+ --arch=${TARGET_ARCH} \
+ --target-os="linux" \
+ --enable-cross-compile \
+ --extra-cflags="${CFLAGS} ${HOST_CC_ARCH}${TOOLCHAIN_OPTIONS}" \
+ --extra-ldflags="${LDFLAGS}" \
+ --sysroot="${STAGING_DIR_TARGET}" \
+ ${EXTRA_FFCONF} \
+ --libdir=${libdir} \
+ --shlibdir=${libdir} \
+ --datadir=${datadir}/ffmpeg \
+ --cpu=${@cpu(d)} \
+ --pkg-config=pkg-config \
+"
+EXTRA_OECONF:append:linux-gnux32 = " --disable-asm"
+
+# Some patches introduce assembly files which need preprocessing with
+# gcc, e.g. src/libavutil/aarch64/rpi_sand_neon.S
+TOOLCHAIN = "gcc"
+# gold crashes on x86; another solution is to --disable-asm, but that's more hacky
+# ld.gold: internal error in relocate_section, at ../../gold/i386.cc:3684
+LDFLAGS:append:x86 = "${@bb.utils.contains('DISTRO_FEATURES', 'ld-is-gold', ' -fuse-ld=bfd ', '', d)}"
+EXTRA_OEMAKE = "V=1"
+
+do_configure() {
+ ${S}/configure ${EXTRA_OECONF}
+}
+
+# patch out build host paths for reproducibility
+do_compile:prepend:class-target() {
+ sed -i -e "s,${WORKDIR},,g" ${B}/config.h
+}
+
+PACKAGES =+ "libavcodec \
+ libavdevice \
+ libavfilter \
+ libavformat \
+ libavresample \
+ libavutil \
+ libpostproc \
+ libswresample \
+ libswscale"
+
+FILES:${PN}:append = " /usr/share/ffmpeg"
+FILES:libavcodec = "${libdir}/libavcodec${SOLIBS}"
+FILES:libavdevice = "${libdir}/libavdevice${SOLIBS}"
+FILES:libavfilter = "${libdir}/libavfilter${SOLIBS}"
+FILES:libavformat = "${libdir}/libavformat${SOLIBS}"
+FILES:libavresample = "${libdir}/libavresample${SOLIBS}"
+FILES:libavutil = "${libdir}/libavutil${SOLIBS}"
+FILES:libpostproc = "${libdir}/libpostproc${SOLIBS}"
+FILES:libswresample = "${libdir}/libswresample${SOLIBS}"
+FILES:libswscale = "${libdir}/libswscale${SOLIBS}"
+# ffmpeg disables PIC on some platforms (e.g. x86-32)
+INSANE_SKIP:${MLPREFIX}libavcodec = "textrel"
+INSANE_SKIP:${MLPREFIX}libavdevice = "textrel"
+INSANE_SKIP:${MLPREFIX}libavfilter = "textrel"
+INSANE_SKIP:${MLPREFIX}libavformat = "textrel"
+INSANE_SKIP:${MLPREFIX}libavutil = "textrel"
+INSANE_SKIP:${MLPREFIX}libavresample = "textrel"
+INSANE_SKIP:${MLPREFIX}libswscale = "textrel"
+INSANE_SKIP:${MLPREFIX}libswresample = "textrel"
+INSANE_SKIP:${MLPREFIX}libpostproc = "textrel"
+
+# Only enable it for rpi class of machines
+COMPATIBLE_HOST = "null"
+COMPATIBLE_HOST:rpi = "(.*)"
+
diff --git a/recipes-multimedia/x264/x264_%.bbappend b/recipes-multimedia/x264/x264_%.bbappend
index 505719e..e0cfaf8 100644
--- a/recipes-multimedia/x264/x264_%.bbappend
+++ b/recipes-multimedia/x264/x264_%.bbappend
@@ -1,2 +1,2 @@
-EXTRA_OECONF_append_raspberrypi = " --disable-asm"
-EXTRA_OECONF_append_raspberrypi0-wifi = " --disable-asm"
+EXTRA_OECONF:append:raspberrypi = " --disable-asm"
+EXTRA_OECONF:append:raspberrypi0-wifi = " --disable-asm"
diff --git a/recipes-sato/libwpe_%.bbappend b/recipes-sato/libwpe_%.bbappend
new file mode 100644
index 0000000..fe1e59b
--- /dev/null
+++ b/recipes-sato/libwpe_%.bbappend
@@ -0,0 +1,2 @@
+# Work around a build issue with RPi userland EGL libraries.
+CFLAGS:append:rpi = " ${@bb.utils.contains('MACHINE_FEATURES', 'vc4graphics', '', '-D_GNU_SOURCE', d)}"
diff --git a/wic/sdimage-raspberrypi.wks b/wic/sdimage-raspberrypi.wks
index 01fbaea..bb41e0f 100644
--- a/wic/sdimage-raspberrypi.wks
+++ b/wic/sdimage-raspberrypi.wks
@@ -2,5 +2,5 @@
# long-description: Creates a partitioned SD card image for use with
# Raspberry Pi. Boot files are located in the first vfat partition.
-part /boot --source bootimg-partition --ondisk mmcblk0 --fstype=vfat --label boot --active --align 4096 --size 20
+part /boot --source bootimg-partition --ondisk mmcblk0 --fstype=vfat --label boot --active --align 4096 --size 100
part / --source rootfs --ondisk mmcblk0 --fstype=ext4 --label root --align 4096