Index: Mk/Uses/gecko.mk
===================================================================
--- Mk/Uses/gecko.mk
+++ Mk/Uses/gecko.mk
@@ -22,12 +22,12 @@
 .if ${gecko_ARGS:Mfirefox}
 _GECKO_DEFAULT_VERSION=	60
-_GECKO_VERSIONS=	60 66
+_GECKO_VERSIONS=	60 67
 _GECKO_TYPE=	firefox

 # Dependence lines for different Firefox versions
 60_DEPENDS=	${LOCALBASE}/lib/firefox/firefox:www/firefox-esr
-66_DEPENDS=	${LOCALBASE}/lib/firefox/firefox:www/firefox
+67_DEPENDS=	${LOCALBASE}/lib/firefox/firefox:www/firefox

 .if exists(${LOCALBASE}/bin/firefox)
 _GECKO_INSTALLED_VER!=	${PKG_QUERY} %v firefox firefox-esr

Index: Mk/bsd.gecko.mk
===================================================================
--- Mk/bsd.gecko.mk
+++ Mk/bsd.gecko.mk
@@ -356,7 +356,7 @@
 STRIP=	# ports/184285
 .else
 MOZ_OPTIONS+=	--disable-debug --disable-debug-symbols --enable-release
-. if ${MOZILLA_VER:R:R} >= 67 && (${ARCH:Maarch64} || ${MACHINE_CPU:Msse2})
+. if ${MOZILLA_VER:R:R} >= 68 && (${ARCH:Maarch64} || ${MACHINE_CPU:Msse2})
 MOZ_OPTIONS+=	--enable-rust-simd
 . endif
 .endif

Index: www/firefox-i18n/Makefile
===================================================================
--- www/firefox-i18n/Makefile
+++ www/firefox-i18n/Makefile
@@ -2,7 +2,7 @@
 # $FreeBSD$

 PORTNAME=	firefox-i18n
-PORTVERSION=	66.0.1
+PORTVERSION=	67.0b5
 CATEGORIES=	www
 MASTER_SITES=	MOZILLA/${PORTNAME:S|-i18n||}/releases/${DISTVERSION}/linux-x86_64/xpi \
 		MOZILLA/${PORTNAME:S|-i18n||}/candidates/${DISTVERSION}-candidates/build1/linux-x86_64/xpi
@@ -15,7 +15,7 @@

 EXTRACT_DEPENDS=	zip:archivers/zip

-USES=		zip:infozip gecko:firefox,66,build
+USES=		zip:infozip gecko:firefox,67,build
 USE_XPI=	firefox

 NO_ARCH=	yes

Index: www/firefox-i18n/distinfo
===================================================================
--- www/firefox-i18n/distinfo
+++ www/firefox-i18n/distinfo
@@ -1,199 +1,199 @@
-TIMESTAMP = 1553231190
-SHA256 (xpi/firefox-i18n-66.0.1/ach.xpi) = a560d034cbcce29fc80f04229eaad4755c9613b2af9f50c882c9222e992b0b12
-SIZE (xpi/firefox-i18n-66.0.1/ach.xpi) = 454628
-SHA256 (xpi/firefox-i18n-66.0.1/af.xpi) = 88dee557f290ca53c25318716586c504c8b9a42ad0378709b85daa06eee0f3be
-SIZE (xpi/firefox-i18n-66.0.1/af.xpi) = 447406
-SHA256 (xpi/firefox-i18n-66.0.1/an.xpi) = b20fdb6be0b49b58d46ead23a6a302b834d6da88e9eaeb9433ea55508a5bbca7
-SIZE (xpi/firefox-i18n-66.0.1/an.xpi) = 472356
-SHA256 (xpi/firefox-i18n-66.0.1/ar.xpi) = 15cd8b0e6fb56ea895ee513abc0c4e986d1301abe8938afc75b46086f4213d8c
-SIZE (xpi/firefox-i18n-66.0.1/ar.xpi) = 516885
-SHA256 (xpi/firefox-i18n-66.0.1/as.xpi) = 924cf4a22b76bb7449d3551952662f796f91a30ab257aedea5370d8f86064ea7
-SIZE (xpi/firefox-i18n-66.0.1/as.xpi) = 479965
-SHA256 (xpi/firefox-i18n-66.0.1/ast.xpi) = 76fc2c3a5f55fb47fd8e69b7a6d0fd7cd099cc9c0851a01a8a11f7d2de9cea4d
-SIZE (xpi/firefox-i18n-66.0.1/ast.xpi) = 471754
-SHA256 (xpi/firefox-i18n-66.0.1/az.xpi) = b588ed21b5e28cbf55e43e16b7cbce53f5783c7029f17f7a2b23f8d507dfb0e9
-SIZE (xpi/firefox-i18n-66.0.1/az.xpi) = 494096
-SHA256 (xpi/firefox-i18n-66.0.1/be.xpi) = 1f7d362f8b1f86b81972d35877ff7119e4ecc713898149723dffbb47dcb4cdc4
-SIZE (xpi/firefox-i18n-66.0.1/be.xpi) = 546377
-SHA256 (xpi/firefox-i18n-66.0.1/bg.xpi) = e102393b4ab2501d400d3c0d410df365d9a74a898a6bdd5e9443be63a4efae5c
-SIZE (xpi/firefox-i18n-66.0.1/bg.xpi) = 527810
-SHA256 (xpi/firefox-i18n-66.0.1/bn-BD.xpi) = 28dc01ebbb5c5277360b836d1dc7db6bec0b4b84a4d215ab8f72db5f4509b0c7
-SIZE (xpi/firefox-i18n-66.0.1/bn-BD.xpi) = 547084
-SHA256 (xpi/firefox-i18n-66.0.1/bn-IN.xpi) = 5b430a666fc4e3d073f94cb0d507b31fb981059a64f3b89f542b4516ddb07921
-SIZE (xpi/firefox-i18n-66.0.1/bn-IN.xpi) = 537452 -SHA256 (xpi/firefox-i18n-66.0.1/br.xpi) = abf4b914c042113791580e4b661c96f7daaf41408ece69786e66556dfe8102d6 -SIZE (xpi/firefox-i18n-66.0.1/br.xpi) = 476665 -SHA256 (xpi/firefox-i18n-66.0.1/bs.xpi) = 60101d16dd0eec9d742165ed08f648b219f37ce8562d9ebb1030dfccdcda37b9 -SIZE (xpi/firefox-i18n-66.0.1/bs.xpi) = 469378 -SHA256 (xpi/firefox-i18n-66.0.1/ca.xpi) = 4e90eb06299a978b87893c71f28f7a678dde12b729b4e661a162341316854afc -SIZE (xpi/firefox-i18n-66.0.1/ca.xpi) = 477758 -SHA256 (xpi/firefox-i18n-66.0.1/cak.xpi) = 93e16948ded48f0abdbd28e16e496632f4dc8d9ebb0780d487e4d807b449177f -SIZE (xpi/firefox-i18n-66.0.1/cak.xpi) = 500359 -SHA256 (xpi/firefox-i18n-66.0.1/cs.xpi) = e2d007fb7acada2bc75fde170466bab528b675d953786beba0e4fea348ba00fe -SIZE (xpi/firefox-i18n-66.0.1/cs.xpi) = 490116 -SHA256 (xpi/firefox-i18n-66.0.1/cy.xpi) = 4d586f15ead5c2d61023293a7634b2d799edd3dd5eac02c55d6b1392066b0eb7 -SIZE (xpi/firefox-i18n-66.0.1/cy.xpi) = 479446 -SHA256 (xpi/firefox-i18n-66.0.1/da.xpi) = ca1ada6b301ecbc85a0b22a28fe2bccb1eae24ab5b0a6aa83865c52286247b15 -SIZE (xpi/firefox-i18n-66.0.1/da.xpi) = 489566 -SHA256 (xpi/firefox-i18n-66.0.1/de.xpi) = 3c62ab2e8492068ca499601e8ef663ff82db8d21c45363ebda11f4caa3c93958 -SIZE (xpi/firefox-i18n-66.0.1/de.xpi) = 488912 -SHA256 (xpi/firefox-i18n-66.0.1/dsb.xpi) = 4fc3bc5e7a4d9aa992b35a7366a381fc0fa29c7d25f0d6266f31bbfdfe4c96c1 -SIZE (xpi/firefox-i18n-66.0.1/dsb.xpi) = 499048 -SHA256 (xpi/firefox-i18n-66.0.1/el.xpi) = 67fe51ac80cd99714af0e4f5b48f4350e482ee4f0861e2ba5c8c0d283a9271f0 -SIZE (xpi/firefox-i18n-66.0.1/el.xpi) = 553957 -SHA256 (xpi/firefox-i18n-66.0.1/en-CA.xpi) = 346b1a175e89f944054d2d55232b723b4e0df0dd04d8710fb6b8c06f262965a1 -SIZE (xpi/firefox-i18n-66.0.1/en-CA.xpi) = 451460 -SHA256 (xpi/firefox-i18n-66.0.1/en-GB.xpi) = 8ecb7ef390a457c1972ee64add2e2f72ea2887070672f875da7f0414cb7dee44 -SIZE (xpi/firefox-i18n-66.0.1/en-GB.xpi) = 451500 -SHA256 (xpi/firefox-i18n-66.0.1/en-US.xpi) = 2071ebd3d577c9277748993c0700572b1fcbf21064f7bc7edcefb6a7ca499e8f -SIZE (xpi/firefox-i18n-66.0.1/en-US.xpi) = 422551 -SHA256 (xpi/firefox-i18n-66.0.1/en-ZA.xpi) = df850f8d2d356935bd0841cedbee9691092613a70e63e6f465328466d83ab85e -SIZE (xpi/firefox-i18n-66.0.1/en-ZA.xpi) = 430515 -SHA256 (xpi/firefox-i18n-66.0.1/eo.xpi) = d15e352228030031454b97ce4747375f727e899e5fd3b9f7458ec8de4a18ea14 -SIZE (xpi/firefox-i18n-66.0.1/eo.xpi) = 477839 -SHA256 (xpi/firefox-i18n-66.0.1/es-AR.xpi) = 04279eb87e8848c19c765f651df4c462f81c115bd13d5840035554386c3b4fc1 -SIZE (xpi/firefox-i18n-66.0.1/es-AR.xpi) = 486158 -SHA256 (xpi/firefox-i18n-66.0.1/es-CL.xpi) = ab39c252308b480daee669fa14ccdaccaf9a1d24085b087f409fbdb18ce7cfac -SIZE (xpi/firefox-i18n-66.0.1/es-CL.xpi) = 482902 -SHA256 (xpi/firefox-i18n-66.0.1/es-ES.xpi) = a685c79d92157c89d48c9f74160060acc1437b7307ea03f00e5f827dc0fb19ba -SIZE (xpi/firefox-i18n-66.0.1/es-ES.xpi) = 446824 -SHA256 (xpi/firefox-i18n-66.0.1/es-MX.xpi) = faa189f20af654bf1e1c53d9d31c2b9f8dc7dbf67882ada1033d249cb4b1cddd -SIZE (xpi/firefox-i18n-66.0.1/es-MX.xpi) = 487850 -SHA256 (xpi/firefox-i18n-66.0.1/et.xpi) = 41a97650489e2e9342c6dbf4c290e859d971f7fd72756afbcd0446b0ccd9217c -SIZE (xpi/firefox-i18n-66.0.1/et.xpi) = 468370 -SHA256 (xpi/firefox-i18n-66.0.1/eu.xpi) = c76846f94a2816f64a6b2fb826f94286b355b985b415d4970fa0b5b30fab7954 -SIZE (xpi/firefox-i18n-66.0.1/eu.xpi) = 476876 -SHA256 (xpi/firefox-i18n-66.0.1/fa.xpi) = 2e4952027a922d2f0c4d18ac236102cfeb6752d4590f6518cf129ed6da209dd5 -SIZE (xpi/firefox-i18n-66.0.1/fa.xpi) = 528479 -SHA256 
(xpi/firefox-i18n-66.0.1/ff.xpi) = 9a60c2fc3f0563a7e61db9fe008dcfe4124b218d454fb37f539bee84abff1e75 -SIZE (xpi/firefox-i18n-66.0.1/ff.xpi) = 471588 -SHA256 (xpi/firefox-i18n-66.0.1/fi.xpi) = 078338e10c4c1bab56d6890530379fc9a60a53fcf8367c8d75ded0b1063cd4a8 -SIZE (xpi/firefox-i18n-66.0.1/fi.xpi) = 466498 -SHA256 (xpi/firefox-i18n-66.0.1/fr.xpi) = aed2de4d7f50ad68c6d59519daea01686eb3afc4759dedc858583d2c3914e60c -SIZE (xpi/firefox-i18n-66.0.1/fr.xpi) = 492347 -SHA256 (xpi/firefox-i18n-66.0.1/fy-NL.xpi) = aa2ecddcbf59b5e43e14b9e70c30ebe7838d5d257c012ca09046bf75d08883eb -SIZE (xpi/firefox-i18n-66.0.1/fy-NL.xpi) = 483161 -SHA256 (xpi/firefox-i18n-66.0.1/ga-IE.xpi) = 56143e298af8550b040a5d5eebb76229b1b0afaa85da4192a750089ec9884f93 -SIZE (xpi/firefox-i18n-66.0.1/ga-IE.xpi) = 476908 -SHA256 (xpi/firefox-i18n-66.0.1/gd.xpi) = db9659fae23fc47b9acac6db7fa2f567a903b9193cd20024edb40033a08dfdc0 -SIZE (xpi/firefox-i18n-66.0.1/gd.xpi) = 486090 -SHA256 (xpi/firefox-i18n-66.0.1/gl.xpi) = f7ac1e43ffb3ebe64a5eac609064f53e1fc034c4c5a9fdfcf0ff15d613bc570f -SIZE (xpi/firefox-i18n-66.0.1/gl.xpi) = 470770 -SHA256 (xpi/firefox-i18n-66.0.1/gn.xpi) = 320c3740b2a7292ae67831d288372261cc458eb3da82c0799e09d6aa2f2e0044 -SIZE (xpi/firefox-i18n-66.0.1/gn.xpi) = 492859 -SHA256 (xpi/firefox-i18n-66.0.1/gu-IN.xpi) = 1ab314f62b2a9a0d6801b6de5a57558b90638c0eac0347c5a2bc79fd0f76fda4 -SIZE (xpi/firefox-i18n-66.0.1/gu-IN.xpi) = 552724 -SHA256 (xpi/firefox-i18n-66.0.1/he.xpi) = 3bd60fe38a1f016156c3ab7512c19391ecd454d8f125b4c35b7db21e88f9d074 -SIZE (xpi/firefox-i18n-66.0.1/he.xpi) = 491409 -SHA256 (xpi/firefox-i18n-66.0.1/hi-IN.xpi) = 6622fbb8956951c620c4ad1c4ba5379b230b974bf2ab28cfadb2f9f7563bd0ab -SIZE (xpi/firefox-i18n-66.0.1/hi-IN.xpi) = 538513 -SHA256 (xpi/firefox-i18n-66.0.1/hr.xpi) = 1df04a3b16dc96cecccb4652d2b19155a1ec6505ef51ff4bc6fab88134a49ca8 -SIZE (xpi/firefox-i18n-66.0.1/hr.xpi) = 475787 -SHA256 (xpi/firefox-i18n-66.0.1/hsb.xpi) = 2392bcf1ed038a84b70dc78f9ab81ca4ee3b9b7c981b33dfe438f5b17c390952 -SIZE (xpi/firefox-i18n-66.0.1/hsb.xpi) = 498551 -SHA256 (xpi/firefox-i18n-66.0.1/hu.xpi) = 15338ffb8d7082d7242d99a73ce749f759f24d67df8a8e19ac772dcd4247a9bb -SIZE (xpi/firefox-i18n-66.0.1/hu.xpi) = 495665 -SHA256 (xpi/firefox-i18n-66.0.1/hy-AM.xpi) = 3e04628da3f544cf9d40f2c97a3feabdbd489ad8583d23a4c8687fdec57928b4 -SIZE (xpi/firefox-i18n-66.0.1/hy-AM.xpi) = 520694 -SHA256 (xpi/firefox-i18n-66.0.1/ia.xpi) = 868b1f1d51aa204e1b6280b5e6e6c03a67134d4a9ec648d517a7156edbba74fe -SIZE (xpi/firefox-i18n-66.0.1/ia.xpi) = 472789 -SHA256 (xpi/firefox-i18n-66.0.1/id.xpi) = dc59ce70eed64763cc223f91a332fd9810dad0081c37319225a002eb6211f26a -SIZE (xpi/firefox-i18n-66.0.1/id.xpi) = 465875 -SHA256 (xpi/firefox-i18n-66.0.1/is.xpi) = 6c00eb006c1655338730cdced13610bf41e0290ab3fcade4fd13215058e0de16 -SIZE (xpi/firefox-i18n-66.0.1/is.xpi) = 477149 -SHA256 (xpi/firefox-i18n-66.0.1/it.xpi) = 2b7ece363b25f1996532ff736c86beedfd4540f399b887c19652bd37b3b88a99 -SIZE (xpi/firefox-i18n-66.0.1/it.xpi) = 365175 -SHA256 (xpi/firefox-i18n-66.0.1/ja.xpi) = f4828b495f8c759eae589fdb41e4ac2b9754ea4aff9c9001cd53d99a4dc44f7d -SIZE (xpi/firefox-i18n-66.0.1/ja.xpi) = 506345 -SHA256 (xpi/firefox-i18n-66.0.1/ka.xpi) = 0835f3325ac978e0f0d1edf2e1fdc911aeb500e152612d67aa03fc212157653b -SIZE (xpi/firefox-i18n-66.0.1/ka.xpi) = 517439 -SHA256 (xpi/firefox-i18n-66.0.1/kab.xpi) = 9fef60374093c330802d78f92840ed88d46d742b58ae09751f13b2c3dc0d49dc -SIZE (xpi/firefox-i18n-66.0.1/kab.xpi) = 488671 -SHA256 (xpi/firefox-i18n-66.0.1/kk.xpi) = 
7e58e76d0ba621d8d330376d3e4c9677328914597cb11d0e9bb53677b889fbda -SIZE (xpi/firefox-i18n-66.0.1/kk.xpi) = 543750 -SHA256 (xpi/firefox-i18n-66.0.1/km.xpi) = 063198be88f0a99cc79449e625d427d3a509c7780f1aab0626849756a89a64a8 -SIZE (xpi/firefox-i18n-66.0.1/km.xpi) = 534615 -SHA256 (xpi/firefox-i18n-66.0.1/kn.xpi) = a61bd0aef75ea4f0740106c757c73e8d78ec268273c9cca0c3c1d4e691210871 -SIZE (xpi/firefox-i18n-66.0.1/kn.xpi) = 534283 -SHA256 (xpi/firefox-i18n-66.0.1/ko.xpi) = 06806fba771de785e013434b957c170e08cec9179f3309680e7206a1b00443d2 -SIZE (xpi/firefox-i18n-66.0.1/ko.xpi) = 499376 -SHA256 (xpi/firefox-i18n-66.0.1/lij.xpi) = 17e7581d72539e1319b7d87ec2c650ea4343318eb0e2aa9ffc1a496a70fb3fe9 -SIZE (xpi/firefox-i18n-66.0.1/lij.xpi) = 472488 -SHA256 (xpi/firefox-i18n-66.0.1/lt.xpi) = de50aec410e7a0209b99c75d37c4575deb0527bfae5e3ea08150be638f5d0437 -SIZE (xpi/firefox-i18n-66.0.1/lt.xpi) = 498671 -SHA256 (xpi/firefox-i18n-66.0.1/lv.xpi) = 32b7542a61f87136d46f82c49d028846461835fe63e63a9b287a7f388e7bef3e -SIZE (xpi/firefox-i18n-66.0.1/lv.xpi) = 487166 -SHA256 (xpi/firefox-i18n-66.0.1/mai.xpi) = ef73946c7b5cf59336d18f49b1f98a572eee3f6371e36445425217379be6e06c -SIZE (xpi/firefox-i18n-66.0.1/mai.xpi) = 496836 -SHA256 (xpi/firefox-i18n-66.0.1/mk.xpi) = a1ced949c8936f5d3410bb09243ed036d9312636594526cb2391922af4c29b5d -SIZE (xpi/firefox-i18n-66.0.1/mk.xpi) = 474269 -SHA256 (xpi/firefox-i18n-66.0.1/ml.xpi) = 8a3ea36e643992cf6be0ae843fcf10244ce3632e8894d78360c0ed6147381451 -SIZE (xpi/firefox-i18n-66.0.1/ml.xpi) = 542465 -SHA256 (xpi/firefox-i18n-66.0.1/mr.xpi) = 870fbbf76bcd1964dfe1ba7b7b09d8220781fcfbac712cd4b81fe64ff2891b08 -SIZE (xpi/firefox-i18n-66.0.1/mr.xpi) = 533902 -SHA256 (xpi/firefox-i18n-66.0.1/ms.xpi) = de67289f6fd989a056c7f6ec66d4557d149955ee913d2c6fdeb4291ff755a1b8 -SIZE (xpi/firefox-i18n-66.0.1/ms.xpi) = 467764 -SHA256 (xpi/firefox-i18n-66.0.1/my.xpi) = 18876f36aa147b32fc3fa7eaf25308649cb911f0f1498d743370a3df5586d78f -SIZE (xpi/firefox-i18n-66.0.1/my.xpi) = 539900 -SHA256 (xpi/firefox-i18n-66.0.1/nb-NO.xpi) = aa4f42ad06ccfddc38bef48cac6bf4193686f8ad77cb8d7df3c96072e7400025 -SIZE (xpi/firefox-i18n-66.0.1/nb-NO.xpi) = 470494 -SHA256 (xpi/firefox-i18n-66.0.1/ne-NP.xpi) = a21cd5b5f1aa87ecac030c4a4416379f513e2b18dbf59edf3c4d49a748c9a74b -SIZE (xpi/firefox-i18n-66.0.1/ne-NP.xpi) = 511133 -SHA256 (xpi/firefox-i18n-66.0.1/nl.xpi) = 087adbeccb912afa732060737ff6fc5451c7f700cd1ade55df3138c43096db53 -SIZE (xpi/firefox-i18n-66.0.1/nl.xpi) = 463128 -SHA256 (xpi/firefox-i18n-66.0.1/nn-NO.xpi) = ac5915e690ea42179f89bbf4f8b85e061f8ff3e55fa1b313bd202b2692f90c1f -SIZE (xpi/firefox-i18n-66.0.1/nn-NO.xpi) = 469590 -SHA256 (xpi/firefox-i18n-66.0.1/oc.xpi) = 5020bd1934780ca0678009f731806fc69cbba1cf3298491cffd393cd94564ca0 -SIZE (xpi/firefox-i18n-66.0.1/oc.xpi) = 483097 -SHA256 (xpi/firefox-i18n-66.0.1/or.xpi) = bd8973f610a2ca0487be1c51b72aef7dd7bdcd31f70752f3c18d0d7fdcc5e06a -SIZE (xpi/firefox-i18n-66.0.1/or.xpi) = 486629 -SHA256 (xpi/firefox-i18n-66.0.1/pa-IN.xpi) = 3c552e0834191c7cd8b52ded75cdd1a0a36cc9cd29e965bd9e12488b1de744f9 -SIZE (xpi/firefox-i18n-66.0.1/pa-IN.xpi) = 521431 -SHA256 (xpi/firefox-i18n-66.0.1/pl.xpi) = 0222f578bcb4e789f6f6fb316cb86254acae3005a94ec4b1ebb0c1ed45a01a59 -SIZE (xpi/firefox-i18n-66.0.1/pl.xpi) = 454401 -SHA256 (xpi/firefox-i18n-66.0.1/pt-BR.xpi) = 40b5facc71e0977984c2b18a5b3bf900a0dd374759d883a6a256b0621690db8e -SIZE (xpi/firefox-i18n-66.0.1/pt-BR.xpi) = 472204 -SHA256 (xpi/firefox-i18n-66.0.1/pt-PT.xpi) = 6eb09307b20aa549d93c0332f88e55c991338c0a4fd579309a65ee26eda37f5b -SIZE 
(xpi/firefox-i18n-66.0.1/pt-PT.xpi) = 482242 -SHA256 (xpi/firefox-i18n-66.0.1/rm.xpi) = 1a26257cf9731e58925adbe272634c255dc07cbb4eccb039ca29613b668e951f -SIZE (xpi/firefox-i18n-66.0.1/rm.xpi) = 475006 -SHA256 (xpi/firefox-i18n-66.0.1/ro.xpi) = 710a746127931271f2865ee564a3685d28ef2dfe0f83e043a8529057e09ce834 -SIZE (xpi/firefox-i18n-66.0.1/ro.xpi) = 481033 -SHA256 (xpi/firefox-i18n-66.0.1/ru.xpi) = 75eac3ce101b903bc8d81e226f84e3dc1f6f906b59c1e26d06cb163acd0fff11 -SIZE (xpi/firefox-i18n-66.0.1/ru.xpi) = 551343 -SHA256 (xpi/firefox-i18n-66.0.1/si.xpi) = 60000ae7f36dfbe0ae6851b7c63264f9ee98280473e98b7a48f4501543e9f891 -SIZE (xpi/firefox-i18n-66.0.1/si.xpi) = 513782 -SHA256 (xpi/firefox-i18n-66.0.1/sk.xpi) = aad3fe09938af2777399766459faeffb18c29fa584bb04433a7d9f8f06f35f77 -SIZE (xpi/firefox-i18n-66.0.1/sk.xpi) = 500049 -SHA256 (xpi/firefox-i18n-66.0.1/sl.xpi) = 78adaef48eeb26a57f3887f61b149c0ee46f40b58be3543221b4d78ab8892593 -SIZE (xpi/firefox-i18n-66.0.1/sl.xpi) = 475587 -SHA256 (xpi/firefox-i18n-66.0.1/son.xpi) = 897a71d22d074ae5a4fd1a74cb223668d40feb2299748191650bef280a16fb89 -SIZE (xpi/firefox-i18n-66.0.1/son.xpi) = 451843 -SHA256 (xpi/firefox-i18n-66.0.1/sq.xpi) = 81873eb4ef5dec7b995f02132b55c7c6b337346745ea8032798da585265fb737 -SIZE (xpi/firefox-i18n-66.0.1/sq.xpi) = 491733 -SHA256 (xpi/firefox-i18n-66.0.1/sr.xpi) = 52cea818b337254788ba51141e2630bbc7babacc8c5ea12ca0d88ceb34c24d65 -SIZE (xpi/firefox-i18n-66.0.1/sr.xpi) = 513411 -SHA256 (xpi/firefox-i18n-66.0.1/sv-SE.xpi) = e546d7ff02fe6ad6656545222f78e2ee1eb25bd052bc5ba4ab7b2689049135a9 -SIZE (xpi/firefox-i18n-66.0.1/sv-SE.xpi) = 479573 -SHA256 (xpi/firefox-i18n-66.0.1/ta.xpi) = 22aac38c33987e3cfc8fdca2b563a0ddb3d5dfc18e2e941c9a62d4b36852ace0 -SIZE (xpi/firefox-i18n-66.0.1/ta.xpi) = 534135 -SHA256 (xpi/firefox-i18n-66.0.1/te.xpi) = 36a09137665e6a59729897f82dfefcd46d1a8ab9cd2fc106edd2e14a9f9f998f -SIZE (xpi/firefox-i18n-66.0.1/te.xpi) = 540399 -SHA256 (xpi/firefox-i18n-66.0.1/th.xpi) = 79477c65dc5351f9ab617b603b1de72d5779b3dbb5373632bbe82405b4a2c180 -SIZE (xpi/firefox-i18n-66.0.1/th.xpi) = 518910 -SHA256 (xpi/firefox-i18n-66.0.1/tr.xpi) = 44a7eb2b4ed0a6387d9cd3702c51e9331882b02d46623f55e37f57730640c4c8 -SIZE (xpi/firefox-i18n-66.0.1/tr.xpi) = 489584 -SHA256 (xpi/firefox-i18n-66.0.1/uk.xpi) = 70994b10619b59071107c2f231ef7a8025c15c48523a66f8c3414ac3fa38887e -SIZE (xpi/firefox-i18n-66.0.1/uk.xpi) = 530132 -SHA256 (xpi/firefox-i18n-66.0.1/ur.xpi) = cbf845cb88576686a09b5ea99151dd89ed3639b50668609fdda7bbb243828be5 -SIZE (xpi/firefox-i18n-66.0.1/ur.xpi) = 519002 -SHA256 (xpi/firefox-i18n-66.0.1/uz.xpi) = adefa560ee37c53f593fc014a2136c37a1285544c59f30456feef6b81e934bc3 -SIZE (xpi/firefox-i18n-66.0.1/uz.xpi) = 471553 -SHA256 (xpi/firefox-i18n-66.0.1/vi.xpi) = 07b19bee5c4fc6555bf4951cc02dd8ee53f93c1d52ca57c2d6196eead82515bf -SIZE (xpi/firefox-i18n-66.0.1/vi.xpi) = 495119 -SHA256 (xpi/firefox-i18n-66.0.1/xh.xpi) = 4ed78ae348f4a228fc3a7f7b69d7e75861627fd146c344b901692a471874628a -SIZE (xpi/firefox-i18n-66.0.1/xh.xpi) = 463578 -SHA256 (xpi/firefox-i18n-66.0.1/zh-CN.xpi) = 152f68c466d0099b7ea7754b4c984e81c3629ea6e20e9475d6c3da1be81e6003 -SIZE (xpi/firefox-i18n-66.0.1/zh-CN.xpi) = 502889 -SHA256 (xpi/firefox-i18n-66.0.1/zh-TW.xpi) = 0172ea1cf2ffc935256b7b42f99f3d6d9a1d8b92296cba83774d8a277fd949dc -SIZE (xpi/firefox-i18n-66.0.1/zh-TW.xpi) = 500702 +TIMESTAMP = 1553532864 +SHA256 (xpi/firefox-i18n-67.0b5/ach.xpi) = bd5646174b28986b11caf9dabb1d86099d57eb4fd0443f42ec4def307fed379e +SIZE (xpi/firefox-i18n-67.0b5/ach.xpi) = 452085 +SHA256 
(xpi/firefox-i18n-67.0b5/af.xpi) = 1fa615b29afd3f0d68f2295ee5b7d1e58f109bcca568aa70c04171ca3c6a3d12 +SIZE (xpi/firefox-i18n-67.0b5/af.xpi) = 441359 +SHA256 (xpi/firefox-i18n-67.0b5/an.xpi) = 093f11160b8de8dbebbcab01018e7eead51a7e36ae68b52911c2ea80427265af +SIZE (xpi/firefox-i18n-67.0b5/an.xpi) = 468578 +SHA256 (xpi/firefox-i18n-67.0b5/ar.xpi) = 7412571ac01394e13b63178dc8bab17d3d8a40ae5245df7f72aca1a9d1e71a5d +SIZE (xpi/firefox-i18n-67.0b5/ar.xpi) = 513448 +SHA256 (xpi/firefox-i18n-67.0b5/as.xpi) = 8eb6f7d7a4aec65b8f47f102f3797ef2ff373c9bc215ac201f9420e2e1a10fc0 +SIZE (xpi/firefox-i18n-67.0b5/as.xpi) = 474594 +SHA256 (xpi/firefox-i18n-67.0b5/ast.xpi) = ac4517d13c59904f6efe4850e1a995bf76cf0a0d05ff11c41bc80e2637089f22 +SIZE (xpi/firefox-i18n-67.0b5/ast.xpi) = 467789 +SHA256 (xpi/firefox-i18n-67.0b5/az.xpi) = 036a3ca92e6526529e86d0ed756b6974667374b5580f5dcaf8d8e3b1b71ce2e3 +SIZE (xpi/firefox-i18n-67.0b5/az.xpi) = 489604 +SHA256 (xpi/firefox-i18n-67.0b5/be.xpi) = b8d0365824507723d6c57fbbbaa5ebcfe2fb0a9836d7f6db9e32c445b67226e9 +SIZE (xpi/firefox-i18n-67.0b5/be.xpi) = 539673 +SHA256 (xpi/firefox-i18n-67.0b5/bg.xpi) = 9621b1e8254afab12d1c21f50481c3b67c7af405584155f45545107c9cdb6434 +SIZE (xpi/firefox-i18n-67.0b5/bg.xpi) = 524424 +SHA256 (xpi/firefox-i18n-67.0b5/bn-BD.xpi) = 656b2af77085601951f8dda01d939ce435ec9e0d1044e3014d6d083555cfc221 +SIZE (xpi/firefox-i18n-67.0b5/bn-BD.xpi) = 541631 +SHA256 (xpi/firefox-i18n-67.0b5/bn-IN.xpi) = 0907f227fd6da6af67bb3adea7aa1ac8227463b5952d472d9290dab4f532494d +SIZE (xpi/firefox-i18n-67.0b5/bn-IN.xpi) = 533000 +SHA256 (xpi/firefox-i18n-67.0b5/br.xpi) = 5a3d6ce0b1d0ddd34ff2a35a5e3b8d9d9e8e0d6202af0fce4122a7f78d73acf2 +SIZE (xpi/firefox-i18n-67.0b5/br.xpi) = 472517 +SHA256 (xpi/firefox-i18n-67.0b5/bs.xpi) = 37a4eb5c98f0c0d1423037e0017e70020957dc2b1119370a374619dc7060691c +SIZE (xpi/firefox-i18n-67.0b5/bs.xpi) = 465669 +SHA256 (xpi/firefox-i18n-67.0b5/ca.xpi) = d0d1f55562c94f13f272c57799fbaf66354df40a82652b6755cef0e0d068e007 +SIZE (xpi/firefox-i18n-67.0b5/ca.xpi) = 475199 +SHA256 (xpi/firefox-i18n-67.0b5/cak.xpi) = d3f7523a4a757907a371a2df56484d164108c6cc221c6a8810aab4c2eb07f200 +SIZE (xpi/firefox-i18n-67.0b5/cak.xpi) = 496777 +SHA256 (xpi/firefox-i18n-67.0b5/cs.xpi) = 50493a02c904fef3e60155c4ca64bf5e6c4ffd7b9bb4c85399c80f59a34c87f9 +SIZE (xpi/firefox-i18n-67.0b5/cs.xpi) = 486172 +SHA256 (xpi/firefox-i18n-67.0b5/cy.xpi) = 93153d57661a8ff83efa4265ddf81d74ed68943ed59b42b8c941344a365ec9cd +SIZE (xpi/firefox-i18n-67.0b5/cy.xpi) = 479405 +SHA256 (xpi/firefox-i18n-67.0b5/da.xpi) = 00296559f064cf64104cf61df078061c4bee39f269a4f78ec197b7c2d5b806ff +SIZE (xpi/firefox-i18n-67.0b5/da.xpi) = 484642 +SHA256 (xpi/firefox-i18n-67.0b5/de.xpi) = 3031350237a9e73d18e9d4fe7105ff31d0d0236cb61cb866c659bb2beb34579f +SIZE (xpi/firefox-i18n-67.0b5/de.xpi) = 486230 +SHA256 (xpi/firefox-i18n-67.0b5/dsb.xpi) = d1b298db9688b0f8a8ffa8f7f23b6a587078b644b25f59d9b656b22162aa39f4 +SIZE (xpi/firefox-i18n-67.0b5/dsb.xpi) = 493502 +SHA256 (xpi/firefox-i18n-67.0b5/el.xpi) = b84f4a906127efb4ae9d78fc8c989714f5f848332e3c49d92e694b4333e29c37 +SIZE (xpi/firefox-i18n-67.0b5/el.xpi) = 550051 +SHA256 (xpi/firefox-i18n-67.0b5/en-CA.xpi) = a480870c51ec7a46764284aa97747798ce31255b246f81748c5a2702e495e87d +SIZE (xpi/firefox-i18n-67.0b5/en-CA.xpi) = 451358 +SHA256 (xpi/firefox-i18n-67.0b5/en-GB.xpi) = fee1a11aff56656f517e1618b513349a56fda6a96db11e7e0d358a772666494d +SIZE (xpi/firefox-i18n-67.0b5/en-GB.xpi) = 446655 +SHA256 (xpi/firefox-i18n-67.0b5/en-US.xpi) = 
0bbf4a1998ec22a189ce3c7c465d898ef2f664620bb3b1b0282bdfd7bfcb93f2 +SIZE (xpi/firefox-i18n-67.0b5/en-US.xpi) = 425741 +SHA256 (xpi/firefox-i18n-67.0b5/en-ZA.xpi) = 86e38c333ac8b3a98f4b4b43723d8f714f6ec66fee90e579b529183b1b84a783 +SIZE (xpi/firefox-i18n-67.0b5/en-ZA.xpi) = 428510 +SHA256 (xpi/firefox-i18n-67.0b5/eo.xpi) = 77fade2c84d0786011b99e3b580839e610d3d40bdf276ca075510daff3165885 +SIZE (xpi/firefox-i18n-67.0b5/eo.xpi) = 472439 +SHA256 (xpi/firefox-i18n-67.0b5/es-AR.xpi) = 785e7c45552c34c679f87ea7f9ed740b0948269e11a1e6190b097d892b959826 +SIZE (xpi/firefox-i18n-67.0b5/es-AR.xpi) = 484093 +SHA256 (xpi/firefox-i18n-67.0b5/es-CL.xpi) = b8ab5ce17ae2d6019ddfb82778cd21f9b608e7685f10ac39e2992c887ef158e8 +SIZE (xpi/firefox-i18n-67.0b5/es-CL.xpi) = 479263 +SHA256 (xpi/firefox-i18n-67.0b5/es-ES.xpi) = b54fa9aad140a72e92cd622ab4d4892b07b62a4fe42a39b1bd3cd7b35d582373 +SIZE (xpi/firefox-i18n-67.0b5/es-ES.xpi) = 448615 +SHA256 (xpi/firefox-i18n-67.0b5/es-MX.xpi) = 7f7e8bc5fab155da3b89b958bfb20e89a49fea619013c4dad341960c42d7681c +SIZE (xpi/firefox-i18n-67.0b5/es-MX.xpi) = 484691 +SHA256 (xpi/firefox-i18n-67.0b5/et.xpi) = a45c64c4676ba94f3eae93a722a1256f99bf56ef8f1720f1935522888dbbd649 +SIZE (xpi/firefox-i18n-67.0b5/et.xpi) = 465016 +SHA256 (xpi/firefox-i18n-67.0b5/eu.xpi) = 053ecb264416c89cbe60879680f9621a051752de31e4653f7d985e7cce5576c4 +SIZE (xpi/firefox-i18n-67.0b5/eu.xpi) = 471613 +SHA256 (xpi/firefox-i18n-67.0b5/fa.xpi) = 087e684d139ca6c428f8fb3c1cd482dbec3b3b96efc062fd00d9bc2a91ea3462 +SIZE (xpi/firefox-i18n-67.0b5/fa.xpi) = 523997 +SHA256 (xpi/firefox-i18n-67.0b5/ff.xpi) = 3b1b4bd6cda781778db5ff36288641ffc46155c5cd5463e7b0efd534acec04e4 +SIZE (xpi/firefox-i18n-67.0b5/ff.xpi) = 468331 +SHA256 (xpi/firefox-i18n-67.0b5/fi.xpi) = b47825a63b5b4de5015b1223b7782e2d4073861425a0a2345a3534cbfcc4cd85 +SIZE (xpi/firefox-i18n-67.0b5/fi.xpi) = 461417 +SHA256 (xpi/firefox-i18n-67.0b5/fr.xpi) = 30d0f23a4802375ef66f5116231b3fef8af5d7d91aa4f160832c66a98d0f7808 +SIZE (xpi/firefox-i18n-67.0b5/fr.xpi) = 489805 +SHA256 (xpi/firefox-i18n-67.0b5/fy-NL.xpi) = b2c078d6f55b17f3eda9a22b7f68ff37fca384dc42c2a8f30a0b4d60915d5abc +SIZE (xpi/firefox-i18n-67.0b5/fy-NL.xpi) = 477868 +SHA256 (xpi/firefox-i18n-67.0b5/ga-IE.xpi) = 9e15ba68376ebb6303c4d53a5dfb95f1add498b2dbcb5a6e75379d7133d9b8fc +SIZE (xpi/firefox-i18n-67.0b5/ga-IE.xpi) = 472896 +SHA256 (xpi/firefox-i18n-67.0b5/gd.xpi) = be71e055c0d1b69be66d6e4d2a552a92215be2a268ebda46841a7be9dcaefa0c +SIZE (xpi/firefox-i18n-67.0b5/gd.xpi) = 482156 +SHA256 (xpi/firefox-i18n-67.0b5/gl.xpi) = b28f86ccf422faa97660f16313fda7492524198da255577258fec0de2cdd7459 +SIZE (xpi/firefox-i18n-67.0b5/gl.xpi) = 467936 +SHA256 (xpi/firefox-i18n-67.0b5/gn.xpi) = 39f76e211096ed7e2c34edf878a22138f680344021d53e0a89cf57e43a5afb1a +SIZE (xpi/firefox-i18n-67.0b5/gn.xpi) = 492271 +SHA256 (xpi/firefox-i18n-67.0b5/gu-IN.xpi) = 730c0c882024752cfe312c922dce80eaed6e84e5bf110f7d55a01fc12b11330c +SIZE (xpi/firefox-i18n-67.0b5/gu-IN.xpi) = 546947 +SHA256 (xpi/firefox-i18n-67.0b5/he.xpi) = b84b3f3a2a4bb718c65fe3c24b5edb514689be777ea4d0147c14c21f6c429907 +SIZE (xpi/firefox-i18n-67.0b5/he.xpi) = 488213 +SHA256 (xpi/firefox-i18n-67.0b5/hi-IN.xpi) = 00d6b83798d3f99548715b212841e58bca3d7d1bb6e0e7d8143e5f8f356f9195 +SIZE (xpi/firefox-i18n-67.0b5/hi-IN.xpi) = 534367 +SHA256 (xpi/firefox-i18n-67.0b5/hr.xpi) = f814122e20e5c45818e23509332b7ae74c6c3d2491bbfcf4e97731937d94e8b6 +SIZE (xpi/firefox-i18n-67.0b5/hr.xpi) = 472506 +SHA256 (xpi/firefox-i18n-67.0b5/hsb.xpi) = 0393bb8bbae83ab4525be97667482f6fbd1962f50fdb8a7102b79b2300d2a3f1 
+SIZE (xpi/firefox-i18n-67.0b5/hsb.xpi) = 496982 +SHA256 (xpi/firefox-i18n-67.0b5/hu.xpi) = 058d260332c46d26612c590092e9b64cee34d60868008c0ba8aa1e83f5a11da5 +SIZE (xpi/firefox-i18n-67.0b5/hu.xpi) = 494343 +SHA256 (xpi/firefox-i18n-67.0b5/hy-AM.xpi) = a596a602e7c139b3ce6d38095002c0550a56a79305010247019bf08d7bd9ae11 +SIZE (xpi/firefox-i18n-67.0b5/hy-AM.xpi) = 515962 +SHA256 (xpi/firefox-i18n-67.0b5/ia.xpi) = 547c084cda04d7381631db513ab3d8c98df994c4d139c77010f8e42933a7fce3 +SIZE (xpi/firefox-i18n-67.0b5/ia.xpi) = 473370 +SHA256 (xpi/firefox-i18n-67.0b5/id.xpi) = e93ab59b40045656916a6acd4c434b0d31249fbf2d4c18024270029c52923f85 +SIZE (xpi/firefox-i18n-67.0b5/id.xpi) = 467041 +SHA256 (xpi/firefox-i18n-67.0b5/is.xpi) = 67b277e6e9eb70d19deb32160d7482a4a56de9c849621a20e41756479310f146 +SIZE (xpi/firefox-i18n-67.0b5/is.xpi) = 472665 +SHA256 (xpi/firefox-i18n-67.0b5/it.xpi) = 81be6a7c21d6a61980b6951d26d76efcddcc104b8a59def6fd34c61c360bf47c +SIZE (xpi/firefox-i18n-67.0b5/it.xpi) = 362508 +SHA256 (xpi/firefox-i18n-67.0b5/ja.xpi) = de8b423c97363d2aacffd1126b3a6a3f7abaddb90e8873ccc80645166b7d7638 +SIZE (xpi/firefox-i18n-67.0b5/ja.xpi) = 501695 +SHA256 (xpi/firefox-i18n-67.0b5/ka.xpi) = bf7ef332ec7e2fcd4162af33d7dba560753dad179464914778b4b478f82fe0ad +SIZE (xpi/firefox-i18n-67.0b5/ka.xpi) = 515495 +SHA256 (xpi/firefox-i18n-67.0b5/kab.xpi) = f427769f67ee97891d041f840e9ee7fe23ae720bdd14acc14ea591ebc158f707 +SIZE (xpi/firefox-i18n-67.0b5/kab.xpi) = 485734 +SHA256 (xpi/firefox-i18n-67.0b5/kk.xpi) = d5aeb877c2527577a997c6209aa61037228326d3e1ac8617152bf77d11acfe93 +SIZE (xpi/firefox-i18n-67.0b5/kk.xpi) = 537686 +SHA256 (xpi/firefox-i18n-67.0b5/km.xpi) = 6c903122e184b22b37122368baa6d191d23b9ea588ca62d25356cee0767bdbbe +SIZE (xpi/firefox-i18n-67.0b5/km.xpi) = 529082 +SHA256 (xpi/firefox-i18n-67.0b5/kn.xpi) = 843c7bacf18c69750d4b23db0c3693429fba5fed67803873894e2e66f47410ee +SIZE (xpi/firefox-i18n-67.0b5/kn.xpi) = 528953 +SHA256 (xpi/firefox-i18n-67.0b5/ko.xpi) = d794e0bff13aa44e0868e94718623de6bfb4352e98836133dc885b2f5a33a1fc +SIZE (xpi/firefox-i18n-67.0b5/ko.xpi) = 494475 +SHA256 (xpi/firefox-i18n-67.0b5/lij.xpi) = 98eaea648c0460f8cc8612dcb914a6cf5815da3014afe7410de27fdb13329957 +SIZE (xpi/firefox-i18n-67.0b5/lij.xpi) = 461547 +SHA256 (xpi/firefox-i18n-67.0b5/lt.xpi) = 90409c49c07c12a3ea58140a1e04ff0817608b20b5602ad1e3df1b1f1afa9097 +SIZE (xpi/firefox-i18n-67.0b5/lt.xpi) = 498029 +SHA256 (xpi/firefox-i18n-67.0b5/lv.xpi) = e9104976c0ca5d2629620fc1dbab2736892e1254885a1ce6d6046317333cae95 +SIZE (xpi/firefox-i18n-67.0b5/lv.xpi) = 483404 +SHA256 (xpi/firefox-i18n-67.0b5/mai.xpi) = 9e55be02e850da7b9b9e9a0133062bdcf1f197912a4bd6ef9d22249f45e45ec6 +SIZE (xpi/firefox-i18n-67.0b5/mai.xpi) = 492610 +SHA256 (xpi/firefox-i18n-67.0b5/mk.xpi) = b32c8834c85134a03f45fbae60ad392a72e7b8e07f015cab619e3519666d4f60 +SIZE (xpi/firefox-i18n-67.0b5/mk.xpi) = 466697 +SHA256 (xpi/firefox-i18n-67.0b5/ml.xpi) = 0e77c5fede5e64a0386eb7e58a0a0608f2f3fdcba428fa86511186220c6321d3 +SIZE (xpi/firefox-i18n-67.0b5/ml.xpi) = 536110 +SHA256 (xpi/firefox-i18n-67.0b5/mr.xpi) = f70fcccc5f46842472b2e20e2a99967c98ac51fd3ede6e20c29addc0b8eee917 +SIZE (xpi/firefox-i18n-67.0b5/mr.xpi) = 532870 +SHA256 (xpi/firefox-i18n-67.0b5/ms.xpi) = 171a1df5c68ea52ac1b1961b97ce5c4b49c68f6f7bfa9d49bdbb1ef1f9f9fd05 +SIZE (xpi/firefox-i18n-67.0b5/ms.xpi) = 464300 +SHA256 (xpi/firefox-i18n-67.0b5/my.xpi) = cd3bec037f85e59296a729132c3d0ec09d35524f5a69c9b33a65f73f3b9d2d75 +SIZE (xpi/firefox-i18n-67.0b5/my.xpi) = 532259 +SHA256 (xpi/firefox-i18n-67.0b5/nb-NO.xpi) = 
3ef99a73db45c8da64cd582fd58b40f6880e8d996261ed472b53b5e069992a02 +SIZE (xpi/firefox-i18n-67.0b5/nb-NO.xpi) = 465112 +SHA256 (xpi/firefox-i18n-67.0b5/ne-NP.xpi) = ab75dd6c6a531cd91b1930e7ba108fe54a3e83b2fa758128ff387d74f8d27d4e +SIZE (xpi/firefox-i18n-67.0b5/ne-NP.xpi) = 505193 +SHA256 (xpi/firefox-i18n-67.0b5/nl.xpi) = 1691c83046520510e0b9fa98cf8b20af1d60323a6939d71c8a2e3fc2e2e959e1 +SIZE (xpi/firefox-i18n-67.0b5/nl.xpi) = 462164 +SHA256 (xpi/firefox-i18n-67.0b5/nn-NO.xpi) = f2a9d37b3e507b9cf5073a50ae28794cfd41360b4b9f97feebba0cd232f2a302 +SIZE (xpi/firefox-i18n-67.0b5/nn-NO.xpi) = 469084 +SHA256 (xpi/firefox-i18n-67.0b5/oc.xpi) = 670f48ab285625c68be00c4ea79fe39c28c51f77599db1535aea25eda389353c +SIZE (xpi/firefox-i18n-67.0b5/oc.xpi) = 482224 +SHA256 (xpi/firefox-i18n-67.0b5/or.xpi) = 43b09d25b34d49ecabb066ab2dcac447dd38cf6d7deb99712c3bd53b447a6127 +SIZE (xpi/firefox-i18n-67.0b5/or.xpi) = 481074 +SHA256 (xpi/firefox-i18n-67.0b5/pa-IN.xpi) = ba7b96f646ec90fdf50a3e6991ec53894dc9b7895b52bbedfc12752b4799cee3 +SIZE (xpi/firefox-i18n-67.0b5/pa-IN.xpi) = 514730 +SHA256 (xpi/firefox-i18n-67.0b5/pl.xpi) = 9124ee16a011e8ee97586a7f31cdc557ac48388e2953fb54b2d77b8018b4da6a +SIZE (xpi/firefox-i18n-67.0b5/pl.xpi) = 451899 +SHA256 (xpi/firefox-i18n-67.0b5/pt-BR.xpi) = 0b834bb30a6cd02debcebd2f6b2e7f8cc1fc96c5242e99280710163eee772626 +SIZE (xpi/firefox-i18n-67.0b5/pt-BR.xpi) = 471155 +SHA256 (xpi/firefox-i18n-67.0b5/pt-PT.xpi) = 8535b304683e1adbb68a83ab11db0021d81068677ec1ce52ee07172a81e858b6 +SIZE (xpi/firefox-i18n-67.0b5/pt-PT.xpi) = 481612 +SHA256 (xpi/firefox-i18n-67.0b5/rm.xpi) = c25ed888c17c82d86cf1492026453123aca4a66750cd199747763ee7cd51f7c6 +SIZE (xpi/firefox-i18n-67.0b5/rm.xpi) = 469964 +SHA256 (xpi/firefox-i18n-67.0b5/ro.xpi) = 881de6c259777555af442647d9217bc1090695dd38cb9bf643231917998e20aa +SIZE (xpi/firefox-i18n-67.0b5/ro.xpi) = 475919 +SHA256 (xpi/firefox-i18n-67.0b5/ru.xpi) = 8c4afe96206c3b79e037708097e123fc094c3e23ba2ce1abbca579e74776e2c2 +SIZE (xpi/firefox-i18n-67.0b5/ru.xpi) = 549755 +SHA256 (xpi/firefox-i18n-67.0b5/si.xpi) = c76dbe1b94522abc9fddc2c1ca4825dacdf7c1f68699effd2eada312605e3943 +SIZE (xpi/firefox-i18n-67.0b5/si.xpi) = 509255 +SHA256 (xpi/firefox-i18n-67.0b5/sk.xpi) = e9eadd2d02aa211f79b179001f8237dab1b1dd6d6ccabc30e30aa76596e561f9 +SIZE (xpi/firefox-i18n-67.0b5/sk.xpi) = 497146 +SHA256 (xpi/firefox-i18n-67.0b5/sl.xpi) = a907b8c08a8e295c233758eae9d96354d92ce4dcbfd425901c5807061eee529c +SIZE (xpi/firefox-i18n-67.0b5/sl.xpi) = 470810 +SHA256 (xpi/firefox-i18n-67.0b5/son.xpi) = a4cddb3014cf9e4bf946a271b8b5401f5fd382aed2f94743f3b21259d606e831 +SIZE (xpi/firefox-i18n-67.0b5/son.xpi) = 448242 +SHA256 (xpi/firefox-i18n-67.0b5/sq.xpi) = 46043204f6249e8428a3a8d2dad76abe3645683625ae67de13eb19438879201b +SIZE (xpi/firefox-i18n-67.0b5/sq.xpi) = 486534 +SHA256 (xpi/firefox-i18n-67.0b5/sr.xpi) = 67146639df8dbffe244c61a922f3422fa7de7c4fa20e54345c9bcc4fd71ffbff +SIZE (xpi/firefox-i18n-67.0b5/sr.xpi) = 506043 +SHA256 (xpi/firefox-i18n-67.0b5/sv-SE.xpi) = efed20a6792e0606fddc44e0c5aa7e267d86dc55b6aabdb2b055e6cff69dbaa0 +SIZE (xpi/firefox-i18n-67.0b5/sv-SE.xpi) = 479143 +SHA256 (xpi/firefox-i18n-67.0b5/ta.xpi) = d59f885af965716ba2a0d0a447d5b47e4d98f779789257a0d26436767ef64365 +SIZE (xpi/firefox-i18n-67.0b5/ta.xpi) = 528458 +SHA256 (xpi/firefox-i18n-67.0b5/te.xpi) = f8e87277420c2737349055e55adb6f5c1549ad18628b91e4c5fbbaac022bd8d5 +SIZE (xpi/firefox-i18n-67.0b5/te.xpi) = 537680 +SHA256 (xpi/firefox-i18n-67.0b5/th.xpi) = 9c9461577b8dee90790052c988d8fef2377f110603856c9add0904576da45d22 +SIZE 
(xpi/firefox-i18n-67.0b5/th.xpi) = 515107
+SHA256 (xpi/firefox-i18n-67.0b5/tr.xpi) = c01f67a29a13dac6387744802bd2c64cd5e5a91b3769cbd41d55c22093aa7f54
+SIZE (xpi/firefox-i18n-67.0b5/tr.xpi) = 486102
+SHA256 (xpi/firefox-i18n-67.0b5/uk.xpi) = bbe45601f1a8dc1d4c189e56b027f5cf2b727680e93c9d5c35d3b8952da0ca71
+SIZE (xpi/firefox-i18n-67.0b5/uk.xpi) = 524575
+SHA256 (xpi/firefox-i18n-67.0b5/ur.xpi) = a5a7de1cb0eab9cd615e2a105ac2c4c02078fe880678baa01db7ca71f4fb20ec
+SIZE (xpi/firefox-i18n-67.0b5/ur.xpi) = 514745
+SHA256 (xpi/firefox-i18n-67.0b5/uz.xpi) = 5ee1b00a1c5081ffb58bb492a8e9230f608660d63eaa92f81d1393739a87fe53
+SIZE (xpi/firefox-i18n-67.0b5/uz.xpi) = 468299
+SHA256 (xpi/firefox-i18n-67.0b5/vi.xpi) = 1c11439c15c2819e11e7c4ed143ca232e71480ebe6db0d9ab7305d79716593db
+SIZE (xpi/firefox-i18n-67.0b5/vi.xpi) = 495505
+SHA256 (xpi/firefox-i18n-67.0b5/xh.xpi) = 38121c16fc0b8a521bad9c8cd729f47584385d3a7d8831de7ecb92994a119522
+SIZE (xpi/firefox-i18n-67.0b5/xh.xpi) = 459538
+SHA256 (xpi/firefox-i18n-67.0b5/zh-CN.xpi) = cfc6b63af5fa4c434c0a4f0ef4bd51aa893d7952cbfb875554414d24cc33da51
+SIZE (xpi/firefox-i18n-67.0b5/zh-CN.xpi) = 499629
+SHA256 (xpi/firefox-i18n-67.0b5/zh-TW.xpi) = 0d79140cff89f7133cc29c7a454213748028a63bfa3bc07d293b89e92520db65
+SIZE (xpi/firefox-i18n-67.0b5/zh-TW.xpi) = 499902

Index: www/firefox/Makefile
===================================================================
--- www/firefox/Makefile
+++ www/firefox/Makefile
@@ -2,18 +2,19 @@
 # $FreeBSD$

 PORTNAME=	firefox
-DISTVERSION=	66.0.1
+DISTVERSION=	67.0b5
 PORTEPOCH=	1
 CATEGORIES=	www ipv6
 MASTER_SITES=	MOZILLA/${PORTNAME}/releases/${DISTVERSION}/source \
 		MOZILLA/${PORTNAME}/candidates/${DISTVERSION}-candidates/build1/source
-DISTFILES=	${DISTNAME}.source${EXTRACT_SUFX}
+DISTNAME=	${PORTNAME}-${PORTVERSION:R}
+DISTFILES=	${DISTNAME}${PORTVERSION:E}.source${EXTRACT_SUFX}

 MAINTAINER=	gecko@FreeBSD.org
 COMMENT=	Web browser based on the browser portion of Mozilla

 BUILD_DEPENDS=	nspr>=4.19:devel/nspr \
-		nss>=3.42:security/nss \
+		nss>=3.43:security/nss \
 		icu>=59.1,1:devel/icu \
 		libevent>=2.1.8:devel/libevent \
 		harfbuzz>=2.3.1:print/harfbuzz \
@@ -21,11 +22,11 @@
 		png>=1.6.35:graphics/png \
 		libvorbis>=1.3.6,3:audio/libvorbis \
 		libvpx>=1.5.0:multimedia/libvpx \
-		sqlite3>=3.26:databases/sqlite3 \
+		sqlite3>=3.27.2:databases/sqlite3 \
 		${PYTHON_PKGNAMEPREFIX}sqlite3>0:databases/py-sqlite3@${PY_FLAVOR} \
 		v4l_compat>0:multimedia/v4l_compat \
 		autoconf-2.13:devel/autoconf213 \
-		yasm:devel/yasm \
+		nasm:devel/nasm \
 		zip:archivers/zip

 USE_GECKO=	gecko

Index: www/firefox/distinfo
===================================================================
--- www/firefox/distinfo
+++ www/firefox/distinfo
@@ -1,3 +1,3 @@
-TIMESTAMP = 1553231190
-SHA256 (firefox-66.0.1.source.tar.xz) = 5dd072db4e96f8bbedc62cfab0de3c710d95f7c65fc676f90e1e86bc4b46fab2
-SIZE (firefox-66.0.1.source.tar.xz) = 281257896
+TIMESTAMP = 1553532864
+SHA256 (firefox-67.0b5.source.tar.xz) = 90454b44f6a45c6bfacdcd99fcbf395c04333c5198acc2bb23c361fba75b3d2e
+SIZE (firefox-67.0b5.source.tar.xz) = 285509124

Index: www/firefox/files/patch-bug1514156
===================================================================
--- www/firefox/files/patch-bug1514156
+++ /dev/null
@@ -1,590 +0,0 @@
-commit 94f519f31849
-Author: sotaro
-Date: Tue Feb 12 16:32:51 2019 +0900
-
-    Bug 1514156 - Add GLContextEGL::CreateEGLSurfaceForCompositorWidget() for Wayland r=jgilbert
-
-    When GDK_BACKEND is wayland, widget is not fully mapped during creating CompositorSession.
During CompositorSession creation, GLContextProviderEGL::CreateForCompositorWidget() creates GLContextEGL, but we could not create valid EGLSurface. We could create valid EGLSurface when widget is fully mapped. CreateEGLSurfaceForCompositorWidget() is used for creating valid EGLSurface after widget is fully mapped. - - Differential Revision: https://phabricator.services.mozilla.com/D18654 ---- - gfx/gl/GLContextEGL.h | 4 ++++ - gfx/gl/GLContextProviderEGL.cpp | 27 +++++++++++++++++++++++++++ - 2 files changed, 31 insertions(+) - -diff --git gfx/gl/GLContextEGL.h gfx/gl/GLContextEGL.h -index 95d5e0c02e23..adb37e59a9f7 100644 ---- gfx/gl/GLContextEGL.h -+++ gfx/gl/GLContextEGL.h -@@ -91,6 +91,10 @@ class GLContextEGL : public GLContext { - CreateContextFlags flags, const gfx::IntSize& size, - const SurfaceCaps& minCaps, nsACString* const out_FailureId); - -+#if defined(MOZ_WAYLAND) -+ static EGLSurface CreateEGLSurfaceForCompositorWidget( -+ widget::CompositorWidget* aCompositorWidget, bool aForceAccelerated); -+#endif - protected: - friend class GLContextProviderEGL; - friend class GLContextEGLFactory; -diff --git gfx/gl/GLContextProviderEGL.cpp gfx/gl/GLContextProviderEGL.cpp -index 774eb34a8e87..25aa779a7d64 100644 ---- gfx/gl/GLContextProviderEGL.cpp -+++ gfx/gl/GLContextProviderEGL.cpp -@@ -295,6 +295,33 @@ already_AddRefed GLContextEGLFactory::Create( - return gl.forget(); - } - -+#if defined(MOZ_WAYLAND) -+/* static */ EGLSurface GLContextEGL::CreateEGLSurfaceForCompositorWidget( -+ widget::CompositorWidget* aCompositorWidget, bool aForceAccelerated) { -+ nsCString discardFailureId; -+ if (!GLLibraryEGL::EnsureInitialized(false, &discardFailureId)) { -+ gfxCriticalNote << "Failed to load EGL library 6!"; -+ return EGL_NO_SURFACE; -+ } -+ -+ MOZ_ASSERT(aCompositorWidget); -+ EGLNativeWindowType window = GET_NATIVE_WINDOW_FROM_COMPOSITOR_WIDGET(aCompositorWidget); -+ if (!window) { -+ gfxCriticalNote << "window is null"; -+ return EGL_NO_SURFACE; -+ } -+ const bool useWebRender = aCompositorWidget->GetCompositorOptions().UseWebRender(); -+ -+ EGLConfig config; -+ if (!CreateConfig(&config, useWebRender)) { -+ gfxCriticalNote << "Failed to create EGLConfig!"; -+ return EGL_NO_SURFACE; -+ } -+ -+ return mozilla::gl::CreateSurfaceFromNativeWindow(window, config); -+} -+#endif -+ - GLContextEGL::GLContextEGL(CreateContextFlags flags, const SurfaceCaps& caps, - bool isOffscreen, EGLConfig config, - EGLSurface surface, EGLContext context) - -commit 0e2cb6d4e88d -Author: sotaro -Date: Tue Feb 12 16:33:31 2019 +0900 - - Bug 1514156 - Add RenderCompositorEGL for wayland r=nical - - When GDK_BACKEND is wayland, widget is not fully mapped during creating CompositorSession. Needs to create valid EGLSurface after widget is fully mapped. 
- - Differential Revision: https://phabricator.services.mozilla.com/D18940 ---- - gfx/webrender_bindings/RenderCompositor.cpp | 12 +++ - gfx/webrender_bindings/RenderCompositorEGL.cpp | 132 +++++++++++++++++++++++++ - gfx/webrender_bindings/RenderCompositorEGL.h | 54 ++++++++++ - gfx/webrender_bindings/moz.build | 10 ++ - widget/gtk/CompositorWidgetChild.cpp | 6 ++ - widget/gtk/CompositorWidgetChild.h | 4 +- - widget/gtk/CompositorWidgetParent.cpp | 8 ++ - widget/gtk/CompositorWidgetParent.h | 2 + - widget/gtk/GtkCompositorWidget.cpp | 12 +++ - widget/gtk/GtkCompositorWidget.h | 11 +++ - widget/gtk/PCompositorWidget.ipdl | 1 + - widget/gtk/mozcontainer.cpp | 11 +++ - widget/gtk/mozcontainer.h | 2 + - widget/gtk/nsWindow.cpp | 16 +++ - widget/gtk/nsWindow.h | 1 + - 15 files changed, 281 insertions(+), 1 deletion(-) - -diff --git gfx/webrender_bindings/RenderCompositor.cpp gfx/webrender_bindings/RenderCompositor.cpp -index 051482fbabbf..a58268096a89 100644 ---- gfx/webrender_bindings/RenderCompositor.cpp -+++ gfx/webrender_bindings/RenderCompositor.cpp -@@ -16,6 +16,10 @@ - # include "mozilla/webrender/RenderCompositorANGLE.h" - #endif - -+#ifdef MOZ_WAYLAND -+#include "mozilla/webrender/RenderCompositorEGL.h" -+#endif -+ - namespace mozilla { - namespace wr { - -@@ -26,6 +30,14 @@ namespace wr { - return RenderCompositorANGLE::Create(std::move(aWidget)); - } - #endif -+ -+#ifdef MOZ_WAYLAND -+ UniquePtr eglCompositor = RenderCompositorEGL::Create(aWidget); -+ if (eglCompositor) { -+ return eglCompositor; -+ } -+#endif -+ - return RenderCompositorOGL::Create(std::move(aWidget)); - } - -diff --git gfx/webrender_bindings/RenderCompositorEGL.cpp gfx/webrender_bindings/RenderCompositorEGL.cpp -new file mode 100644 -index 000000000000..16245f59afbd ---- /dev/null -+++ gfx/webrender_bindings/RenderCompositorEGL.cpp -@@ -0,0 +1,132 @@ -+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -+/* vim: set ts=8 sts=2 et sw=2 tw=80: */ -+/* This Source Code Form is subject to the terms of the Mozilla Public -+ * License, v. 2.0. If a copy of the MPL was not distributed with this -+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ -+ -+#include "RenderCompositorEGL.h" -+ -+#include "GLContext.h" -+#include "GLContextEGL.h" -+#include "GLContextProvider.h" -+#include "GLLibraryEGL.h" -+#include "mozilla/widget/CompositorWidget.h" -+#include "mozilla/widget/GtkCompositorWidget.h" -+ -+#include -+#include -+ -+namespace mozilla { -+namespace wr { -+ -+/* static */ UniquePtr RenderCompositorEGL::Create( -+ RefPtr aWidget) { -+ -+ if (GDK_IS_X11_DISPLAY(gdk_display_get_default())) { -+ return nullptr; -+ } -+ -+ RefPtr gl; -+ gl = CreateGLContext(aWidget); -+ if (!gl) { -+ return nullptr; -+ } -+ return MakeUnique(gl, aWidget); -+} -+ -+/* static */ already_AddRefed -+RenderCompositorEGL::CreateGLContext(RefPtr aWidget) { -+ nsCString discardFailureId; -+ -+ // Create GLContext with dummy EGLSurface. -+ RefPtr gl = -+ //XXX headless context did not work. 
-+ gl::GLContextProviderEGL::CreateForCompositorWidget(aWidget, true); -+ if (!gl) { -+ gfxCriticalNote << "Failed GL context creation for WebRender: " -+ << gfx::hexa(gl.get()); -+ return nullptr; -+ } -+ -+ if (!gl->MakeCurrent()) { -+ gfxCriticalNote << "Failed GL context creation for WebRender: " -+ << gfx::hexa(gl.get()); -+ return nullptr; -+ } -+ -+ return gl.forget(); -+} -+ -+/* static */ EGLSurface RenderCompositorEGL::CreateEGLSurface( -+ widget::CompositorWidget* aWidget) { -+ EGLSurface surface = EGL_NO_SURFACE; -+ surface = gl::GLContextEGL::CreateEGLSurfaceForCompositorWidget( -+ aWidget, /* aForceAccelerated */ true); -+ if (surface == EGL_NO_SURFACE) { -+ gfxCriticalNote << "Failed to create EGLSurface"; -+ } -+ return surface; -+} -+ -+RenderCompositorEGL::RenderCompositorEGL( -+ RefPtr aGL, RefPtr aWidget) -+ : RenderCompositor(std::move(aWidget)), mGL(aGL), mEGLSurface(EGL_NO_SURFACE) { -+ MOZ_ASSERT(mGL); -+} -+ -+RenderCompositorEGL::~RenderCompositorEGL() { -+ DestroyEGLSurface(); -+} -+ -+bool RenderCompositorEGL::BeginFrame() { -+ -+ if (mWidget->AsX11() && mWidget->AsX11()->WaylandRequestsUpdatingEGLSurface()) { -+ mEGLSurface = CreateEGLSurface(mWidget); -+ gl::GLContextEGL::Cast(gl())->SetEGLSurfaceOverride(mEGLSurface); -+ } -+ -+ if (!mGL->MakeCurrent()) { -+ gfxCriticalNote << "Failed to make render context current, can't draw."; -+ return false; -+ } -+ -+ return true; -+} -+ -+void RenderCompositorEGL::EndFrame() -+{ -+ if (mEGLSurface != EGL_NO_SURFACE) { -+ mGL->SwapBuffers(); -+ } -+} -+ -+void RenderCompositorEGL::WaitForGPU() {} -+ -+void RenderCompositorEGL::Pause() {} -+ -+bool RenderCompositorEGL::Resume() { -+ return true; -+} -+ -+bool RenderCompositorEGL::MakeCurrent() { -+ gl::GLContextEGL::Cast(gl())->SetEGLSurfaceOverride(mEGLSurface); -+ return gl()->MakeCurrent(); -+} -+ -+void RenderCompositorEGL::DestroyEGLSurface() { -+ auto* egl = gl::GLLibraryEGL::Get(); -+ -+ // Release EGLSurface of back buffer before calling ResizeBuffers(). -+ if (mEGLSurface) { -+ gl::GLContextEGL::Cast(gl())->SetEGLSurfaceOverride(EGL_NO_SURFACE); -+ egl->fDestroySurface(egl->Display(), mEGLSurface); -+ mEGLSurface = nullptr; -+ } -+} -+ -+LayoutDeviceIntSize RenderCompositorEGL::GetBufferSize() { -+ return mWidget->GetClientSize(); -+} -+ -+} // namespace wr -+} // namespace mozilla -diff --git gfx/webrender_bindings/RenderCompositorEGL.h gfx/webrender_bindings/RenderCompositorEGL.h -new file mode 100644 -index 000000000000..f12e16d974af ---- /dev/null -+++ gfx/webrender_bindings/RenderCompositorEGL.h -@@ -0,0 +1,54 @@ -+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ -+/* vim: set ts=8 sts=2 et sw=2 tw=80: */ -+/* This Source Code Form is subject to the terms of the Mozilla Public -+ * License, v. 2.0. If a copy of the MPL was not distributed with this -+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -+ -+#ifndef MOZILLA_GFX_RENDERCOMPOSITOR_EGL_H -+#define MOZILLA_GFX_RENDERCOMPOSITOR_EGL_H -+ -+#include "GLTypes.h" -+#include "mozilla/webrender/RenderCompositor.h" -+ -+namespace mozilla { -+ -+namespace wr { -+ -+class RenderCompositorEGL : public RenderCompositor { -+ public: -+ static UniquePtr Create( -+ RefPtr aWidget); -+ -+ RenderCompositorEGL(RefPtr aGL, -+ RefPtr aWidget); -+ virtual ~RenderCompositorEGL(); -+ -+ bool BeginFrame() override; -+ void EndFrame() override; -+ void WaitForGPU() override; -+ void Pause() override; -+ bool Resume() override; -+ -+ gl::GLContext* gl() const override { return mGL; } -+ -+ bool MakeCurrent() override; -+ -+ bool UseANGLE() const override { return false; } -+ -+ LayoutDeviceIntSize GetBufferSize() override; -+ -+ protected: -+ static already_AddRefed CreateGLContext( -+ RefPtr aWidget); -+ static EGLSurface CreateEGLSurface(widget::CompositorWidget* aWidget); -+ -+ void DestroyEGLSurface(); -+ -+ const RefPtr mGL; -+ EGLSurface mEGLSurface; -+}; -+ -+} // namespace wr -+} // namespace mozilla -+ -+#endif // MOZILLA_GFX_RENDERCOMPOSITOR_EGL_H -diff --git gfx/webrender_bindings/moz.build gfx/webrender_bindings/moz.build -index 4acdfbb817d3..f632bc5d24d9 100644 ---- gfx/webrender_bindings/moz.build -+++ gfx/webrender_bindings/moz.build -@@ -67,6 +67,14 @@ if CONFIG['MOZ_ENABLE_D3D10_LAYER']: - 'RenderCompositorANGLE.cpp', - ] - -+if CONFIG['MOZ_WAYLAND']: -+ EXPORTS.mozilla.webrender += [ -+ 'RenderCompositorEGL.h', -+ ] -+ SOURCES += [ -+ 'RenderCompositorEGL.cpp', -+ ] -+ - if CONFIG['MOZ_WIDGET_TOOLKIT'] in ('android', 'gtk3'): - CXXFLAGS += CONFIG['MOZ_CAIRO_CFLAGS'] - CXXFLAGS += CONFIG['CAIRO_FT_CFLAGS'] -@@ -75,5 +83,7 @@ include('/ipc/chromium/chromium-config.mozbuild') - - FINAL_LIBRARY = 'xul' - -+CXXFLAGS += CONFIG['TK_CFLAGS'] -+ - if CONFIG['CC_TYPE'] == 'clang-cl': - AllowCompilerWarnings() # workaround for bug 1090497 -diff --git widget/gtk/CompositorWidgetChild.cpp widget/gtk/CompositorWidgetChild.cpp -index b746fec0a283..07847a298707 100644 ---- widget/gtk/CompositorWidgetChild.cpp -+++ widget/gtk/CompositorWidgetChild.cpp -@@ -35,5 +35,11 @@ void CompositorWidgetChild::NotifyClientSizeChanged( - Unused << SendNotifyClientSizeChanged(aClientSize); - } - -+#ifdef MOZ_WAYLAND -+void CompositorWidgetChild::RequestsUpdatingEGLSurface() { -+ Unused << SendRequestsUpdatingEGLSurface(); -+} -+#endif -+ - } // namespace widget - } // namespace mozilla -diff --git widget/gtk/CompositorWidgetChild.h widget/gtk/CompositorWidgetChild.h -index fe3285eb6f22..0167dbb051c6 100644 ---- widget/gtk/CompositorWidgetChild.h -+++ widget/gtk/CompositorWidgetChild.h -@@ -24,7 +24,9 @@ class CompositorWidgetChild final : public PCompositorWidgetChild, - mozilla::ipc::IPCResult RecvUnobserveVsync() override; - - void NotifyClientSizeChanged(const LayoutDeviceIntSize& aClientSize) override; -- -+#ifdef MOZ_WAYLAND -+ void RequestsUpdatingEGLSurface() override; -+#endif - private: - RefPtr mVsyncDispatcher; - RefPtr mVsyncObserver; -diff --git widget/gtk/CompositorWidgetParent.cpp widget/gtk/CompositorWidgetParent.cpp -index ae49ec9174bc..b4031883d3a8 100644 ---- widget/gtk/CompositorWidgetParent.cpp -+++ widget/gtk/CompositorWidgetParent.cpp -@@ -40,5 +40,13 @@ mozilla::ipc::IPCResult CompositorWidgetParent::RecvNotifyClientSizeChanged( - return IPC_OK(); - } - -+mozilla::ipc::IPCResult CompositorWidgetParent::RecvRequestsUpdatingEGLSurface() -+{ -+#ifdef MOZ_WAYLAND -+ RequestsUpdatingEGLSurface(); -+#endif -+ return IPC_OK(); -+} -+ - } // 
namespace widget - } // namespace mozilla -diff --git widget/gtk/CompositorWidgetParent.h widget/gtk/CompositorWidgetParent.h -index 5d0ccfcf50f2..8ddd58a8964f 100644 ---- widget/gtk/CompositorWidgetParent.h -+++ widget/gtk/CompositorWidgetParent.h -@@ -27,6 +27,8 @@ class CompositorWidgetParent final : public PCompositorWidgetParent, - mozilla::ipc::IPCResult RecvNotifyClientSizeChanged( - const LayoutDeviceIntSize& aClientSize) override; - -+ mozilla::ipc::IPCResult RecvRequestsUpdatingEGLSurface() override; -+ - private: - RefPtr mVsyncObserver; - }; -diff --git widget/gtk/GtkCompositorWidget.cpp widget/gtk/GtkCompositorWidget.cpp -index bc21d6c4d05e..f787e8c23797 100644 ---- widget/gtk/GtkCompositorWidget.cpp -+++ widget/gtk/GtkCompositorWidget.cpp -@@ -85,6 +85,18 @@ void GtkCompositorWidget::NotifyClientSizeChanged( - mClientSize = aClientSize; - } - -+#ifdef MOZ_WAYLAND -+void GtkCompositorWidget::RequestsUpdatingEGLSurface() { -+ mWaylandRequestsUpdatingEGLSurface = true; -+} -+ -+bool GtkCompositorWidget::WaylandRequestsUpdatingEGLSurface() { -+ bool ret = mWaylandRequestsUpdatingEGLSurface; -+ mWaylandRequestsUpdatingEGLSurface = false; -+ return ret; -+} -+#endif -+ - LayoutDeviceIntSize GtkCompositorWidget::GetClientSize() { return mClientSize; } - - uintptr_t GtkCompositorWidget::GetWidgetKey() { -diff --git widget/gtk/GtkCompositorWidget.h widget/gtk/GtkCompositorWidget.h -index fd0c71426c18..75e156dffb02 100644 ---- widget/gtk/GtkCompositorWidget.h -+++ widget/gtk/GtkCompositorWidget.h -@@ -20,6 +20,10 @@ class PlatformCompositorWidgetDelegate : public CompositorWidgetDelegate { - virtual void NotifyClientSizeChanged( - const LayoutDeviceIntSize& aClientSize) = 0; - -+#ifdef MOZ_WAYLAND -+ virtual void RequestsUpdatingEGLSurface() = 0; -+#endif -+ - // CompositorWidgetDelegate Overrides - - PlatformCompositorWidgetDelegate* AsPlatformSpecificDelegate() override { -@@ -62,11 +66,18 @@ class GtkCompositorWidget : public CompositorWidget, - - void NotifyClientSizeChanged(const LayoutDeviceIntSize& aClientSize) override; - -+#ifdef MOZ_WAYLAND -+ void RequestsUpdatingEGLSurface() override; -+ bool WaylandRequestsUpdatingEGLSurface(); -+#endif - protected: - nsWindow* mWidget; - - private: - LayoutDeviceIntSize mClientSize; -+#ifdef MOZ_WAYLAND -+ bool mWaylandRequestsUpdatingEGLSurface = false; -+#endif - - Display* mXDisplay; - Window mXWindow; -diff --git widget/gtk/PCompositorWidget.ipdl widget/gtk/PCompositorWidget.ipdl -index 178fe78e4dc2..51390e400649 100644 ---- widget/gtk/PCompositorWidget.ipdl -+++ widget/gtk/PCompositorWidget.ipdl -@@ -19,6 +19,7 @@ parent: - async __delete__(); - - async NotifyClientSizeChanged(LayoutDeviceIntSize aClientSize); -+ async RequestsUpdatingEGLSurface(); - - child: - -diff --git widget/gtk/mozcontainer.cpp widget/gtk/mozcontainer.cpp -index 8be1f133d39f..8461e7b9d470 100644 ---- widget/gtk/mozcontainer.cpp -+++ widget/gtk/mozcontainer.cpp -@@ -159,6 +159,7 @@ void moz_container_init(MozContainer *container) { - // We can draw to x11 window any time. 
- container->ready_to_draw = GDK_IS_X11_DISPLAY(gdk_display_get_default()); - container->surface_needs_clear = true; -+ container->egl_surface_needs_update = false; - #endif - } - -@@ -176,6 +177,9 @@ static void frame_callback_handler(void *data, struct wl_callback *callback, - uint32_t time) { - MozContainer *container = MOZ_CONTAINER(data); - g_clear_pointer(&container->frame_callback_handler, wl_callback_destroy); -+ if (!container->ready_to_draw) { -+ container->egl_surface_needs_update = true; -+ } - container->ready_to_draw = true; - } - -@@ -208,6 +212,7 @@ static void moz_container_unmap_wayland(MozContainer *container) { - g_clear_pointer(&container->frame_callback_handler, wl_callback_destroy); - - container->surface_needs_clear = true; -+ container->egl_surface_needs_update = false; - container->ready_to_draw = false; - } - -@@ -555,6 +560,12 @@ gboolean moz_container_surface_needs_clear(MozContainer *container) { - container->surface_needs_clear = false; - return state; - } -+ -+gboolean moz_container_egl_surface_needs_update(MozContainer *container){ -+ gboolean state = container->egl_surface_needs_update; -+ container->egl_surface_needs_update = false; -+ return state; -+} - #endif - - void moz_container_force_default_visual(MozContainer *container) { -diff --git widget/gtk/mozcontainer.h widget/gtk/mozcontainer.h -index e9c218c1bc3e..1ed6f439805d 100644 ---- widget/gtk/mozcontainer.h -+++ widget/gtk/mozcontainer.h -@@ -77,6 +77,7 @@ struct _MozContainer { - struct wl_egl_window *eglwindow; - struct wl_callback *frame_callback_handler; - gboolean surface_needs_clear; -+ gboolean egl_surface_needs_update; - gboolean ready_to_draw; - #endif - gboolean force_default_visual; -@@ -100,6 +101,7 @@ gboolean moz_container_has_wl_egl_window(MozContainer *container); - gboolean moz_container_surface_needs_clear(MozContainer *container); - void moz_container_scale_changed(MozContainer *container, - GtkAllocation *aAllocation); -+gboolean moz_container_egl_surface_needs_update(MozContainer *container); - #endif - - #endif /* __MOZ_CONTAINER_H__ */ -diff --git widget/gtk/nsWindow.cpp widget/gtk/nsWindow.cpp -index 50e6354ea374..ceabbf583a42 100644 ---- widget/gtk/nsWindow.cpp -+++ widget/gtk/nsWindow.cpp -@@ -1886,6 +1886,11 @@ gboolean nsWindow::OnExposeEvent(cairo_t *cr) { - region.ScaleRoundOut(scale, scale); - - if (GetLayerManager()->AsKnowsCompositor() && mCompositorSession) { -+#ifdef MOZ_WAYLAND -+ if(mCompositorWidgetDelegate && WaylandRequestsUpdatingEGLSurface()) { -+ mCompositorWidgetDelegate->RequestsUpdatingEGLSurface(); -+ } -+#endif - // We need to paint to the screen even if nothing changed, since if we - // don't have a compositing window manager, our pixels could be stale. 
- GetLayerManager()->SetNeedsComposite(true); -@@ -6599,6 +6604,17 @@ bool nsWindow::WaylandSurfaceNeedsClear() { - "nsWindow::WaylandSurfaceNeedsClear(): We don't have any mContainer!"); - return false; - } -+ -+bool nsWindow::WaylandRequestsUpdatingEGLSurface() { -+ if (mContainer) { -+ return moz_container_egl_surface_needs_update(MOZ_CONTAINER(mContainer)); -+ } -+ -+ NS_WARNING( -+ "nsWindow::WaylandSurfaceNeedsClear(): We don't have any mContainer!"); -+ return false; -+} -+ - #endif - - #ifdef MOZ_X11 -diff --git widget/gtk/nsWindow.h widget/gtk/nsWindow.h -index b528ebfdeccb..ea0be70d7eb2 100644 ---- widget/gtk/nsWindow.h -+++ widget/gtk/nsWindow.h -@@ -373,6 +373,7 @@ class nsWindow final : public nsBaseWidget { - wl_display* GetWaylandDisplay(); - wl_surface* GetWaylandSurface(); - bool WaylandSurfaceNeedsClear(); -+ bool WaylandRequestsUpdatingEGLSurface(); - #endif - virtual void GetCompositorWidgetInitData( - mozilla::widget::CompositorWidgetInitData* aInitData) override; Index: www/firefox/files/patch-bug1527556 =================================================================== --- www/firefox/files/patch-bug1527556 +++ /dev/null @@ -1,31 +0,0 @@ -commit ba954951557e -Author: sotaro -Date: Mon Feb 18 09:15:30 2019 +0000 - - Bug 1527556 - Change MAX_DISPLAY_CONNECTIONS to 3 r=stransky - - nsWaylandDisplay needs to be allocated for each calling thread(main thread, compositor thread and render thread) - - Differential Revision: https://phabricator.services.mozilla.com/D20118 - - --HG-- - extra : moz-landing-system : lando ---- - widget/gtk/nsWaylandDisplay.cpp | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git widget/gtk/nsWaylandDisplay.cpp widget/gtk/nsWaylandDisplay.cpp -index 4c2804be2831..ac01e1f50347 100644 ---- widget/gtk/nsWaylandDisplay.cpp -+++ widget/gtk/nsWaylandDisplay.cpp -@@ -14,7 +14,9 @@ - namespace mozilla { - namespace widget { - --#define MAX_DISPLAY_CONNECTIONS 2 -+// nsWaylandDisplay needs to be created for each calling thread(main thread, -+// compositor thread and render thread) -+#define MAX_DISPLAY_CONNECTIONS 3 - - static nsWaylandDisplay *gWaylandDisplays[MAX_DISPLAY_CONNECTIONS]; - static StaticMutex gWaylandDisplaysMutex; Index: www/firefox/files/patch-bug1527804 =================================================================== --- www/firefox/files/patch-bug1527804 +++ /dev/null @@ -1,218 +0,0 @@ -commit 1579a88e491f -Author: sotaro -Date: Mon Feb 25 12:15:50 2019 +0000 - - Bug 1527804 - Trigger composite from frame_callback_handler() r=stransky - - Bug 1514156 expects that nsWindow::OnExposeEvent() is called after frame_callback_handler() called. But it did not happen during opening add-ons(gecko profiler). Then we need to trigger rendering directly from frame_callback_handler() call. - - Differential Revision: https://phabricator.services.mozilla.com/D20272 - - --HG-- - extra : moz-landing-system : lando ---- - widget/gtk/mozcontainer.cpp | 20 +++++++++---------- - widget/gtk/mozcontainer.h | 7 +++++-- - widget/gtk/nsWindow.cpp | 48 ++++++++++++++++++++++++++++++--------------- - widget/gtk/nsWindow.h | 4 +++- - 4 files changed, 50 insertions(+), 29 deletions(-) - -diff --git widget/gtk/mozcontainer.cpp widget/gtk/mozcontainer.cpp -index 77ac02e2a049..efe5f7ba86e3 100644 ---- widget/gtk/mozcontainer.cpp -+++ widget/gtk/mozcontainer.cpp -@@ -160,7 +160,7 @@ void moz_container_init(MozContainer *container) { - // We can draw to x11 window any time. 
- container->ready_to_draw = GDK_IS_X11_DISPLAY(gdk_display_get_default()); - container->surface_needs_clear = true; -- container->egl_surface_needs_update = false; -+ container->inital_draw_cb = nullptr; - #endif - } - -@@ -178,12 +178,18 @@ static void frame_callback_handler(void *data, struct wl_callback *callback, - uint32_t time) { - MozContainer *container = MOZ_CONTAINER(data); - g_clear_pointer(&container->frame_callback_handler, wl_callback_destroy); -- if (!container->ready_to_draw) { -- container->egl_surface_needs_update = true; -+ if (!container->ready_to_draw && container->inital_draw_cb) { -+ container->inital_draw_cb(); - } - container->ready_to_draw = true; - } - -+void moz_container_set_initial_draw_callback( -+ MozContainer *container, -+ std::function inital_draw_cb) { -+ container->inital_draw_cb = inital_draw_cb; -+} -+ - static const struct wl_callback_listener frame_listener = { - frame_callback_handler}; - -@@ -214,8 +220,8 @@ static void moz_container_unmap_wayland(MozContainer *container) { - g_clear_pointer(&container->frame_callback_handler, wl_callback_destroy); - - container->surface_needs_clear = true; -- container->egl_surface_needs_update = false; - container->ready_to_draw = false; -+ container->inital_draw_cb = nullptr; - } - - static gint moz_container_get_scale(MozContainer *container) { -@@ -560,12 +566,6 @@ gboolean moz_container_surface_needs_clear(MozContainer *container) { - container->surface_needs_clear = false; - return state; - } -- --gboolean moz_container_egl_surface_needs_update(MozContainer *container){ -- gboolean state = container->egl_surface_needs_update; -- container->egl_surface_needs_update = false; -- return state; --} - #endif - - void moz_container_force_default_visual(MozContainer *container) { -diff --git widget/gtk/mozcontainer.h widget/gtk/mozcontainer.h -index ae6d656646c8..51be814ef975 100644 ---- widget/gtk/mozcontainer.h -+++ widget/gtk/mozcontainer.h -@@ -9,6 +9,7 @@ - #define __MOZ_CONTAINER_H__ - - #include -+#include - - /* - * MozContainer -@@ -77,8 +78,8 @@ struct _MozContainer { - struct wl_egl_window *eglwindow; - struct wl_callback *frame_callback_handler; - gboolean surface_needs_clear; -- gboolean egl_surface_needs_update; - gboolean ready_to_draw; -+ std::function inital_draw_cb; - #endif - gboolean force_default_visual; - }; -@@ -101,7 +102,9 @@ gboolean moz_container_has_wl_egl_window(MozContainer *container); - gboolean moz_container_surface_needs_clear(MozContainer *container); - void moz_container_scale_changed(MozContainer *container, - GtkAllocation *aAllocation); --gboolean moz_container_egl_surface_needs_update(MozContainer *container); -+void moz_container_set_initial_draw_callback( -+ MozContainer *container, -+ std::function inital_draw_cb); - #endif - - #endif /* __MOZ_CONTAINER_H__ */ -diff --git widget/gtk/nsWindow.cpp widget/gtk/nsWindow.cpp -index acb957d3fb55..54b121ec5514 100644 ---- widget/gtk/nsWindow.cpp -+++ widget/gtk/nsWindow.cpp -@@ -675,6 +675,12 @@ void nsWindow::Destroy() { - gFocusWindow = nullptr; - } - -+#ifdef MOZ_WAYLAND -+ if (mContainer) { -+ moz_container_set_initial_draw_callback(mContainer, nullptr); -+ } -+#endif -+ - GtkWidget *owningWidget = GetMozContainerWidget(); - if (mShell) { - gtk_widget_destroy(mShell); -@@ -1860,6 +1866,23 @@ static bool ExtractExposeRegion(LayoutDeviceIntRegion &aRegion, cairo_t *cr) { - return true; - } - -+#ifdef MOZ_WAYLAND -+void nsWindow::WaylandEGLSurfaceForceRedraw() { -+ MOZ_RELEASE_ASSERT(NS_IsMainThread()); -+ -+ if (mIsDestroyed) { 
-+ return; -+ } -+ -+ if (CompositorBridgeChild* remoteRenderer = GetRemoteRenderer()) { -+ if (mCompositorWidgetDelegate) { -+ mCompositorWidgetDelegate->RequestsUpdatingEGLSurface(); -+ } -+ remoteRenderer->SendForcePresent(); -+ } -+} -+#endif -+ - gboolean nsWindow::OnExposeEvent(cairo_t *cr) { - // Send any pending resize events so that layout can update. - // May run event loop. -@@ -1888,11 +1911,6 @@ gboolean nsWindow::OnExposeEvent(cairo_t *cr) { - region.ScaleRoundOut(scale, scale); - - if (GetLayerManager()->AsKnowsCompositor() && mCompositorSession) { --#ifdef MOZ_WAYLAND -- if(mCompositorWidgetDelegate && WaylandRequestsUpdatingEGLSurface()) { -- mCompositorWidgetDelegate->RequestsUpdatingEGLSurface(); -- } --#endif - // We need to paint to the screen even if nothing changed, since if we - // don't have a compositing window manager, our pixels could be stale. - GetLayerManager()->SetNeedsComposite(true); -@@ -3454,6 +3472,15 @@ nsresult nsWindow::Create(nsIWidget *aParent, nsNativeWidget aNativeParent, - // Create a container to hold child windows and child GtkWidgets. - GtkWidget *container = moz_container_new(); - mContainer = MOZ_CONTAINER(container); -+#ifdef MOZ_WAYLAND -+ if (!mIsX11Display && ComputeShouldAccelerate()) { -+ RefPtr self(this); -+ moz_container_set_initial_draw_callback(mContainer, -+ [self]() -> void { -+ self->WaylandEGLSurfaceForceRedraw(); -+ }); -+ } -+#endif - - // "csd" style is set when widget is realized so we need to call - // it explicitly now. -@@ -6564,17 +6591,6 @@ bool nsWindow::WaylandSurfaceNeedsClear() { - "nsWindow::WaylandSurfaceNeedsClear(): We don't have any mContainer!"); - return false; - } -- --bool nsWindow::WaylandRequestsUpdatingEGLSurface() { -- if (mContainer) { -- return moz_container_egl_surface_needs_update(MOZ_CONTAINER(mContainer)); -- } -- -- NS_WARNING( -- "nsWindow::WaylandSurfaceNeedsClear(): We don't have any mContainer!"); -- return false; --} -- - #endif - - #ifdef MOZ_X11 -diff --git widget/gtk/nsWindow.h widget/gtk/nsWindow.h -index 5d119b4911e1..dbced693be1c 100644 ---- widget/gtk/nsWindow.h -+++ widget/gtk/nsWindow.h -@@ -245,6 +245,9 @@ class nsWindow final : public nsBaseWidget { - - void DispatchContextMenuEventFromMouseEvent(uint16_t domButton, - GdkEventButton* aEvent); -+#ifdef MOZ_WAYLAND -+ void WaylandEGLSurfaceForceRedraw(); -+#endif - - public: - void ThemeChanged(void); -@@ -342,7 +345,6 @@ class nsWindow final : public nsBaseWidget { - wl_display* GetWaylandDisplay(); - wl_surface* GetWaylandSurface(); - bool WaylandSurfaceNeedsClear(); -- bool WaylandRequestsUpdatingEGLSurface(); - #endif - virtual void GetCompositorWidgetInitData( - mozilla::widget::CompositorWidgetInitData* aInitData) override; Index: www/firefox/files/patch-bug1532024 =================================================================== --- www/firefox/files/patch-bug1532024 +++ /dev/null @@ -1,106 +0,0 @@ -commit 554777b1d130 -Author: sotaro -Date: Wed Mar 6 08:17:51 2019 +0000 - - Bug 1532024 - Handle a case that GetRemoteRenderer() returned nullptr r=stransky - - Differential Revision: https://phabricator.services.mozilla.com/D21831 - - --HG-- - extra : moz-landing-system : lando ---- - widget/gtk/nsWindow.cpp | 27 ++++++++++++++++++--------- - widget/gtk/nsWindow.h | 3 +++ - 2 files changed, 21 insertions(+), 9 deletions(-) - -diff --git widget/gtk/nsWindow.cpp widget/gtk/nsWindow.cpp -index 96faeea1cf87..46956412f341 100644 ---- widget/gtk/nsWindow.cpp -+++ widget/gtk/nsWindow.cpp -@@ -399,6 +399,10 @@ nsWindow::nsWindow() 
{ - mXDepth = 0; - #endif /* MOZ_X11 */ - -+#ifdef MOZ_WAYLAND -+ mNeedsUpdatingEGLSurface = false; -+#endif -+ - if (!gGlobalsInitialized) { - gGlobalsInitialized = true; - -@@ -1872,12 +1876,14 @@ static bool ExtractExposeRegion(LayoutDeviceIntRegion &aRegion, cairo_t *cr) { - void nsWindow::WaylandEGLSurfaceForceRedraw() { - MOZ_RELEASE_ASSERT(NS_IsMainThread()); - -- if (mIsDestroyed) { -+ if (mIsDestroyed || !mNeedsUpdatingEGLSurface) { - return; - } - -- if (CompositorBridgeChild* remoteRenderer = GetRemoteRenderer()) { -+ if (CompositorBridgeChild *remoteRenderer = GetRemoteRenderer()) { -+ MOZ_ASSERT(mCompositorWidgetDelegate); - if (mCompositorWidgetDelegate) { -+ mNeedsUpdatingEGLSurface = false; - mCompositorWidgetDelegate->RequestsUpdatingEGLSurface(); - } - remoteRenderer->SendForcePresent(); -@@ -3483,10 +3489,10 @@ nsresult nsWindow::Create(nsIWidget *aParent, nsNativeWidget aNativeParent, - #ifdef MOZ_WAYLAND - if (!mIsX11Display && ComputeShouldAccelerate()) { - RefPtr self(this); -- moz_container_set_initial_draw_callback(mContainer, -- [self]() -> void { -- self->WaylandEGLSurfaceForceRedraw(); -- }); -+ moz_container_set_initial_draw_callback(mContainer, [self]() -> void { -+ self->mNeedsUpdatingEGLSurface = true; -+ self->WaylandEGLSurfaceForceRedraw(); -+ }); - } - #endif - -@@ -6058,6 +6064,9 @@ void nsWindow::SetCompositorWidgetDelegate(CompositorWidgetDelegate *delegate) { - MOZ_ASSERT(mCompositorWidgetDelegate, - "nsWindow::SetCompositorWidgetDelegate called with a " - "non-PlatformCompositorWidgetDelegate"); -+#ifdef MOZ_WAYLAND -+ WaylandEGLSurfaceForceRedraw(); -+#endif - } else { - mCompositorWidgetDelegate = nullptr; - } -@@ -6543,8 +6552,7 @@ nsWindow::CSDSupportLevel nsWindow::GetSystemCSDSupportLevel() { - // Check for Mutter regression on X.org (Bug 1530252). In that case we - // don't hide system titlebar by default as we can't draw transparent - // corners reliably. --bool nsWindow::TitlebarCanUseShapeMask() --{ -+bool nsWindow::TitlebarCanUseShapeMask() { - static int canUseShapeMask = -1; - if (canUseShapeMask != -1) { - return canUseShapeMask; -@@ -6574,7 +6582,8 @@ bool nsWindow::HideTitlebarByDefault() { - // When user defined widget.default-hidden-titlebar don't do any - // heuristics and just follow it. - if (Preferences::HasUserValue("widget.default-hidden-titlebar")) { -- hideTitlebar = Preferences::GetBool("widget.default-hidden-titlebar", false); -+ hideTitlebar = -+ Preferences::GetBool("widget.default-hidden-titlebar", false); - return hideTitlebar; - } - -diff --git widget/gtk/nsWindow.h widget/gtk/nsWindow.h -index 309905757431..ae2ebd350af4 100644 ---- widget/gtk/nsWindow.h -+++ widget/gtk/nsWindow.h -@@ -433,6 +433,9 @@ class nsWindow final : public nsBaseWidget { - bool mIsDragPopup; - // Can we access X? - bool mIsX11Display; -+#ifdef MOZ_WAYLAND -+ bool mNeedsUpdatingEGLSurface; -+#endif - - private: - void DestroyChildWindows(); Index: www/firefox/files/patch-bug1533559 =================================================================== --- /dev/null +++ www/firefox/files/patch-bug1533559 @@ -0,0 +1,24808 @@ +commit eed1f33ab912 +Author: Thomas Daede +Date: Tue Mar 19 17:35:09 2019 +0000 + + Bug 1533559 - Update libdav1d to 0.2.1. 
r=achronop + + Differential Revision: https://phabricator.services.mozilla.com/D23761 + + --HG-- + rename : media/libdav1d/version.h => media/libdav1d/vcs_version.h + rename : third_party/dav1d/include/meson.build => third_party/dav1d/include/dav1d/meson.build + rename : third_party/dav1d/src/obu.h => third_party/dav1d/include/dav1d/version.h.in + rename : third_party/dav1d/include/version.h.in => third_party/dav1d/include/vcs_version.h.in + rename : third_party/dav1d/src/thread_task.h => third_party/dav1d/src/log.c + rename : third_party/dav1d/src/cpu.h => third_party/dav1d/src/log.h + extra : moz-landing-system : lando +--- + media/libdav1d/README_MOZILLA | 4 - + media/libdav1d/asm/moz.build | 2 + + media/libdav1d/dav1d.rc | 18 +- + media/libdav1d/moz.build | 2 + + media/libdav1d/moz.yaml | 2 +- + media/libdav1d/vcs_version.h | 2 + + media/libdav1d/version.h | 36 +- + python/mozbuild/mozbuild/vendor_dav1d.py | 14 +- + third_party/dav1d/.gitlab-ci.yml | 181 +- + third_party/dav1d/COPYING | 2 +- + third_party/dav1d/NEWS | 22 +- + third_party/dav1d/README.md | 8 +- + third_party/dav1d/THANKS.md | 2 +- + third_party/dav1d/include/common/attributes.h | 8 +- + third_party/dav1d/include/common/bitdepth.h | 6 +- + third_party/dav1d/include/common/dump.h | 6 +- + third_party/dav1d/include/common/intops.h | 6 +- + third_party/dav1d/include/common/mem.h | 6 +- + third_party/dav1d/include/common/validate.h | 6 +- + third_party/dav1d/include/dav1d/common.h | 6 +- + third_party/dav1d/include/dav1d/data.h | 8 +- + third_party/dav1d/include/dav1d/dav1d.h | 23 +- + third_party/dav1d/include/dav1d/headers.h | 53 +- + third_party/dav1d/include/dav1d/meson.build | 41 + + third_party/dav1d/include/dav1d/picture.h | 39 +- + third_party/dav1d/include/dav1d/version.h.in | 34 + + third_party/dav1d/include/meson.build | 9 +- + third_party/dav1d/include/vcs_version.h.in | 2 + + third_party/dav1d/include/version.h.in | 2 - + third_party/dav1d/meson.build | 39 +- + third_party/dav1d/meson_options.txt | 9 + + third_party/dav1d/snap/snapcraft.yaml | 24 + + third_party/dav1d/src/arm/32/looprestoration.S | 685 +++++ + third_party/dav1d/src/arm/32/mc.S | 2117 +++++++++++++ + third_party/dav1d/src/arm/32/util.S | 34 +- + third_party/dav1d/src/arm/64/cdef.S | 603 ++++ + third_party/dav1d/src/arm/64/looprestoration.S | 58 +- + third_party/dav1d/src/arm/64/mc.S | 323 +- + third_party/dav1d/src/arm/64/util.S | 35 +- + third_party/dav1d/src/arm/asm.S | 11 +- + third_party/dav1d/src/arm/cdef_init_tmpl.c | 86 + + third_party/dav1d/src/arm/cpu.c | 2 +- + third_party/dav1d/src/arm/cpu.h | 6 +- + .../dav1d/src/arm/looprestoration_init_tmpl.c | 8 +- + third_party/dav1d/src/arm/mc_init_tmpl.c | 9 +- + third_party/dav1d/src/cdef.h | 15 +- + third_party/dav1d/src/cdef_apply.h | 6 +- + third_party/dav1d/src/cdef_apply_tmpl.c | 28 +- + third_party/dav1d/src/cdef_tmpl.c | 35 +- + third_party/dav1d/src/cdf.h | 6 +- + third_party/dav1d/src/cpu.h | 6 +- + third_party/dav1d/src/ctx.h | 6 +- + third_party/dav1d/src/data.c | 22 +- + third_party/dav1d/src/data.h | 8 +- + third_party/dav1d/src/dav1d.rc.in | 16 +- + third_party/dav1d/src/decode.c | 445 +-- + third_party/dav1d/src/decode.h | 6 +- + third_party/dav1d/src/dequant_tables.h | 6 +- + third_party/dav1d/src/env.h | 10 +- + third_party/dav1d/src/ext/x86/x86inc.asm | 181 +- + third_party/dav1d/src/film_grain.h | 6 +- + third_party/dav1d/src/getbits.c | 23 +- + third_party/dav1d/src/getbits.h | 9 +- + third_party/dav1d/src/internal.h | 33 +- + third_party/dav1d/src/intra_edge.h | 6 +- + 
third_party/dav1d/src/ipred.h | 6 +- + third_party/dav1d/src/ipred_prepare.h | 10 +- + third_party/dav1d/src/ipred_prepare_tmpl.c | 4 +- + third_party/dav1d/src/ipred_tmpl.c | 2 +- + third_party/dav1d/src/itx.h | 6 +- + third_party/dav1d/src/itx_tmpl.c | 68 +- + third_party/dav1d/src/levels.h | 14 +- + third_party/dav1d/src/lf_apply.h | 6 +- + third_party/dav1d/src/lf_mask.c | 8 +- + third_party/dav1d/src/lf_mask.h | 6 +- + third_party/dav1d/src/lib.c | 133 +- + third_party/dav1d/src/log.c | 57 + + third_party/dav1d/src/log.h | 47 + + third_party/dav1d/src/loopfilter.h | 6 +- + third_party/dav1d/src/looprestoration.h | 6 +- + third_party/dav1d/src/looprestoration_tmpl.c | 8 +- + third_party/dav1d/src/lr_apply.h | 6 +- + third_party/dav1d/src/lr_apply_tmpl.c | 8 +- + third_party/dav1d/src/mc.h | 6 +- + third_party/dav1d/src/mc_tmpl.c | 75 +- + third_party/dav1d/src/meson.build | 27 +- + third_party/dav1d/src/msac.c | 102 +- + third_party/dav1d/src/msac.h | 29 +- + third_party/dav1d/src/obu.c | 134 +- + third_party/dav1d/src/obu.h | 6 +- + third_party/dav1d/src/picture.c | 71 +- + third_party/dav1d/src/picture.h | 15 +- + third_party/dav1d/src/qm.h | 6 +- + third_party/dav1d/src/recon.h | 6 +- + third_party/dav1d/src/recon_tmpl.c | 58 +- + third_party/dav1d/src/ref.h | 6 +- + third_party/dav1d/src/ref_mvs.c | 4 +- + third_party/dav1d/src/ref_mvs.h | 6 +- + third_party/dav1d/src/scan.h | 6 +- + third_party/dav1d/src/tables.c | 237 +- + third_party/dav1d/src/tables.h | 8 +- + third_party/dav1d/src/thread.h | 64 +- + third_party/dav1d/src/thread_data.h | 7 +- + third_party/dav1d/src/thread_task.c | 2 +- + third_party/dav1d/src/thread_task.h | 6 +- + third_party/dav1d/src/warpmv.c | 12 +- + third_party/dav1d/src/warpmv.h | 6 +- + third_party/dav1d/src/wedge.h | 6 +- + third_party/dav1d/src/win32/thread.c | 73 +- + third_party/dav1d/src/x86/cdef.asm | 284 +- + third_party/dav1d/src/x86/cdef_init_tmpl.c | 17 + + third_party/dav1d/src/x86/cdef_ssse3.asm | 1306 ++++++++ + third_party/dav1d/src/x86/cpu.c | 3 +- + third_party/dav1d/src/x86/cpu.h | 6 +- + third_party/dav1d/src/x86/ipred.asm | 16 +- + third_party/dav1d/src/x86/ipred_init_tmpl.c | 20 + + third_party/dav1d/src/x86/ipred_ssse3.asm | 1144 ++++++- + third_party/dav1d/src/x86/itx_init_tmpl.c | 26 +- + third_party/dav1d/src/x86/itx_ssse3.asm | 2601 ++++++++++++++-- + third_party/dav1d/src/x86/looprestoration.asm | 232 +- + .../dav1d/src/x86/looprestoration_init_tmpl.c | 316 +- + .../dav1d/src/x86/looprestoration_ssse3.asm | 1826 ++++++++++++ + third_party/dav1d/src/x86/mc.asm | 664 +++-- + third_party/dav1d/src/x86/mc_init_tmpl.c | 43 + + third_party/dav1d/src/x86/mc_ssse3.asm | 3154 ++++++++++++++++++-- + third_party/dav1d/tests/checkasm/cdef.c | 8 +- + third_party/dav1d/tests/checkasm/checkasm.c | 21 +- + third_party/dav1d/tests/checkasm/checkasm.h | 6 +- + third_party/dav1d/tests/checkasm/itx.c | 2 +- + third_party/dav1d/tests/checkasm/mc.c | 103 +- + third_party/dav1d/tests/libfuzzer/alloc_fail.c | 37 + + third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h | 6 +- + third_party/dav1d/tests/libfuzzer/main.c | 15 +- + third_party/dav1d/tests/meson.build | 3 + + third_party/dav1d/tools/dav1d.c | 9 +- + third_party/dav1d/tools/dav1d_cli_parse.c | 8 +- + third_party/dav1d/tools/dav1d_cli_parse.h | 8 +- + third_party/dav1d/tools/input/annexb.c | 6 +- + third_party/dav1d/tools/input/demuxer.h | 6 +- + third_party/dav1d/tools/input/input.c | 2 +- + third_party/dav1d/tools/input/input.h | 6 +- + third_party/dav1d/tools/input/ivf.c | 12 +- + 
third_party/dav1d/tools/output/md5.c | 74 +- + third_party/dav1d/tools/output/muxer.h | 6 +- + third_party/dav1d/tools/output/output.c | 2 +- + third_party/dav1d/tools/output/output.h | 6 +- + 146 files changed, 16492 insertions(+), 2363 deletions(-) + +diff --git media/libdav1d/README_MOZILLA media/libdav1d/README_MOZILLA +index 67fba4eec3ec..8e1cd4a4bf0e 100644 +--- media/libdav1d/README_MOZILLA ++++ media/libdav1d/README_MOZILLA +@@ -18,7 +18,3 @@ The upstream git repository is https://aomedia.googlesource.com/aom + To update to a fork, use + + ./mach vendor dav1d --repo [-r ] +- +-The last update was pulled from https://code.videolan.org/videolan/dav1d +- +-The git commit ID used was 197a19ad702d5e7472852efcde98feeb07f373e0 (2018-11-26T12:15:41.000Z). +diff --git media/libdav1d/asm/moz.build media/libdav1d/asm/moz.build +index 4b2c906888d2..1cfa59cc17aa 100644 +--- media/libdav1d/asm/moz.build ++++ media/libdav1d/asm/moz.build +@@ -74,9 +74,11 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): + ] + + SOURCES += [ ++ '../../../third_party/dav1d/src/x86/cdef_ssse3.asm', + '../../../third_party/dav1d/src/x86/cpuid.asm', + '../../../third_party/dav1d/src/x86/ipred_ssse3.asm', + '../../../third_party/dav1d/src/x86/itx_ssse3.asm', ++ '../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm', + '../../../third_party/dav1d/src/x86/mc_ssse3.asm', + ] + +diff --git media/libdav1d/dav1d.rc media/libdav1d/dav1d.rc +index 4e38ea6d8b13..d6fd7648722f 100644 +--- media/libdav1d/dav1d.rc ++++ media/libdav1d/dav1d.rc +@@ -1,13 +1,15 @@ +-#define VERSION_NUMBER 0,0,1,0 +-#define VERSION_NUMBER_STR "0.0.1.0" ++#define API_VERSION_NUMBER 1,0,1,0 ++#define API_VERSION_NUMBER_STR "1.0.1" ++#define PROJECT_VERSION_NUMBER 0,2,2,0 ++#define PROJECT_VERSION_NUMBER_STR "0.2.2" + + #include + + 1 VERSIONINFO + FILETYPE VFT_DLL + FILEOS VOS_NT_WINDOWS32 +-PRODUCTVERSION VERSION_NUMBER +-FILEVERSION VERSION_NUMBER ++PRODUCTVERSION PROJECT_VERSION_NUMBER ++FILEVERSION API_VERSION_NUMBER + BEGIN + BLOCK "StringFileInfo" + BEGIN +@@ -15,12 +17,12 @@ BEGIN + BEGIN + VALUE "CompanyName", "VideoLAN" + VALUE "ProductName", "dav1d" +- VALUE "ProductVersion", VERSION_NUMBER_STR +- VALUE "FileVersion", VERSION_NUMBER_STR +- VALUE "FileDescription", "dav1d AV1 decoder" ++ VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR ++ VALUE "FileVersion", API_VERSION_NUMBER_STR ++ VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" + VALUE "InternalName", "dav1d" + VALUE "OriginalFilename", "libdav1d.dll" +- VALUE "LegalCopyright", "Copyright \251 2018 VideoLAN and dav1d Authors" ++ VALUE "LegalCopyright", "Copyright \251 2019 VideoLAN and dav1d Authors" + END + END + BLOCK "VarFileInfo" +diff --git media/libdav1d/moz.build media/libdav1d/moz.build +index 4afc99841b3d..45a5684c1d1d 100644 +--- media/libdav1d/moz.build ++++ media/libdav1d/moz.build +@@ -61,6 +61,7 @@ SOURCES += [ + '../../third_party/dav1d/src/getbits.c', + '../../third_party/dav1d/src/intra_edge.c', + '../../third_party/dav1d/src/lf_mask.c', ++ '../../third_party/dav1d/src/log.c', + '../../third_party/dav1d/src/msac.c', + '../../third_party/dav1d/src/obu.c', + '../../third_party/dav1d/src/picture.c', +@@ -85,6 +86,7 @@ EXPORTS.dav1d.src += [ + '../../third_party/dav1d/src/getbits.h', + '../../third_party/dav1d/src/intra_edge.h', + '../../third_party/dav1d/src/lf_mask.h', ++ '../../third_party/dav1d/src/log.h', + '../../third_party/dav1d/src/msac.h', + '../../third_party/dav1d/src/obu.h', + '../../third_party/dav1d/src/picture.h', +diff --git 
media/libdav1d/moz.yaml media/libdav1d/moz.yaml +index 3c6111989010..6f8c1e15a4a4 100644 +--- media/libdav1d/moz.yaml ++++ media/libdav1d/moz.yaml +@@ -20,7 +20,7 @@ origin: + + # Human-readable identifier for this version/release + # Generally "version NNN", "tag SSS", "bookmark SSS" +- release: commit f813285c1d1a5421e0180efbb7cbdd377cd31c69 (2019-01-13T22:08:25.000Z). ++ release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z). + + # The package's license, where possible using the mnemonic from + # https://spdx.org/licenses/ +diff --git media/libdav1d/vcs_version.h media/libdav1d/vcs_version.h +new file mode 100644 +index 000000000000..884fbe72478e +--- /dev/null ++++ media/libdav1d/vcs_version.h +@@ -0,0 +1,2 @@ ++/* auto-generated, do not edit */ ++#define DAV1D_VERSION "0.2.2" +diff --git media/libdav1d/version.h media/libdav1d/version.h +index 94b4261e13c5..090907b1dc74 100644 +--- media/libdav1d/version.h ++++ media/libdav1d/version.h +@@ -1,2 +1,34 @@ +-/* auto-generated, do not edit */ +-#define DAV1D_VERSION "0.1.1" ++/* ++ * Copyright © 2019, VideoLAN and dav1d authors ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef DAV1D_VERSION_H ++#define DAV1D_VERSION_H ++ ++#define DAV1D_API_VERSION_MAJOR 1 ++#define DAV1D_API_VERSION_MINOR 0 ++#define DAV1D_API_VERSION_PATCH 1 ++ ++#endif /* DAV1D_VERSION_H */ +diff --git python/mozbuild/mozbuild/vendor_dav1d.py python/mozbuild/mozbuild/vendor_dav1d.py +index 72db938afffa..27b98c980658 100644 +--- python/mozbuild/mozbuild/vendor_dav1d.py ++++ python/mozbuild/mozbuild/vendor_dav1d.py +@@ -105,6 +105,16 @@ Please set a repository url with --repo on either googlesource or github.''' % h + with open(filename, 'w') as f: + f.write(new_yaml) + ++ def update_vcs_version(self, revision, vendor_dir, glue_dir): ++ src_filename = mozpath.join(vendor_dir, 'include/vcs_version.h.in') ++ dst_filename = mozpath.join(glue_dir, 'vcs_version.h') ++ with open(src_filename) as f: ++ vcs_version_in = f.read() ++ vcs_version = vcs_version_in.replace('@VCS_TAG@', revision) ++ with open(dst_filename, 'w') as f: ++ f.write(vcs_version) ++ ++ + def clean_upstream(self, target): + '''Remove files we don't want to import.''' + mozfile.remove(mozpath.join(target, '.gitattributes')) +@@ -154,7 +164,9 @@ Please commit or stash these changes before vendoring, or re-run with `--ignore- + self.log(logging.INFO, 'update_moz.yaml', {}, + '''Updating moz.yaml.''') + self.update_yaml(commit, timestamp, glue_dir) +- self.repository.add_remove_files(vendor_dir) ++ self.log(logging.INFO, 'update_vcs_version', {}, ++ '''Updating vcs_version.h.''') ++ self.update_vcs_version(commit, vendor_dir, glue_dir) + self.log(logging.INFO, 'add_remove_files', {}, + '''Registering changes with version control.''') + self.repository.add_remove_files(vendor_dir) +diff --git third_party/dav1d/.gitlab-ci.yml third_party/dav1d/.gitlab-ci.yml +index 8d4233cc5672..c22bf54fe24f 100644 +--- third_party/dav1d/.gitlab-ci.yml ++++ third_party/dav1d/.gitlab-ci.yml +@@ -4,17 +4,32 @@ stages: + - test + + style-check: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: style + tags: + - debian + - amd64 + script: +- - git grep -n -e $'\t' --or -e $'\r' -- . ':(exclude)*/compat/*' && exit 1 +- - /bin/true ++ - git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1 ++ - git grep -n -i "david" -- . 
':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1 ++ - git remote rm upstream 2> /dev/null || true ++ - git remote add upstream https://code.videolan.org/videolan/dav1d.git ++ - git fetch -q upstream master ++ - for i in $(git rev-list HEAD ^upstream/master); do ++ echo "Checking commit message of $i"; ++ msg="$(git log --format=%B -n 1 $i)"; ++ if [ -n "$(echo "$msg" | awk "NR==2")" ]; then ++ echo "Malformed commit message in $i, second line must be empty"; ++ exit 1; ++ fi; ++ if echo "$msg" | head -1 | grep -q '\.$'; then ++ echo "Malformed commit message in $i, trailing period in subject line"; ++ exit 1; ++ fi; ++ done + + build-debian: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: build + tags: + - debian +@@ -25,7 +40,7 @@ build-debian: + - cd build && meson test -v + + build-debian-static: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: build + tags: + - debian +@@ -49,11 +64,13 @@ build-debian32: + - cd build && meson test -v + + build-win32: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: build + tags: +- - win32 ++ - debian ++ - amd64 + script: ++ - wineserver -p && wine wineboot + - meson build --buildtype release + --werror + --libdir lib +@@ -62,18 +79,36 @@ build-win32: + -Ddefault_library=both + - ninja -C build + - ninja -C build install ++ - cd build && meson test -v + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - build/dav1d_install/ + expire_in: 1 week + ++build-win32-unaligned-stack: ++ image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533 ++ stage: build ++ tags: ++ - debian ++ - amd64 ++ script: ++ - wineserver -p && wine wineboot ++ - meson build --buildtype release ++ --werror ++ --cross-file /opt/crossfiles/i686-w64-mingw32.meson ++ -Dstack_alignment=4 ++ - ninja -C build ++ - cd build && meson test -v ++ + build-win64: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: build + tags: +- - win64 ++ - debian ++ - amd64 + script: ++ - wineserver -p && wine wineboot + - meson build --buildtype release + --werror + --libdir lib +@@ -82,6 +117,43 @@ build-win64: + -Ddefault_library=both + - ninja -C build + - ninja -C build install ++ - cd build && meson test -v ++ artifacts: ++ name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" ++ paths: ++ - build/dav1d_install/ ++ expire_in: 1 week ++ ++build-win-arm32: ++ image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533 ++ stage: build ++ tags: ++ - debian ++ - amd64 ++ script: ++ - meson build --buildtype release ++ --werror ++ --libdir lib ++ --prefix "$(pwd)/build/dav1d_install" ++ --cross-file /opt/crossfiles/armv7-w64-mingw32.meson ++ -Ddefault_library=both ++ - ninja -C build ++ ++build-win-arm64: ++ image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533 ++ stage: build ++ tags: ++ - debian ++ - amd64 ++ script: ++ - meson build --buildtype release ++ --werror ++ --libdir lib ++ --prefix "$(pwd)/build/dav1d_install" ++ --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson ++ -Ddefault_library=both ++ - ninja -C build ++ - ninja -C build install + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: +@@ -129,14 
+201,57 @@ build-debian-werror: + - env CC='clang-7' meson build --buildtype debug --werror + - ninja -C build + ++build-debian-armv7: ++ stage: build ++ image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732 ++ tags: ++ - armv7 ++ - debian ++ script: ++ - meson build --buildtype release --werror ++ - ninja -C build ++ - cd build && meson test -v ++ ++build-debian-armv7-clang-5: ++ stage: build ++ image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732 ++ tags: ++ - armv7 ++ - debian ++ script: ++ - env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release ++ - ninja -C build ++ - cd build && meson test -v ++ ++build-ubuntu-snap: ++ stage: build ++ image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127 ++ tags: ++ - debian ++ - amd64 ++ script: ++ - snapcraft snap ++ - | ++ if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then ++ echo $SNAP_LOGIN | base64 --decode | snapcraft login --with - ++ snapcraft push dav1d_*.snap --release edge ++ snapcraft logout ++ fi ++ artifacts: ++ name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" ++ paths: ++ - dav1d_*.snap ++ expire_in: 1 week ++ allow_failure: true ++ + test-debian: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: test + tags: + - debian + - amd64 + cache: +- key: testdata.git ++ key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + script: +@@ -144,19 +259,19 @@ test-debian: + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data +- - meson build --buildtype release -Dtestdata_tests=true ++ - meson build --buildtype release -Dtestdata_tests=true -Dlogging=false + - ninja -C build + - cd build && time meson test -v + dependencies: [] + + test-debian-asan: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: test + tags: + - debian + - amd64 + cache: +- key: testdata.git ++ key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + variables: +@@ -166,19 +281,19 @@ test-debian-asan: + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data +- - meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=address -Dbuild_asm=false ++ - meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=address -Dbuild_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + dependencies: [] + + test-debian-msan: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: test + tags: + - debian + - amd64 + cache: +- key: testdata.git ++ key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + variables: +@@ -188,19 +303,19 @@ test-debian-msan: + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git 
git fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data +- - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false ++ - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + dependencies: [] + + test-debian-ubsan: +- image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132 ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 + stage: test + tags: + - debian + - amd64 + cache: +- key: testdata.git ++ key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + variables: +@@ -210,7 +325,31 @@ test-debian-ubsan: + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data +- - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false ++ - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + dependencies: [] ++ ++test-win64: ++ image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514 ++ stage: test ++ tags: ++ - debian ++ - amd64 ++ cache: ++ key: testdata.git-20190215 ++ paths: ++ - cache/dav1d-test-data.git/ ++ script: ++ - test -d cache || mkdir cache ++ - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master ++ - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git ++ - git clone cache/dav1d-test-data.git tests/dav1d-test-data ++ - wineserver -p && wine wineboot ++ - meson build --buildtype release ++ -Dtestdata_tests=true ++ -Dlogging=false ++ --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson ++ - ninja -C build ++ - cd build && time meson test -v ++ dependencies: [] +diff --git third_party/dav1d/COPYING third_party/dav1d/COPYING +index 0fe9207ac4cb..875b138ecf6f 100644 +--- third_party/dav1d/COPYING ++++ third_party/dav1d/COPYING +@@ -1,4 +1,4 @@ +-Copyright © 2018, VideoLAN and dav1d authors ++Copyright © 2018-2019, VideoLAN and dav1d authors + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without +diff --git third_party/dav1d/NEWS third_party/dav1d/NEWS +index f1e690d0f2a8..b3e31cabcae6 100644 +--- third_party/dav1d/NEWS ++++ third_party/dav1d/NEWS +@@ -1,9 +1,27 @@ +-Changes for 0.1.1 'Gazelle': ++Changes for 0.2.2 'Antelope': + ---------------------------- + ++ ++Changes for 0.2.1 'Antelope': ++---------------------------- ++ ++ - SSSE3 optimization for cdef_dir ++ - AVX-2 improvements of the existing CDEF optimizations ++ - NEON improvements of the existing CDEF and wiener optimizations ++ - Clarification about the numbering/versionning scheme ++ ++ ++Changes for 0.2.0 'Antelope': ++---------------------------- ++ ++ - ARM64 and ARM optimizations using NEON instructions ++ - SSSE3 optimizations for both 32 and 64bits ++ - More AVX-2 assembly, reaching almost completion + - Fix installation of includes + - Rewrite inverse transforms to avoid overflows +- - More AVX-2 assembly ++ - Snap packaging for Linux ++ - Updated API (ABI and API break) ++ - Fixes for un-decodable samples + + + Changes for 0.1.0 'Gazelle': +diff --git third_party/dav1d/README.md third_party/dav1d/README.md +index d69b49a1a812..bb8f7deb55df 100644 +--- third_party/dav1d/README.md ++++ third_party/dav1d/README.md +@@ -35,7 +35,7 @@ The plan is the folllowing: + 6. Make it fast on older desktop, by writing asm for SSE chips. + + ### After +-7. Improve C code base with [various tweaks](wiki/task-list), ++7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list), + 8. Accelerate for less common architectures, + 9. Use more GPU, when possible. + +@@ -70,7 +70,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr + + # Compile + +-1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher) ++1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher) + 2. Run `meson build --buildtype release` + 3. Build with `ninja -C build` + +@@ -88,6 +88,10 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr + git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data + ``` + 2. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true` and `-Dtestdata_tests=true` ++ ++ ``` ++ meson .test -Dbuild_tests=true -Dtestdata_tests=true ++ ``` + 3. In the build directory run `meson test` optionally with `-v` for more verbose output + + # Support +diff --git third_party/dav1d/THANKS.md third_party/dav1d/THANKS.md +index 14ca7c093cb5..cba0537f8a88 100644 +--- third_party/dav1d/THANKS.md ++++ third_party/dav1d/THANKS.md +@@ -16,4 +16,4 @@ The Alliance for Open Media (AOM) for funding this project. + + And all the dav1d Authors (git shortlog -sn), including: + +-Janne Grunau, Ronald S. Bultje, James Almer, Marvin Scholz, Henrik Gramner, Martin Storsjö, Luc Trudeau, David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Jean-Baptiste Kempf, Derek Buitenhuis, Nathan E. Egge, Raphaël Zumer, Francois Cartegnie, Niklas Haas, Konstantin Pavlov, Boyuan Xiao, Raphael Zumer and Michael Bradshaw. ++Janne Grunau, Ronald S. 
Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal. +diff --git third_party/dav1d/include/common/attributes.h third_party/dav1d/include/common/attributes.h +index 25e7b7eb359b..a9fd0faf23c2 100644 +--- third_party/dav1d/include/common/attributes.h ++++ third_party/dav1d/include/common/attributes.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_ATTRIBUTES_H__ +-#define __DAV1D_COMMON_ATTRIBUTES_H__ ++#ifndef DAV1D_COMMON_ATTRIBUTES_H ++#define DAV1D_COMMON_ATTRIBUTES_H + + #include "config.h" + +@@ -34,8 +34,10 @@ + + #ifdef __GNUC__ + #define ATTR_ALIAS __attribute__((may_alias)) ++#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))); + #else + #define ATTR_ALIAS ++#define ATTR_FORMAT_PRINTF(fmt, attr) + #endif + + #if ARCH_X86_64 +@@ -137,4 +139,4 @@ static inline int clzll(const unsigned long long mask) { + } + #endif /* !_MSC_VER */ + +-#endif /* __DAV1D_COMMON_ATTRIBUTES_H__ */ ++#endif /* DAV1D_COMMON_ATTRIBUTES_H */ +diff --git third_party/dav1d/include/common/bitdepth.h third_party/dav1d/include/common/bitdepth.h +index ad4b3ab9cc8d..f13d8ca14f62 100644 +--- third_party/dav1d/include/common/bitdepth.h ++++ third_party/dav1d/include/common/bitdepth.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_BITDEPTH_H__ +-#define __DAV1D_COMMON_BITDEPTH_H__ 1 ++#ifndef DAV1D_COMMON_BITDEPTH_H ++#define DAV1D_COMMON_BITDEPTH_H + + #include + #include +@@ -77,4 +77,4 @@ static inline void pixel_set(pixel *const dst, const int val, const int num) { + name##_8bpc(__VA_ARGS__); \ + name##_16bpc(__VA_ARGS__) + +-#endif /* __DAV1D_COMMON_BITDEPTH_H__ */ ++#endif /* DAV1D_COMMON_BITDEPTH_H */ +diff --git third_party/dav1d/include/common/dump.h third_party/dav1d/include/common/dump.h +index e76fe1fe7066..4d2b1bae7cc4 100644 +--- third_party/dav1d/include/common/dump.h ++++ third_party/dav1d/include/common/dump.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_DUMP_H__ +-#define __DAV1D_COMMON_DUMP_H__ ++#ifndef DAV1D_COMMON_DUMP_H ++#define DAV1D_COMMON_DUMP_H + + #include + #include +@@ -83,4 +83,4 @@ static inline void ac_dump(const int16_t *buf, int w, int h, const char *what) + } + } + +-#endif /* __DAV1D_COMMON_DUMP_H__ */ ++#endif /* DAV1D_COMMON_DUMP_H */ +diff --git third_party/dav1d/include/common/intops.h third_party/dav1d/include/common/intops.h +index 62c46afcd179..119caab6b8a5 100644 +--- third_party/dav1d/include/common/intops.h ++++ third_party/dav1d/include/common/intops.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_COMMON_INTOPS_H__ +-#define __DAV1D_COMMON_INTOPS_H__ ++#ifndef DAV1D_COMMON_INTOPS_H ++#define DAV1D_COMMON_INTOPS_H + + #include + +@@ -73,4 +73,4 @@ static inline unsigned inv_recenter(const unsigned r, const unsigned v) { + return r - ((v + 1) >> 1); + } + +-#endif /* __DAV1D_COMMON_INTOPS_H__ */ ++#endif /* DAV1D_COMMON_INTOPS_H */ +diff --git third_party/dav1d/include/common/mem.h third_party/dav1d/include/common/mem.h +index d68639fae3dc..3fbcdbca1819 100644 +--- third_party/dav1d/include/common/mem.h ++++ third_party/dav1d/include/common/mem.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_MEM_H__ +-#define __DAV1D_COMMON_MEM_H__ ++#ifndef DAV1D_COMMON_MEM_H ++#define DAV1D_COMMON_MEM_H + + #include + #include +@@ -80,4 +80,4 @@ static inline void freep(void *ptr) { + } + } + +-#endif /* __DAV1D_COMMON_MEM_H__ */ ++#endif /* DAV1D_COMMON_MEM_H */ +diff --git third_party/dav1d/include/common/validate.h third_party/dav1d/include/common/validate.h +index 91147406f84a..3096f3db8ee1 100644 +--- third_party/dav1d/include/common/validate.h ++++ third_party/dav1d/include/common/validate.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_VALIDATE_H__ +-#define __DAV1D_COMMON_VALIDATE_H__ ++#ifndef DAV1D_COMMON_VALIDATE_H ++#define DAV1D_COMMON_VALIDATE_H + + #include + #include +@@ -56,4 +56,4 @@ + + #define validate_input(x) validate_input_or_ret(x, ) + +-#endif /* __DAV1D_COMMON_VALIDATE_H__ */ ++#endif /* DAV1D_COMMON_VALIDATE_H */ +diff --git third_party/dav1d/include/dav1d/common.h third_party/dav1d/include/dav1d/common.h +index 4fd6df3e5c55..5223f1e86241 100644 +--- third_party/dav1d/include/dav1d/common.h ++++ third_party/dav1d/include/dav1d/common.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_COMMON_H__ +-#define __DAV1D_COMMON_H__ ++#ifndef DAV1D_COMMON_H ++#define DAV1D_COMMON_H + + #include + #include +@@ -67,4 +67,4 @@ typedef struct Dav1dDataProps { + struct Dav1dUserData user_data; ///< user-configurable data, default NULL members + } Dav1dDataProps; + +-#endif // __DAV1D_COMMON_H__ ++#endif /* DAV1D_COMMON_H */ +diff --git third_party/dav1d/include/dav1d/data.h third_party/dav1d/include/dav1d/data.h +index 63f21fd9a5fd..09a03a1f407c 100644 +--- third_party/dav1d/include/dav1d/data.h ++++ third_party/dav1d/include/dav1d/data.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_DATA_H__ +-#define __DAV1D_DATA_H__ ++#ifndef DAV1D_DATA_H ++#define DAV1D_DATA_H + + #include + #include +@@ -37,7 +37,7 @@ typedef struct Dav1dData { + const uint8_t *data; ///< data pointer + size_t sz; ///< data size + struct Dav1dRef *ref; ///< allocation origin +- Dav1dDataProps m; ++ Dav1dDataProps m; ///< user provided metadata passed to the output picture + } Dav1dData; + + /** +@@ -106,4 +106,4 @@ DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data, + */ + DAV1D_API void dav1d_data_unref(Dav1dData *data); + +-#endif /* __DAV1D_DATA_H__ */ ++#endif /* DAV1D_DATA_H */ +diff --git third_party/dav1d/include/dav1d/dav1d.h third_party/dav1d/include/dav1d/dav1d.h +index 15365e1834bd..5f7db465be90 100644 +--- third_party/dav1d/include/dav1d/dav1d.h ++++ third_party/dav1d/include/dav1d/dav1d.h +@@ -25,18 +25,20 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_H__ +-#define __DAV1D_H__ ++#ifndef DAV1D_H ++#define DAV1D_H + + #ifdef __cplusplus + extern "C" { + #endif + + #include ++#include + + #include "common.h" + #include "picture.h" + #include "data.h" ++#include "version.h" + + typedef struct Dav1dContext Dav1dContext; + typedef struct Dav1dRef Dav1dRef; +@@ -44,13 +46,26 @@ typedef struct Dav1dRef Dav1dRef; + #define DAV1D_MAX_FRAME_THREADS 256 + #define DAV1D_MAX_TILE_THREADS 64 + ++typedef struct Dav1dLogger { ++ void *cookie; ///< Custom data to pass to the callback. ++ /** ++ * Logger callback. Default prints to stderr. May be NULL to disable logging. ++ * ++ * @param cookie Custom pointer passed to all calls. ++ * @param format The vprintf compatible format string. ++ * @param ap List of arguments referenced by the format string. ++ */ ++ void (*callback)(void *cookie, const char *format, va_list ap); ++} Dav1dLogger; ++ + typedef struct Dav1dSettings { + int n_frame_threads; + int n_tile_threads; +- Dav1dPicAllocator allocator; + int apply_grain; + int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31) + int all_layers; ///< output all spatial layers of a scalable AV1 biststream ++ Dav1dPicAllocator allocator; ++ Dav1dLogger logger; + } Dav1dSettings; + + /** +@@ -187,4 +202,4 @@ DAV1D_API void dav1d_flush(Dav1dContext *c); + } + # endif + +-#endif /* __DAV1D_H__ */ ++#endif /* DAV1D_H */ +diff --git third_party/dav1d/include/dav1d/headers.h third_party/dav1d/include/dav1d/headers.h +index f50a82ea3b06..2e0c67f5ac1c 100644 +--- third_party/dav1d/include/dav1d/headers.h ++++ third_party/dav1d/include/dav1d/headers.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_HEADERS_H__ +-#define __DAV1D_HEADERS_H__ ++#ifndef DAV1D_HEADERS_H ++#define DAV1D_HEADERS_H + + // Constants from Section 3. "Symbols and abbreviated terms" + #define DAV1D_MAX_CDEF_STRENGTHS 8 +@@ -160,6 +160,22 @@ enum Dav1dChromaSamplePosition { + DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample + }; + ++typedef struct Dav1dContentLightLevel { ++ int max_content_light_level; ++ int max_frame_average_light_level; ++} Dav1dContentLightLevel; ++ ++typedef struct Dav1dMasteringDisplay { ++ ///< 0.16 fixed point ++ uint16_t primaries[3][2]; ++ ///< 0.16 fixed point ++ uint16_t white_point[2]; ++ ///< 24.8 fixed point ++ uint32_t max_luminance; ++ ///< 18.14 fixed point ++ uint32_t min_luminance; ++} Dav1dMasteringDisplay; ++ + typedef struct Dav1dSequenceHeader { + /** + * Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome; +@@ -178,6 +194,14 @@ typedef struct Dav1dSequenceHeader { + enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1) + enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1) + enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1) ++ /** ++ * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not ++ * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes ++ * between 8 (0) and 10-12 (1) bits/component, and another element ++ * (twelve_bit) to distinguish between 10 and 12 bits/component. To get ++ * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2. ++ */ ++ int hbd; + /** + * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of + * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma). 
+@@ -191,9 +215,6 @@ typedef struct Dav1dSequenceHeader { + int idc; + int tier; + int decoder_model_param_present; +- int decoder_buffer_delay; +- int encoder_buffer_delay; +- int low_delay_mode; + int display_model_param_present; + } operating_points[DAV1D_MAX_OPERATING_POINTS]; + +@@ -230,18 +251,22 @@ typedef struct Dav1dSequenceHeader { + int super_res; + int cdef; + int restoration; +- /** +- * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not +- * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes +- * between 8 (0) and 10-12 (1) bits/component, and another element +- * (twelve_bit) to distinguish between 10 and 12 bits/component. To get +- * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2. +- */ +- int hbd; + int ss_hor, ss_ver, monochrome; + int color_description_present; + int separate_uv_delta_q; + int film_grain_present; ++ ++ // Dav1dSequenceHeaders of the same sequence are required to be ++ // bit-identical until this offset. See 7.5 "Ordering of OBUs": ++ // Within a particular coded video sequence, the contents of ++ // sequence_header_obu must be bit-identical each time the ++ // sequence header appears except for the contents of ++ // operating_parameters_info. ++ struct Dav1dSequenceHeaderOperatingParameterInfo { ++ int decoder_buffer_delay; ++ int encoder_buffer_delay; ++ int low_delay_mode; ++ } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS]; + } Dav1dSequenceHeader; + + typedef struct Dav1dSegmentationData { +@@ -382,4 +407,4 @@ typedef struct Dav1dFrameHeader { + Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME]; + } Dav1dFrameHeader; + +-#endif /* __DAV1D_HEADERS_H__ */ ++#endif /* DAV1D_HEADERS_H */ +diff --git third_party/dav1d/include/dav1d/meson.build third_party/dav1d/include/dav1d/meson.build +new file mode 100644 +index 000000000000..b5649d398ba3 +--- /dev/null ++++ third_party/dav1d/include/dav1d/meson.build +@@ -0,0 +1,41 @@ ++# Copyright © 2019, VideoLAN and dav1d authors ++# All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are met: ++# ++# 1. Redistributions of source code must retain the above copyright notice, this ++# list of conditions and the following disclaimer. ++# ++# 2. Redistributions in binary form must reproduce the above copyright notice, ++# this list of conditions and the following disclaimer in the documentation ++# and/or other materials provided with the distribution. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++# installed version.h header generation ++version_h_data = configuration_data() ++version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major) ++version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor) ++version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision) ++version_h_target = configure_file(input: 'version.h.in', ++ output: 'version.h', ++ configuration: version_h_data) ++ ++# install headers ++install_headers('common.h', ++ 'data.h', ++ 'dav1d.h', ++ 'headers.h', ++ 'picture.h', ++ version_h_target, ++ subdir : 'dav1d') +diff --git third_party/dav1d/include/dav1d/picture.h third_party/dav1d/include/dav1d/picture.h +index 7627f4d63345..fee0ec8c46d1 100644 +--- third_party/dav1d/include/dav1d/picture.h ++++ third_party/dav1d/include/dav1d/picture.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_PICTURE_H__ +-#define __DAV1D_PICTURE_H__ ++#ifndef DAV1D_PICTURE_H ++#define DAV1D_PICTURE_H + + #include + #include +@@ -34,6 +34,11 @@ + #include "common.h" + #include "headers.h" + ++/* Number of bytes to align AND pad picture memory buffers by, so that SIMD ++ * implementations can over-read by a few bytes, and use aligned read/write ++ * instructions. */ ++#define DAV1D_PICTURE_ALIGNMENT 32 ++ + typedef struct Dav1dPictureParameters { + int w; ///< width (in pixels) + int h; ///< height (in pixels) +@@ -61,7 +66,21 @@ typedef struct Dav1dPicture { + + Dav1dPictureParameters p; + Dav1dDataProps m; +- struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref, *ref; ///< allocation origins ++ ++ /** ++ * High Dynamic Range Content Light Level metadata applying to this picture, ++ * as defined in section 5.8.3 and 6.7.3 ++ */ ++ Dav1dContentLightLevel *content_light; ++ /** ++ * High Dynamic Range Mastering Display Color Volume metadata applying to ++ * this picture, as defined in section 5.8.4 and 6.7.4 ++ */ ++ Dav1dMasteringDisplay *mastering_display; ++ ++ struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins ++ struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins ++ struct Dav1dRef *ref; ///< Frame data allocation origin + + void *allocator_data; ///< pointer managed by the allocator + } Dav1dPicture; +@@ -71,8 +90,10 @@ typedef struct Dav1dPicAllocator { + /** + * Allocate the picture buffer based on the Dav1dPictureParameters. + * +- * The data[0], data[1] and data[2] must be 32 byte aligned and with a +- * pixel width/height multiple of 128 pixels. ++ * The data[0], data[1] and data[2] must be DAV1D_PICTURE_ALIGNMENT byte ++ * aligned and with a pixel width/height multiple of 128 pixels. Any ++ * allocated memory area should also be padded by DAV1D_PICTURE_ALIGNMENT ++ * bytes. + * data[1] and data[2] must share the same stride[1]. + * + * This function will be called on the main thread (the thread which calls +@@ -85,8 +106,10 @@ typedef struct Dav1dPicAllocator { + * a custom pointer that will be passed to + * release_picture_callback(). + * @param cookie Custom pointer passed to all calls. +- * +- * @return 0 on success. A negative errno value on error. ++ * ++ * @note No fields other than data, stride and allocator_data must be filled ++ * by this callback. ++ * @return 0 on success. A negative errno value on error. 
+ */ + int (*alloc_picture_callback)(Dav1dPicture *pic, void *cookie); + /** +@@ -108,4 +131,4 @@ typedef struct Dav1dPicAllocator { + */ + DAV1D_API void dav1d_picture_unref(Dav1dPicture *p); + +-#endif /* __DAV1D_PICTURE_H__ */ ++#endif /* DAV1D_PICTURE_H */ +diff --git third_party/dav1d/include/dav1d/version.h.in third_party/dav1d/include/dav1d/version.h.in +new file mode 100644 +index 000000000000..30bfd11427d7 +--- /dev/null ++++ third_party/dav1d/include/dav1d/version.h.in +@@ -0,0 +1,34 @@ ++/* ++ * Copyright © 2019, VideoLAN and dav1d authors ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef DAV1D_VERSION_H ++#define DAV1D_VERSION_H ++ ++#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@ ++#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@ ++#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@ ++ ++#endif /* DAV1D_VERSION_H */ +diff --git third_party/dav1d/include/meson.build third_party/dav1d/include/meson.build +index 7de8d397da2e..c83bfcd534d4 100644 +--- third_party/dav1d/include/meson.build ++++ third_party/dav1d/include/meson.build +@@ -22,16 +22,15 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-# Revision file (version.h) generation ++# Revision file (vcs_version.h) generation + dav1d_git_dir = join_paths(dav1d_src_root, '.git') + rev_target = vcs_tag(command: [ + 'git', '--git-dir', dav1d_git_dir, + 'describe', '--tags', '--long', + '--match', '?.*.*', '--always' + ], +- input: 'version.h.in', +- output: 'version.h' ++ input: 'vcs_version.h.in', ++ output: 'vcs_version.h' + ) + +-# Install include/dav1d headers +-install_subdir('dav1d', install_dir: get_option('includedir')) ++subdir('dav1d') +diff --git third_party/dav1d/include/vcs_version.h.in third_party/dav1d/include/vcs_version.h.in +new file mode 100644 +index 000000000000..71ed2f6982c3 +--- /dev/null ++++ third_party/dav1d/include/vcs_version.h.in +@@ -0,0 +1,2 @@ ++/* auto-generated, do not edit */ ++#define DAV1D_VERSION "@VCS_TAG@" +diff --git third_party/dav1d/include/version.h.in third_party/dav1d/include/version.h.in +deleted file mode 100644 +index 71ed2f6982c3..000000000000 +--- third_party/dav1d/include/version.h.in ++++ /dev/null +@@ -1,2 +0,0 @@ +-/* auto-generated, do not edit */ +-#define DAV1D_VERSION "@VCS_TAG@" +diff --git third_party/dav1d/meson.build third_party/dav1d/meson.build +index 7512023b891f..96b7b1f6ffc0 100644 +--- third_party/dav1d/meson.build ++++ third_party/dav1d/meson.build +@@ -1,4 +1,4 @@ +-# Copyright © 2018, VideoLAN and dav1d authors ++# Copyright © 2018-2019, VideoLAN and dav1d authors + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without +@@ -23,17 +23,18 @@ + # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + project('dav1d', ['c'], +- version: '0.1.1', ++ version: '0.2.2', + default_options: ['c_std=c99', + 'warning_level=2', + 'buildtype=release', + 'b_ndebug=if-release'], + meson_version: '>= 0.47.0') + +-dav1d_version_array = meson.project_version().split('.') +-dav1d_version_major = dav1d_version_array[0] +-dav1d_version_minor = dav1d_version_array[1] +-dav1d_version_revision = dav1d_version_array[2] ++dav1d_soname_version = '1.0.1' ++dav1d_api_version_array = dav1d_soname_version.split('.') ++dav1d_api_version_major = dav1d_api_version_array[0] ++dav1d_api_version_minor = dav1d_api_version_array[1] ++dav1d_api_version_revision = dav1d_api_version_array[2] + + dav1d_src_root = meson.current_source_dir() + cc = meson.get_compiler('c') +@@ -45,7 +46,7 @@ cdata = configuration_data() + cdata_asm = configuration_data() + + # Include directories +-dav1d_inc_dirs = include_directories(['.', 'include', 'include/dav1d']) ++dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include']) + + + +@@ -70,6 +71,8 @@ if is_asm_enabled and get_option('b_sanitize') == 'memory' + error('asm causes false positive with memory sanitizer. 
Use \'-Dbuild_asm=false\'.') + endif + ++# Logging option ++cdata.set10('CONFIG_LOG', get_option('logging')) + + # + # OS/Compiler checks and defines +@@ -87,6 +90,12 @@ if host_machine.system() == 'windows' + cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs + cdata.set('_UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs + cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf ++ if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args) ++ cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows ++ else ++ cdata.set('fseeko', '_fseeki64') ++ cdata.set('ftello', '_ftelli64') ++ endif + endif + + # On Windows, we use a compatibility layer to emulate pthread +@@ -122,6 +131,10 @@ if cc.check_header('unistd.h') + cdata.set('HAVE_UNISTD_H', 1) + endif + ++if cc.check_header('io.h') ++ cdata.set('HAVE_IO_H', 1) ++endif ++ + + # Function checks + +@@ -167,8 +180,10 @@ optional_arguments = [ + '-Wundef', + '-Werror=vla', + '-Wno-maybe-uninitialized', ++ '-Wno-missing-field-initializers', + '-Wno-unused-parameter', + '-Werror=missing-prototypes', ++ '-Wshorten-64-to-32', + ] + if cc.get_id() == 'msvc' + optional_arguments += [ +@@ -199,8 +214,12 @@ endif + stackalign_flag = [] + stackrealign_flag = [] + ++cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big') ++ + if host_machine.cpu_family().startswith('x86') +- if host_machine.cpu_family() == 'x86_64' ++ if get_option('stack_alignment') > 0 ++ stack_alignment = get_option('stack_alignment') ++ elif host_machine.cpu_family() == 'x86_64' + if cc.has_argument('-mpreferred-stack-boundary=5') + stackalign_flag = ['-mpreferred-stack-boundary=5'] + stackrealign_flag = ['-mincoming-stack-boundary=4'] +@@ -311,8 +330,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') + nasm_r = run_command(nasm, '-v') + out = nasm_r.stdout().strip().split() + if out[1].to_lower() == 'version' +- if out[2].version_compare('<2.13') +- error('nasm 2.13 or later is required, found nasm @0@'.format(out[2])) ++ if out[2].version_compare('<2.13.02') ++ error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2])) + endif + else + error('unexpected nasm version string: @0@'.format(nasm_r.stdout())) +diff --git third_party/dav1d/meson_options.txt third_party/dav1d/meson_options.txt +index b8e0d502d6ee..fe9112cda09a 100644 +--- third_party/dav1d/meson_options.txt ++++ third_party/dav1d/meson_options.txt +@@ -20,6 +20,11 @@ option('build_tests', + value: true, + description: 'Build dav1d tests') + ++option('logging', ++ type: 'boolean', ++ value: true, ++ description: 'Print error log messages using the provided callback function') ++ + option('testdata_tests', + type: 'boolean', + value: false, +@@ -30,3 +35,7 @@ option('fuzzing_engine', + choices : ['none', 'libfuzzer', 'oss-fuzz'], + value: 'none', + description: 'Select the fuzzing engine') ++ ++option('stack_alignment', ++ type: 'integer', ++ value: 0) +diff --git third_party/dav1d/snap/snapcraft.yaml third_party/dav1d/snap/snapcraft.yaml +new file mode 100644 +index 000000000000..d2e99e5be5fe +--- /dev/null ++++ third_party/dav1d/snap/snapcraft.yaml +@@ -0,0 +1,24 @@ ++name: dav1d ++base: core18 ++version: git ++version-script: git describe HEAD --always ++summary: AV1 decoder from VideoLAN ++description: | ++ A small and fast AV1 decoder from the people who brought you VLC. 
++ ++grade: devel # must be 'stable' to release into candidate/stable channels ++confinement: strict # use 'strict' once you have the right plugs and slots ++ ++apps: ++ dav1d: ++ command: usr/bin/dav1d ++ plugs: [ 'home' ] ++ ++parts: ++ dav1d: ++ plugin: meson ++ source: . ++ build-packages: [ 'nasm' ] ++ meson-parameters: ++ - --prefix=/usr ++ - --buildtype=release +diff --git third_party/dav1d/src/arm/32/looprestoration.S third_party/dav1d/src/arm/32/looprestoration.S +new file mode 100644 +index 000000000000..bbf40d934009 +--- /dev/null ++++ third_party/dav1d/src/arm/32/looprestoration.S +@@ -0,0 +1,685 @@ ++/* ++ * Copyright © 2018, VideoLAN and dav1d authors ++ * Copyright © 2019, Martin Storsjo ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "src/arm/asm.S" ++ ++// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4], ++// const pixel *src, ptrdiff_t stride, ++// const int16_t fh[7], const intptr_t w, ++// int h, enum LrEdgeFlags edges); ++function wiener_filter_h_neon, export=1 ++ push {r4-r11,lr} ++ vpush {q4} ++ ldrd r4, r5, [sp, #52] ++ ldrd r6, r7, [sp, #60] ++ mov r8, r5 ++ vld1.16 {q0}, [r4] ++ movw r9, #(1 << 14) - (1 << 2) ++ vdup.16 q14, r9 ++ vmov.s16 q15, #2048 ++ // Calculate mid_stride ++ add r10, r5, #7 ++ bic r10, r10, #7 ++ lsl r10, r10, #1 ++ ++ // Clear the last unused element of q0, to allow filtering a single ++ // pixel with one plain vmul+vpadd. ++ mov r12, #0 ++ vmov.16 d1[3], r12 ++ ++ // Set up pointers for reading/writing alternate rows ++ add r12, r0, r10 ++ lsl r10, r10, #1 ++ add lr, r2, r3 ++ lsl r3, r3, #1 ++ ++ // Subtract the width from mid_stride ++ sub r10, r10, r5, lsl #1 ++ ++ // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. ++ cmp r5, #8 ++ add r11, r5, #13 ++ bic r11, r11, #7 ++ bge 1f ++ mov r11, #16 ++1: ++ sub r3, r3, r11 ++ ++ // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL ++ tst r7, #1 // LR_HAVE_LEFT ++ beq 2f ++ // LR_HAVE_LEFT ++ cmp r1, #0 ++ bne 0f ++ // left == NULL ++ sub r2, r2, #3 ++ sub lr, lr, #3 ++ b 1f ++0: // LR_HAVE_LEFT, left != NULL ++2: // !LR_HAVE_LEFT, increase the stride. 
++ // For this case we don't read the left 3 pixels from the src pointer, ++ // but shift it as if we had done that. ++ add r3, r3, #3 ++ ++ ++1: // Loop vertically ++ vld1.8 {q2}, [r2]! ++ vld1.8 {q9}, [lr]! ++ ++ tst r7, #1 // LR_HAVE_LEFT ++ beq 0f ++ cmp r1, #0 ++ beq 2f ++ // LR_HAVE_LEFT, left != NULL ++ vld1.32 {d3[1]}, [r1]! ++ // Move r2/lr back to account for the last 3 bytes we loaded earlier, ++ // which we'll shift out. ++ sub r2, r2, #3 ++ sub lr, lr, #3 ++ vld1.32 {d17[1]}, [r1]! ++ vext.8 q2, q1, q2, #13 ++ vext.8 q9, q8, q9, #13 ++ b 2f ++0: ++ // !LR_HAVE_LEFT, fill q1 with the leftmost byte ++ // and shift q2 to have 3x the first byte at the front. ++ vdup.8 q1, d4[0] ++ vdup.8 q8, d18[0] ++ // Move r2 back to account for the last 3 bytes we loaded before, ++ // which we shifted out. ++ sub r2, r2, #3 ++ sub lr, lr, #3 ++ vext.8 q2, q1, q2, #13 ++ vext.8 q9, q8, q9, #13 ++ ++2: ++ vmovl.u8 q1, d4 ++ vmovl.u8 q2, d5 ++ vmovl.u8 q8, d18 ++ vmovl.u8 q9, d19 ++ ++ tst r7, #2 // LR_HAVE_RIGHT ++ bne 4f ++ // If we'll need to pad the right edge, load that byte to pad with ++ // here since we can find it pretty easily from here. ++ sub r9, r5, #14 ++ ldrb r11, [r2, r9] ++ ldrb r9, [lr, r9] ++ // Fill q12/q13 with the right padding pixel ++ vdup.8 d24, r11 ++ vdup.8 d26, r9 ++ vmovl.u8 q12, d24 ++ vmovl.u8 q13, d26 ++3: // !LR_HAVE_RIGHT ++ // If we'll have to pad the right edge we need to quit early here. ++ cmp r5, #11 ++ bge 4f // If w >= 11, all used input pixels are valid ++ cmp r5, #7 ++ bge 5f // If w >= 7, we can filter 4 pixels ++ b 6f ++ ++4: // Loop horizontally ++.macro filter_8 ++ // This is tuned as some sort of compromise between Cortex A7, A8, ++ // A9 and A53. ++ vmul.s16 q3, q1, d0[0] ++ vext.8 q10, q1, q2, #2 ++ vext.8 q11, q1, q2, #4 ++ vmla.s16 q3, q10, d0[1] ++ vmla.s16 q3, q11, d0[2] ++ vext.8 q10, q1, q2, #6 ++ vext.8 q11, q1, q2, #8 ++ vmla.s16 q3, q10, d0[3] ++ vmla.s16 q3, q11, d1[0] ++ vext.8 q10, q1, q2, #10 ++ vext.8 q11, q1, q2, #12 ++ vmla.s16 q3, q10, d1[1] ++ vmla.s16 q3, q11, d1[2] ++ ++ vmul.s16 q10, q8, d0[0] ++ vext.8 q11, q8, q9, #2 ++ vext.8 q4, q8, q9, #4 ++ vmla.s16 q10, q11, d0[1] ++ vmla.s16 q10, q4, d0[2] ++ vext.8 q11, q8, q9, #6 ++ vext.8 q4, q8, q9, #8 ++ vmla.s16 q10, q11, d0[3] ++ vmla.s16 q10, q4, d1[0] ++ vext.8 q11, q8, q9, #10 ++ vext.8 q4, q8, q9, #12 ++ vmla.s16 q10, q11, d1[1] ++ vmla.s16 q10, q4, d1[2] ++ ++ vext.8 q1, q1, q2, #6 ++ vext.8 q8, q8, q9, #6 ++ vshl.s16 q1, q1, #7 ++ vshl.s16 q8, q8, #7 ++ vsub.s16 q1, q1, q14 ++ vsub.s16 q8, q8, q14 ++ vqadd.s16 q3, q3, q1 ++ vqadd.s16 q10, q10, q8 ++ vshr.s16 q3, q3, #3 ++ vshr.s16 q10, q10, #3 ++ vadd.s16 q3, q3, q15 ++ vadd.s16 q10, q10, q15 ++.endm ++ filter_8 ++ vst1.16 {q3}, [r0, :128]! ++ vst1.16 {q10}, [r12, :128]! ++ ++ subs r5, r5, #8 ++ ble 9f ++ tst r7, #2 // LR_HAVE_RIGHT ++ vmov q1, q2 ++ vmov q8, q9 ++ vld1.8 {d4}, [r2]! ++ vld1.8 {d18}, [lr]! ++ vmovl.u8 q2, d4 ++ vmovl.u8 q9, d18 ++ bne 4b // If we don't need to pad, just keep filtering. ++ b 3b // If we need to pad, check how many pixels we have left. 
++ ++5: // Filter 4 pixels, 7 <= w < 11 ++.macro filter_4 ++ vmul.s16 d6, d2, d0[0] ++ vext.8 q10, q1, q2, #2 ++ vext.8 q11, q1, q2, #4 ++ vmla.s16 d6, d20, d0[1] ++ vmla.s16 d6, d22, d0[2] ++ vext.8 q10, q1, q2, #6 ++ vext.8 q11, q1, q2, #8 ++ vmla.s16 d6, d20, d0[3] ++ vmla.s16 d6, d22, d1[0] ++ vext.8 q10, q1, q2, #10 ++ vext.8 q11, q1, q2, #12 ++ vmla.s16 d6, d20, d1[1] ++ vmla.s16 d6, d22, d1[2] ++ ++ vmul.s16 d20, d16, d0[0] ++ vext.8 q11, q8, q9, #2 ++ vext.8 q4, q8, q9, #4 ++ vmla.s16 d20, d22, d0[1] ++ vmla.s16 d20, d8, d0[2] ++ vext.8 q11, q8, q9, #6 ++ vext.8 q4, q8, q9, #8 ++ vmla.s16 d20, d22, d0[3] ++ vmla.s16 d20, d8, d1[0] ++ vext.8 q11, q8, q9, #10 ++ vext.8 q4, q8, q9, #12 ++ vmla.s16 d20, d22, d1[1] ++ vmla.s16 d20, d8, d1[2] ++ ++ vext.8 q11, q1, q2, #6 ++ vshl.s16 d22, d22, #7 ++ vsub.s16 d22, d22, d28 ++ vqadd.s16 d6, d6, d22 ++ vext.8 q11, q8, q9, #6 ++ vshl.s16 d22, d22, #7 ++ vsub.s16 d22, d22, d28 ++ vqadd.s16 d20, d20, d22 ++ vshr.s16 d6, d6, #3 ++ vshr.s16 d20, d20, #3 ++ vadd.s16 d6, d6, d30 ++ vadd.s16 d20, d20, d30 ++.endm ++ filter_4 ++ vst1.16 {d6}, [r0, :64]! ++ vst1.16 {d20}, [r12, :64]! ++ ++ subs r5, r5, #4 // 3 <= w < 7 ++ vext.8 q1, q1, q2, #8 ++ vext.8 q2, q2, q2, #8 ++ vext.8 q8, q8, q9, #8 ++ vext.8 q9, q9, q9, #8 ++ ++6: // Pad the right edge and filter the last few pixels. ++ // w < 7, w+3 pixels valid in q1-q2 ++ cmp r5, #5 ++ blt 7f ++ bgt 8f ++ // w == 5, 8 pixels valid in q1, q2 invalid ++ vmov q2, q12 ++ vmov q9, q13 ++ b 88f ++ ++7: // 1 <= w < 5, 4-7 pixels valid in q1 ++ sub r9, r5, #1 ++ // w9 = (pixels valid - 4) ++ adr r11, L(variable_shift_tbl) ++ ldr r9, [r11, r9, lsl #2] ++ add r11, r11, r9 ++ vmov q2, q12 ++ vmov q9, q13 ++ bx r11 ++ ++ .align 2 ++L(variable_shift_tbl): ++ .word 44f - L(variable_shift_tbl) + CONFIG_THUMB ++ .word 55f - L(variable_shift_tbl) + CONFIG_THUMB ++ .word 66f - L(variable_shift_tbl) + CONFIG_THUMB ++ .word 77f - L(variable_shift_tbl) + CONFIG_THUMB ++ ++44: // 4 pixels valid in d2/d16, fill d3/d17 with padding. ++ vmov d3, d4 ++ vmov d17, d18 ++ b 88f ++ // Shift q1 right, shifting out invalid pixels, ++ // shift q1 left to the original offset, shifting in padding pixels. ++55: // 5 pixels valid ++ vext.8 q1, q1, q1, #10 ++ vext.8 q1, q1, q2, #6 ++ vext.8 q8, q8, q8, #10 ++ vext.8 q8, q8, q9, #6 ++ b 88f ++66: // 6 pixels valid ++ vext.8 q1, q1, q1, #12 ++ vext.8 q1, q1, q2, #4 ++ vext.8 q8, q8, q8, #12 ++ vext.8 q8, q8, q9, #4 ++ b 88f ++77: // 7 pixels valid ++ vext.8 q1, q1, q1, #14 ++ vext.8 q1, q1, q2, #2 ++ vext.8 q8, q8, q8, #14 ++ vext.8 q8, q8, q9, #2 ++ b 88f ++ ++8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2 ++ vext.8 q2, q2, q2, #2 ++ vext.8 q2, q2, q12, #14 ++ vext.8 q9, q9, q9, #2 ++ vext.8 q9, q9, q13, #14 ++ ++88: ++ // w < 7, q1-q2 padded properly ++ cmp r5, #4 ++ blt 888f ++ ++ // w >= 4, filter 4 pixels ++ filter_4 ++ vst1.16 {d6}, [r0, :64]! ++ vst1.16 {d20}, [r12, :64]! ++ subs r5, r5, #4 // 0 <= w < 4 ++ vext.8 q1, q1, q2, #8 ++ vext.8 q8, q8, q9, #8 ++ beq 9f ++888: // 1 <= w < 4, filter 1 pixel at a time ++ vmul.s16 q3, q1, q0 ++ vmul.s16 q10, q8, q0 ++ vpadd.s16 d6, d6, d7 ++ vpadd.s16 d7, d20, d21 ++ vdup.16 d24, d2[3] ++ vpadd.s16 d6, d6, d7 ++ vdup.16 d25, d16[3] ++ vpadd.s16 d6, d6, d6 ++ vtrn.16 d24, d25 ++ vshl.s16 d24, d24, #7 ++ vsub.s16 d24, d24, d28 ++ vqadd.s16 d6, d6, d24 ++ vshr.s16 d6, d6, #3 ++ vadd.s16 d6, d6, d30 ++ vst1.s16 {d6[0]}, [r0, :16]! ++ vst1.s16 {d6[1]}, [r12, :16]! 
++ subs r5, r5, #1 ++ vext.8 q1, q1, q2, #2 ++ vext.8 q8, q8, q9, #2 ++ bgt 888b ++ ++9: ++ subs r6, r6, #2 ++ ble 0f ++ // Jump to the next row and loop horizontally ++ add r0, r0, r10 ++ add r12, r12, r10 ++ add r2, r2, r3 ++ add lr, lr, r3 ++ mov r5, r8 ++ b 1b ++0: ++ vpop {q4} ++ pop {r4-r11,pc} ++.purgem filter_8 ++.purgem filter_4 ++endfunc ++ ++// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride, ++// const int16_t *mid, int w, int h, ++// const int16_t fv[7], enum LrEdgeFlags edges, ++// ptrdiff_t mid_stride); ++function wiener_filter_v_neon, export=1 ++ push {r4-r7,lr} ++ ldrd r4, r5, [sp, #20] ++ ldrd r6, r7, [sp, #28] ++ mov lr, r4 ++ vmov.s16 q1, #0 ++ mov r12, #128 ++ vld1.16 {q0}, [r5] ++ vmov.s16 d2[3], r12 ++ vadd.s16 q0, q0, q1 ++ ++ // Calculate the number of rows to move back when looping vertically ++ mov r12, r4 ++ tst r6, #4 // LR_HAVE_TOP ++ beq 0f ++ sub r2, r2, r7, lsl #1 ++ add r12, r12, #2 ++0: ++ tst r6, #8 // LR_HAVE_BOTTOM ++ beq 1f ++ add r12, r12, #2 ++ ++1: // Start of horizontal loop; start one vertical filter slice. ++ // Load rows into q8-q11 and pad properly. ++ tst r6, #4 // LR_HAVE_TOP ++ vld1.16 {q8}, [r2, :128], r7 ++ beq 2f ++ // LR_HAVE_TOP ++ vld1.16 {q10}, [r2, :128], r7 ++ vmov q9, q8 ++ vld1.16 {q11}, [r2, :128], r7 ++ b 3f ++2: // !LR_HAVE_TOP ++ vmov q9, q8 ++ vmov q10, q8 ++ vmov q11, q8 ++ ++3: ++ cmp r4, #4 ++ blt 5f ++ // Start filtering normally; fill in q12-q14 with unique rows. ++ vld1.16 {q12}, [r2, :128], r7 ++ vld1.16 {q13}, [r2, :128], r7 ++ vld1.16 {q14}, [r2, :128], r7 ++ ++4: ++.macro filter compare ++ subs r4, r4, #1 ++ // Interleaving the mul/mla chains actually hurts performance ++ // significantly on Cortex A53, thus keeping mul/mla tightly ++ // chained like this. ++ vmull.s16 q2, d16, d0[0] ++ vmlal.s16 q2, d18, d0[1] ++ vmlal.s16 q2, d20, d0[2] ++ vmlal.s16 q2, d22, d0[3] ++ vmlal.s16 q2, d24, d1[0] ++ vmlal.s16 q2, d26, d1[1] ++ vmlal.s16 q2, d28, d1[2] ++ vmull.s16 q3, d17, d0[0] ++ vmlal.s16 q3, d19, d0[1] ++ vmlal.s16 q3, d21, d0[2] ++ vmlal.s16 q3, d23, d0[3] ++ vmlal.s16 q3, d25, d1[0] ++ vmlal.s16 q3, d27, d1[1] ++ vmlal.s16 q3, d29, d1[2] ++ vqrshrun.s32 d4, q2, #11 ++ vqrshrun.s32 d5, q3, #11 ++ vqmovun.s16 d4, q2 ++ vst1.8 {d4}, [r0], r1 ++.if \compare ++ cmp r4, #4 ++.else ++ ble 9f ++.endif ++ vmov q8, q9 ++ vmov q9, q10 ++ vmov q10, q11 ++ vmov q11, q12 ++ vmov q12, q13 ++ vmov q13, q14 ++.endm ++ filter 1 ++ blt 7f ++ vld1.16 {q14}, [r2, :128], r7 ++ b 4b ++ ++5: // Less than 4 rows in total; not all of q12-q13 are filled yet. ++ tst r6, #8 // LR_HAVE_BOTTOM ++ beq 6f ++ // LR_HAVE_BOTTOM ++ cmp r4, #2 ++ // We load at least 2 rows in all cases. ++ vld1.16 {q12}, [r2, :128], r7 ++ vld1.16 {q13}, [r2, :128], r7 ++ bgt 53f // 3 rows in total ++ beq 52f // 2 rows in total ++51: // 1 row in total, q11 already loaded, load edge into q12-q14. ++ vmov q13, q12 ++ b 8f ++52: // 2 rows in total, q11 already loaded, load q12 with content data ++ // and 2 rows of edge. ++ vld1.16 {q14}, [r2, :128], r7 ++ vmov q15, q14 ++ b 8f ++53: ++ // 3 rows in total, q11 already loaded, load q12 and q13 with content ++ // and 2 rows of edge. ++ vld1.16 {q14}, [r2, :128], r7 ++ vld1.16 {q15}, [r2, :128], r7 ++ vmov q1, q15 ++ b 8f ++ ++6: ++ // !LR_HAVE_BOTTOM ++ cmp r4, #2 ++ bgt 63f // 3 rows in total ++ beq 62f // 2 rows in total ++61: // 1 row in total, q11 already loaded, pad that into q12-q14. 
++ vmov q12, q11 ++ vmov q13, q11 ++ vmov q14, q11 ++ b 8f ++62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. ++ vld1.16 {q12}, [r2, :128], r7 ++ vmov q13, q12 ++ vmov q14, q12 ++ vmov q15, q12 ++ b 8f ++63: ++ // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. ++ vld1.16 {q12}, [r2, :128], r7 ++ vld1.16 {q13}, [r2, :128], r7 ++ vmov q14, q13 ++ vmov q15, q13 ++ vmov q1, q13 ++ b 8f ++ ++7: ++ // All registers up to q13 are filled already, 3 valid rows left. ++ // < 4 valid rows left; fill in padding and filter the last ++ // few rows. ++ tst r6, #8 // LR_HAVE_BOTTOM ++ beq 71f ++ // LR_HAVE_BOTTOM; load 2 rows of edge. ++ vld1.16 {q14}, [r2, :128], r7 ++ vld1.16 {q15}, [r2, :128], r7 ++ vmov q1, q15 ++ b 8f ++71: ++ // !LR_HAVE_BOTTOM, pad 3 rows ++ vmov q14, q13 ++ vmov q15, q13 ++ vmov q1, q13 ++ ++8: // At this point, all registers up to q14-15,q1 are loaded with ++ // edge/padding (depending on how many rows are left). ++ filter 0 // This branches to 9f when done ++ vmov q14, q15 ++ vmov q15, q1 ++ b 8b ++ ++9: // End of one vertical slice. ++ subs r3, r3, #8 ++ ble 0f ++ // Move pointers back up to the top and loop horizontally. ++ mls r0, r1, lr, r0 ++ mls r2, r7, r12, r2 ++ add r0, r0, #8 ++ add r2, r2, #16 ++ mov r4, lr ++ b 1b ++ ++0: ++ pop {r4-r7,pc} ++.purgem filter ++endfunc ++ ++// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride, ++// const pixel *src, int w, int h); ++function copy_narrow_neon, export=1 ++ push {r4,lr} ++ ldr r4, [sp, #8] ++ adr r12, L(copy_narrow_tbl) ++ ldr r3, [r12, r3, lsl #2] ++ add r12, r12, r3 ++ bx r12 ++ ++ .align 2 ++L(copy_narrow_tbl): ++ .word 0 ++ .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB ++ .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB ++ ++10: ++ add r3, r0, r1 ++ lsl r1, r1, #1 ++18: ++ subs r4, r4, #8 ++ blt 110f ++ vld1.8 {d0}, [r2, :64]! ++ vst1.8 {d0[0]}, [r0], r1 ++ vst1.8 {d0[1]}, [r3], r1 ++ vst1.8 {d0[2]}, [r0], r1 ++ vst1.8 {d0[3]}, [r3], r1 ++ vst1.8 {d0[4]}, [r0], r1 ++ vst1.8 {d0[5]}, [r3], r1 ++ vst1.8 {d0[6]}, [r0], r1 ++ vst1.8 {d0[7]}, [r3], r1 ++ ble 0f ++ b 18b ++110: ++ add r4, r4, #8 ++ asr r1, r1, #1 ++11: ++ subs r4, r4, #1 ++ vld1.8 {d0[]}, [r2]! ++ vst1.8 {d0[0]}, [r0], r1 ++ bgt 11b ++0: ++ pop {r4,pc} ++ ++20: ++ add r3, r0, r1 ++ lsl r1, r1, #1 ++24: ++ subs r4, r4, #4 ++ blt 210f ++ vld1.16 {d0}, [r2, :64]! ++ vst1.16 {d0[0]}, [r0, :16], r1 ++ vst1.16 {d0[1]}, [r3, :16], r1 ++ vst1.16 {d0[2]}, [r0, :16], r1 ++ vst1.16 {d0[3]}, [r3, :16], r1 ++ ble 0f ++ b 24b ++210: ++ add r4, r4, #4 ++ asr r1, r1, #1 ++22: ++ subs r4, r4, #1 ++ vld1.16 {d0[]}, [r2]! ++ vst1.16 {d0[0]}, [r0], r1 ++ bgt 22b ++0: ++ pop {r4,pc} ++ ++30: ++ ldrh r3, [r2] ++ ldrb r12, [r2, #2] ++ add r2, r2, #3 ++ subs r4, r4, #1 ++ strh r3, [r0] ++ strb r12, [r0, #2] ++ add r0, r0, r1 ++ bgt 30b ++ pop {r4,pc} ++ ++40: ++ add r3, r0, r1 ++ lsl r1, r1, #1 ++42: ++ subs r4, r4, #2 ++ blt 41f ++ vld1.8 {d0}, [r2, :64]! 
++ vst1.32 {d0[0]}, [r0, :32], r1 ++ vst1.32 {d0[1]}, [r3, :32], r1 ++ ble 0f ++ b 42b ++41: ++ vld1.32 {d0[]}, [r2] ++ vst1.32 {d0[0]}, [r0] ++0: ++ pop {r4,pc} ++ ++50: ++ ldr r3, [r2] ++ ldrb r12, [r2, #4] ++ add r2, r2, #5 ++ subs r4, r4, #1 ++ str r3, [r0] ++ strb r12, [r0, #4] ++ add r0, r0, r1 ++ bgt 50b ++ pop {r4,pc} ++ ++60: ++ ldr r3, [r2] ++ ldrh r12, [r2, #4] ++ add r2, r2, #6 ++ subs r4, r4, #1 ++ str r3, [r0] ++ strh r12, [r0, #4] ++ add r0, r0, r1 ++ bgt 60b ++ pop {r4,pc} ++ ++70: ++ ldr r3, [r2] ++ ldrh r12, [r2, #4] ++ ldrb lr, [r2, #6] ++ add r2, r2, #7 ++ subs r4, r4, #1 ++ str r3, [r0] ++ strh r12, [r0, #4] ++ strb lr, [r0, #6] ++ add r0, r0, r1 ++ bgt 70b ++ pop {r4,pc} ++endfunc +diff --git third_party/dav1d/src/arm/32/mc.S third_party/dav1d/src/arm/32/mc.S +index d29ea4ed847a..7c71f4376afa 100644 +--- third_party/dav1d/src/arm/32/mc.S ++++ third_party/dav1d/src/arm/32/mc.S +@@ -27,6 +27,7 @@ + */ + + #include "src/arm/asm.S" ++#include "src/arm/32/util.S" + + .macro avg dst0, dst1, t0, t1, t2, t3 + vld1.16 {\t0,\t1}, [r2, :128]! +@@ -212,3 +213,2119 @@ endfunc + bidir_fn avg + bidir_fn w_avg + bidir_fn mask ++ ++ ++// This has got the same signature as the put_8tap functions, ++// assumes that the caller has loaded the h argument into r5, ++// and assumes that r8 is set to (24-clz(w)). ++function put ++ adr r9, L(put_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(put_tbl): ++ .word 1280f - L(put_tbl) + CONFIG_THUMB ++ .word 640f - L(put_tbl) + CONFIG_THUMB ++ .word 32f - L(put_tbl) + CONFIG_THUMB ++ .word 160f - L(put_tbl) + CONFIG_THUMB ++ .word 8f - L(put_tbl) + CONFIG_THUMB ++ .word 4f - L(put_tbl) + CONFIG_THUMB ++ .word 2f - L(put_tbl) + CONFIG_THUMB ++ ++2: ++ vld1.16 {d0[]}, [r2], r3 ++ vld1.16 {d1[]}, [r2], r3 ++ subs r5, r5, #2 ++ vst1.16 {d0[0]}, [r0, :16], r1 ++ vst1.16 {d1[0]}, [r0, :16], r1 ++ bgt 2b ++ pop {r4-r11,pc} ++4: ++ vld1.32 {d0[]}, [r2], r3 ++ vld1.32 {d1[]}, [r2], r3 ++ subs r5, r5, #2 ++ vst1.32 {d0[0]}, [r0, :32], r1 ++ vst1.32 {d1[0]}, [r0, :32], r1 ++ bgt 4b ++ pop {r4-r11,pc} ++8: ++ vld1.8 {d0}, [r2], r3 ++ vld1.8 {d1}, [r2], r3 ++ subs r5, r5, #2 ++ vst1.8 {d0}, [r0, :64], r1 ++ vst1.8 {d1}, [r0, :64], r1 ++ bgt 8b ++ pop {r4-r11,pc} ++160: ++ add r8, r0, r1 ++ lsl r1, r1, #1 ++ add r9, r2, r3 ++ lsl r3, r3, #1 ++16: ++ vld1.8 {q0}, [r2], r3 ++ vld1.8 {q1}, [r9], r3 ++ subs r5, r5, #2 ++ vst1.8 {q0}, [r0, :128], r1 ++ vst1.8 {q1}, [r8, :128], r1 ++ bgt 16b ++ pop {r4-r11,pc} ++32: ++ vld1.8 {q0, q1}, [r2], r3 ++ subs r5, r5, #1 ++ vst1.8 {q0, q1}, [r0, :128], r1 ++ bgt 32b ++ pop {r4-r11,pc} ++640: ++ sub r1, r1, #32 ++ sub r3, r3, #32 ++64: ++ vld1.8 {q0, q1}, [r2]! ++ vst1.8 {q0, q1}, [r0, :128]! ++ vld1.8 {q2, q3}, [r2], r3 ++ subs r5, r5, #1 ++ vst1.8 {q2, q3}, [r0, :128], r1 ++ bgt 64b ++ pop {r4-r11,pc} ++1280: ++ sub r1, r1, #96 ++ sub r3, r3, #96 ++128: ++ vld1.8 {q8, q9}, [r2]! ++ vst1.8 {q8, q9}, [r0, :128]! ++ vld1.8 {q10, q11}, [r2]! ++ vst1.8 {q10, q11}, [r0, :128]! ++ vld1.8 {q12, q13}, [r2]! ++ vst1.8 {q12, q13}, [r0, :128]! ++ vld1.8 {q14, q15}, [r2], r3 ++ subs r5, r5, #1 ++ vst1.8 {q14, q15}, [r0, :128], r1 ++ bgt 128b ++ pop {r4-r11,pc} ++endfunc ++ ++ ++// This has got the same signature as the put_8tap functions, ++// assumes that the caller has loaded the h argument into r5, ++// and assumes that r8 is set to (24-clz(w)), and r7 to w*2. 
++function prep ++ adr r9, L(prep_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(prep_tbl): ++ .word 1280f - L(prep_tbl) + CONFIG_THUMB ++ .word 640f - L(prep_tbl) + CONFIG_THUMB ++ .word 320f - L(prep_tbl) + CONFIG_THUMB ++ .word 160f - L(prep_tbl) + CONFIG_THUMB ++ .word 8f - L(prep_tbl) + CONFIG_THUMB ++ .word 4f - L(prep_tbl) + CONFIG_THUMB ++ ++4: ++ vld1.32 {d0[]}, [r1], r2 ++ vld1.32 {d2[]}, [r1], r2 ++ subs r4, r4, #2 ++ vshll.u8 q0, d0, #4 ++ vshll.u8 q1, d2, #4 ++ vst1.16 {d1, d2}, [r0, :64]! ++ bgt 4b ++ pop {r4-r11,pc} ++8: ++ vld1.8 {d0}, [r1], r2 ++ vld1.8 {d2}, [r1], r2 ++ subs r4, r4, #2 ++ vshll.u8 q0, d0, #4 ++ vshll.u8 q1, d2, #4 ++ vst1.16 {q0, q1}, [r0, :128]! ++ bgt 8b ++ pop {r4-r11,pc} ++160: ++ add r9, r1, r2 ++ lsl r2, r2, #1 ++ add r8, r0, r7 ++ lsl r7, r7, #1 ++16: ++ vld1.8 {q2}, [r1], r2 ++ vld1.8 {q3}, [r9], r2 ++ subs r4, r4, #2 ++ vshll.u8 q0, d4, #4 ++ vshll.u8 q1, d5, #4 ++ vshll.u8 q2, d6, #4 ++ vshll.u8 q3, d7, #4 ++ vst1.16 {q0, q1}, [r0, :128], r7 ++ vst1.16 {q2, q3}, [r8, :128], r7 ++ bgt 16b ++ pop {r4-r11,pc} ++320: ++ add r8, r0, r3 ++32: ++ vld1.8 {q0, q1}, [r1], r2 ++ subs r4, r4, #2 ++ vshll.u8 q8, d0, #4 ++ vshll.u8 q9, d1, #4 ++ vld1.8 {q2, q3}, [r1], r2 ++ vshll.u8 q10, d2, #4 ++ vshll.u8 q11, d3, #4 ++ vshll.u8 q12, d4, #4 ++ vst1.16 {q8, q9}, [r0, :128], r7 ++ vshll.u8 q13, d5, #4 ++ vst1.16 {q10, q11}, [r8, :128], r7 ++ vshll.u8 q14, d6, #4 ++ vst1.16 {q12, q13}, [r0, :128], r7 ++ vshll.u8 q15, d7, #4 ++ vst1.16 {q14, q15}, [r8, :128], r7 ++ bgt 32b ++ pop {r4-r11,pc} ++640: ++ sub r2, r2, #32 ++ add r8, r0, #32 ++ mov r6, #64 ++64: ++ vld1.8 {q0, q1}, [r1]! ++ subs r4, r4, #1 ++ vshll.u8 q8, d0, #4 ++ vshll.u8 q9, d1, #4 ++ vld1.8 {q2, q3}, [r1], r2 ++ vshll.u8 q10, d2, #4 ++ vshll.u8 q11, d3, #4 ++ vshll.u8 q12, d4, #4 ++ vst1.16 {q8, q9}, [r0, :128], r6 ++ vshll.u8 q13, d5, #4 ++ vshll.u8 q14, d6, #4 ++ vst1.16 {q10, q11}, [r8, :128], r6 ++ vshll.u8 q15, d7, #4 ++ vst1.16 {q12, q13}, [r0, :128], r6 ++ vst1.16 {q14, q15}, [r8, :128], r6 ++ bgt 64b ++ pop {r4-r11,pc} ++1280: ++ sub r2, r2, #96 ++ add r8, r0, #32 ++ mov r6, #64 ++128: ++ vld1.8 {q0, q1}, [r1]! ++ vld1.8 {q2, q3}, [r1]! ++ vshll.u8 q10, d0, #4 ++ vshll.u8 q11, d1, #4 ++ vshll.u8 q12, d2, #4 ++ vshll.u8 q13, d3, #4 ++ vshll.u8 q14, d4, #4 ++ vshll.u8 q15, d5, #4 ++ vld1.8 {q8, q9}, [r1]! 
++ vst1.16 {q10, q11}, [r0, :128], r6 ++ vst1.16 {q12, q13}, [r8, :128], r6 ++ vshll.u8 q0, d6, #4 ++ vshll.u8 q1, d7, #4 ++ vshll.u8 q2, d16, #4 ++ vshll.u8 q3, d17, #4 ++ vshll.u8 q8, d18, #4 ++ vshll.u8 q9, d19, #4 ++ vld1.8 {q10, q11}, [r1], r2 ++ vst1.16 {q14, q15}, [r0, :128], r6 ++ vst1.16 {q0, q1}, [r8, :128], r6 ++ vshll.u8 q12, d20, #4 ++ vshll.u8 q13, d21, #4 ++ vshll.u8 q14, d22, #4 ++ vshll.u8 q15, d23, #4 ++ subs r4, r4, #1 ++ vst1.16 {q2, q3}, [r0, :128], r6 ++ vst1.16 {q8, q9}, [r8, :128], r6 ++ vst1.16 {q12, q13}, [r0, :128], r6 ++ vst1.16 {q14, q15}, [r8, :128], r6 ++ bgt 128b ++ pop {r4-r11,pc} ++endfunc ++ ++ ++.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ++ vld1.\wd {\d0[]}, [\s0], \strd ++ vld1.\wd {\d1[]}, [\s1], \strd ++.ifnb \d2 ++ vld1.\wd {\d2[]}, [\s0], \strd ++ vld1.\wd {\d3[]}, [\s1], \strd ++.endif ++.ifnb \d4 ++ vld1.\wd {\d4[]}, [\s0], \strd ++.endif ++.ifnb \d5 ++ vld1.\wd {\d5[]}, [\s1], \strd ++.endif ++.ifnb \d6 ++ vld1.\wd {\d6[]}, [\s0], \strd ++.endif ++.endm ++.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 ++ vld1.8 {\d0}, [\s0], \strd ++ vld1.8 {\d1}, [\s1], \strd ++.ifnb \d2 ++ vld1.8 {\d2}, [\s0], \strd ++ vld1.8 {\d3}, [\s1], \strd ++.endif ++.ifnb \d4 ++ vld1.8 {\d4}, [\s0], \strd ++.endif ++.ifnb \d5 ++ vld1.8 {\d5}, [\s1], \strd ++.endif ++.ifnb \d6 ++ vld1.8 {\d6}, [\s0], \strd ++.endif ++.endm ++.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 ++ load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6 ++.endm ++.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 ++ load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 ++.endm ++.macro interleave_1_16 r0, r1, r2, r3, r4 ++ vext.8 \r0, \r0, \r1, #6 ++ vext.8 \r1, \r1, \r2, #6 ++.ifnb \r3 ++ vext.8 \r2, \r2, \r3, #6 ++ vext.8 \r3, \r3, \r4, #6 ++.endif ++.endm ++.macro interleave_1_32 r0, r1, r2, r3, r4 ++ vext.8 \r0, \r0, \r1, #4 ++ vext.8 \r1, \r1, \r2, #4 ++.ifnb \r3 ++ vext.8 \r2, \r2, \r3, #4 ++ vext.8 \r3, \r3, \r4, #4 ++.endif ++.endm ++.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6 ++ vmovl.u8 \q0, \d0 ++ vmovl.u8 \q1, \d1 ++.ifnb \q2 ++ vmovl.u8 \q2, \d2 ++ vmovl.u8 \q3, \d3 ++.endif ++.ifnb \q4 ++ vmovl.u8 \q4, \d4 ++.endif ++.ifnb \q5 ++ vmovl.u8 \q5, \d5 ++.endif ++.ifnb \q6 ++ vmovl.u8 \q6, \d6 ++.endif ++.endm ++.macro mul_mla_4 d, s0, s1, s2, s3 ++ vmul.s16 \d, \s0, d0[0] ++ vmla.s16 \d, \s1, d0[1] ++ vmla.s16 \d, \s2, d0[2] ++ vmla.s16 \d, \s3, d0[3] ++.endm ++.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 ++ vmul.s16 \d0, \s0, d0[0] ++ vmla.s16 \d0, \s1, d0[1] ++ vmla.s16 \d0, \s2, d0[2] ++ vmla.s16 \d0, \s3, d0[3] ++ vmla.s16 \d0, \s4, d1[0] ++ vmla.s16 \d0, \s5, d1[1] ++ vmla.s16 \d0, \s6, d1[2] ++ vmla.s16 \d0, \s7, d1[3] ++ vmul.s16 \d1, \s1, d0[0] ++ vmla.s16 \d1, \s2, d0[1] ++ vmla.s16 \d1, \s3, d0[2] ++ vmla.s16 \d1, \s4, d0[3] ++ vmla.s16 \d1, \s5, d1[0] ++ vmla.s16 \d1, \s6, d1[1] ++ vmla.s16 \d1, \s7, d1[2] ++ vmla.s16 \d1, \s8, d1[3] ++.endm ++.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 ++ vmul.s16 \d0, \s0, d0[0] ++ vmla.s16 \d0, \s1, d0[1] ++ vmla.s16 \d0, \s2, d0[2] ++ vmla.s16 \d0, \s3, d0[3] ++ vmla.s16 \d0, \s4, d1[0] ++ vmla.s16 \d0, \s5, d1[1] ++ vmla.s16 \d0, \s6, d1[2] ++ vmla.s16 \d0, \s7, d1[3] ++ vmul.s16 \d1, \s2, d0[0] ++ vmla.s16 \d1, \s3, d0[1] ++ vmla.s16 \d1, \s4, d0[2] ++ vmla.s16 \d1, \s5, d0[3] ++ vmla.s16 \d1, \s6, d1[0] ++ vmla.s16 \d1, \s7, d1[1] ++ vmla.s16 \d1, \s8, d1[2] ++ vmla.s16 \d1, \s9, d1[3] ++.endm ++.macro 
mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 ++ vmul.s16 \d0, \s0, d0[0] ++ vmla.s16 \d0, \s1, d0[1] ++ vmla.s16 \d0, \s2, d0[2] ++ vmla.s16 \d0, \s3, d0[3] ++ vmla.s16 \d0, \s4, d1[0] ++ vmla.s16 \d0, \s5, d1[1] ++ vmla.s16 \d0, \s6, d1[2] ++ vmla.s16 \d0, \s7, d1[3] ++ vmul.s16 \d1, \s4, d0[0] ++ vmla.s16 \d1, \s5, d0[1] ++ vmla.s16 \d1, \s6, d0[2] ++ vmla.s16 \d1, \s7, d0[3] ++ vmla.s16 \d1, \s8, d1[0] ++ vmla.s16 \d1, \s9, d1[1] ++ vmla.s16 \d1, \s10, d1[2] ++ vmla.s16 \d1, \s11, d1[3] ++.endm ++.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 ++ vqrshrun.s16 \d0, \q0, #\shift ++.ifnb \q1 ++ vqrshrun.s16 \d1, \q1, #\shift ++.endif ++.ifnb \q2 ++ vqrshrun.s16 \d2, \q2, #\shift ++ vqrshrun.s16 \d3, \q3, #\shift ++.endif ++.endm ++.macro vrshr_s16 shift, r0, r1, r2, r3 ++ vrshr.s16 \r0, \r0, #\shift ++.ifnb \r1 ++ vrshr.s16 \r1, \r1, #\shift ++.endif ++.ifnb \r2 ++ vrshr.s16 \r2, \r2, #\shift ++ vrshr.s16 \r3, \r3, #\shift ++.endif ++.endm ++.macro st_16 strd, reg, lanes ++ vst1.16 {\reg[0]}, [r0, :16], \strd ++ vst1.16 {\reg[1]}, [r8, :16], \strd ++.if \lanes > 2 ++ vst1.16 {\reg[2]}, [r0, :16], \strd ++ vst1.16 {\reg[3]}, [r8, :16], \strd ++.endif ++.endm ++.macro st_32 strd, r0, r1 ++ vst1.32 {\r0[0]}, [r0, :32], \strd ++ vst1.32 {\r0[1]}, [r8, :32], \strd ++.ifnb \r1 ++ vst1.32 {\r1[0]}, [r0, :32], \strd ++ vst1.32 {\r1[1]}, [r8, :32], \strd ++.endif ++.endm ++.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 ++ vst1.8 {\r0}, [r0, \align], \strd ++ vst1.8 {\r1}, [r8, \align], \strd ++.ifnb \r2 ++ vst1.8 {\r2}, [r0, \align], \strd ++ vst1.8 {\r3}, [r8, \align], \strd ++.endif ++.ifnb \r4 ++ vst1.8 {\r4}, [r0, \align], \strd ++ vst1.8 {\r5}, [r8, \align], \strd ++ vst1.8 {\r6}, [r0, \align], \strd ++ vst1.8 {\r7}, [r8, \align], \strd ++.endif ++.endm ++.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3 ++.ifc \type, put ++ vqrshrun_s16 6, \q0, \d0, \q1, \d2 ++ st_32 \strd, \d0, \d2 ++.else ++ vrshr_s16 2, \q0, \q1 ++ st_reg \strd, :64, \d0, \d1, \d2, \d3 ++.endif ++.endm ++.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3 ++.ifc \type, put ++ vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 ++ st_reg \strd, :64, \d0, \d1, \d2, \d3 ++.else ++ vrshr_s16 2, \q0, \q1, \q2, \q3 ++ st_reg \strd, :128,\q0, \q1, \q2, \q3 ++.endif ++.endm ++.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3 ++.ifc \type, put ++ vqrshrun.s16 \d0, \q0, #6 ++ vqrshrun.s16 \d1, \q1, #6 ++ vqrshrun.s16 \d4, \q2, #6 ++ vqrshrun.s16 \d5, \q3, #6 ++ st_reg \strd, :128, \q0, \q2 ++.else ++ vrshr_s16 2, \q0, \q1, \q2, \q3 ++ vst1.16 {\q0, \q1}, [r0, :128], \strd ++ vst1.16 {\q2, \q3}, [r8, :128], \strd ++.endif ++.endm ++ ++.macro make_8tap_fn op, type, type_h, type_v ++function \op\()_8tap_\type\()_8bpc_neon, export=1 ++ push {r4-r11,lr} ++ movw r8, \type_h ++ movw r9, \type_v ++ b \op\()_8tap ++endfunc ++.endm ++ ++// No spaces in these expressions, due to gas-preprocessor. 
++#define REGULAR ((0*15<<7)|3*15) ++#define SMOOTH ((1*15<<7)|4*15) ++#define SHARP ((2*15<<7)|3*15) ++ ++.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv ++make_8tap_fn \type, regular, REGULAR, REGULAR ++make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH ++make_8tap_fn \type, regular_sharp, REGULAR, SHARP ++make_8tap_fn \type, smooth, SMOOTH, SMOOTH ++make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR ++make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP ++make_8tap_fn \type, sharp, SHARP, SHARP ++make_8tap_fn \type, sharp_regular, SHARP, REGULAR ++make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH ++ ++function \type\()_8tap ++ ldrd r4, r5, [sp, #36] ++ ldrd r6, r7, [sp, #44] ++ movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) ++ mul \mx, \mx, r10 ++ mul \my, \my, r10 ++ add \mx, \mx, r8 // mx, 8tap_h, 4tap_h ++ add \my, \my, r9 // my, 8tap_v, 4tap_v ++.ifc \type, prep ++ lsl \d_strd, \w, #1 ++.endif ++ ++ clz r8, \w ++ tst \mx, #(0x7f << 14) ++ sub r8, r8, #24 ++ movrel r10, X(mc_subpel_filters), -8 ++ bne L(\type\()_8tap_h) ++ tst \my, #(0x7f << 14) ++ bne L(\type\()_8tap_v) ++ b \type ++ ++L(\type\()_8tap_h): ++ cmp \w, #4 ++ ubfx r9, \mx, #7, #7 ++ and \mx, \mx, #0x7f ++ it gt ++ movgt \mx, r9 ++ tst \my, #(0x7f << 14) ++ add \mx, r10, \mx, lsl #3 ++ bne L(\type\()_8tap_hv) ++ ++ adr r9, L(\type\()_8tap_h_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_8tap_h_tbl): ++ .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB ++ ++20: // 2xN h ++.ifc \type, put ++ add \mx, \mx, #2 ++ vld1.32 {d0[]}, [\mx] ++ sub \src, \src, #1 ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ vmovl.s8 q0, d0 ++2: ++ vld1.8 {d4}, [\src], \s_strd ++ vld1.8 {d6}, [\sr2], \s_strd ++ vmovl.u8 q2, d4 ++ vmovl.u8 q3, d6 ++ vext.8 d5, d4, d5, #2 ++ vext.8 d7, d6, d7, #2 ++ subs \h, \h, #2 ++ vtrn.32 d4, d6 ++ vtrn.32 d5, d7 ++ vmul.s16 d2, d4, d0[0] ++ vmla.s16 d2, d5, d0[1] ++ vmla.s16 d2, d6, d0[2] ++ vmla.s16 d2, d7, d0[3] ++ vrshr.s16 d2, d2, #2 ++ vqrshrun.s16 d2, q1, #4 ++ vst1.16 {d2[0]}, [\dst, :16], \d_strd ++ vst1.16 {d2[1]}, [\ds2, :16], \d_strd ++ bgt 2b ++ pop {r4-r11,pc} ++.endif ++ ++40: // 4xN h ++ add \mx, \mx, #2 ++ vld1.32 {d0[]}, [\mx] ++ sub \src, \src, #1 ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ vmovl.s8 q0, d0 ++4: ++ vld1.8 {d16}, [\src], \s_strd ++ vld1.8 {d24}, [\sr2], \s_strd ++ vmovl.u8 q8, d16 ++ vmovl.u8 q12, d24 ++ vext.8 q9, q8, q8, #2 ++ vext.8 q10, q8, q8, #4 ++ vext.8 q11, q8, q8, #6 ++ vext.8 q13, q12, q12, #2 ++ vext.8 q14, q12, q12, #4 ++ vext.8 q15, q12, q12, #6 ++ subs \h, \h, #2 ++ vmul.s16 d4, d16, d0[0] ++ vmla.s16 d4, d18, d0[1] ++ vmla.s16 d4, d20, d0[2] ++ vmla.s16 d4, d22, d0[3] ++ vmul.s16 d5, d24, d0[0] ++ vmla.s16 d5, d26, d0[1] ++ vmla.s16 d5, d28, d0[2] ++ vmla.s16 d5, d30, d0[3] ++ vrshr.s16 q2, q2, #2 ++.ifc \type, put ++ vqrshrun.s16 d4, q2, #4 ++ vst1.32 {d4[0]}, [\dst, :32], \d_strd ++ vst1.32 {d4[1]}, [\ds2, :32], \d_strd ++.else ++ vst1.16 {d4}, [\dst, :64], \d_strd ++ vst1.16 {d5}, [\ds2, :64], \d_strd ++.endif ++ bgt 4b ++ pop 
{r4-r11,pc} ++ ++80: // 8xN h ++ vld1.8 {d0}, [\mx] ++ sub \src, \src, #3 ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ vmovl.s8 q0, d0 ++8: ++ vld1.8 {q8}, [\src], \s_strd ++ vld1.8 {q12}, [\sr2], \s_strd ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++ vmovl.u8 q13, d25 ++ vmovl.u8 q12, d24 ++ ++ vmul.s16 q10, q8, d0[0] ++ vmul.s16 q14, q12, d0[0] ++.irpc i, 1234567 ++ vext.8 q11, q8, q9, #(2*\i) ++ vext.8 q15, q12, q13, #(2*\i) ++.if \i < 4 ++ vmla.s16 q10, q11, d0[\i] ++ vmla.s16 q14, q15, d0[\i] ++.else ++ vmla.s16 q10, q11, d1[\i-4] ++ vmla.s16 q14, q15, d1[\i-4] ++.endif ++.endr ++ subs \h, \h, #2 ++ vrshr.s16 q10, q10, #2 ++ vrshr.s16 q14, q14, #2 ++.ifc \type, put ++ vqrshrun.s16 d20, q10, #4 ++ vqrshrun.s16 d28, q14, #4 ++ vst1.8 {d20}, [\dst, :64], \d_strd ++ vst1.8 {d28}, [\ds2, :64], \d_strd ++.else ++ vst1.16 {q10}, [\dst, :128], \d_strd ++ vst1.16 {q14}, [\ds2, :128], \d_strd ++.endif ++ bgt 8b ++ pop {r4-r11,pc} ++ ++160: ++320: ++640: ++1280: // 16xN, 32xN, ... h ++ // This could be done without touching q4-q6, by using only ++ // one temporary for vext in the loop. That's slower on A7 and A53, ++ // (but surprisingly, marginally faster on A8 and A73). ++ vpush {q4-q6} ++ vld1.8 {d0}, [\mx] ++ sub \src, \src, #3 ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ sub \s_strd, \s_strd, \w ++ sub \s_strd, \s_strd, #8 ++.ifc \type, put ++ lsl \d_strd, \d_strd, #1 ++ sub \d_strd, \d_strd, \w ++.endif ++161: ++ vld1.8 {d16, d17, d18}, [\src]! ++ vld1.8 {d24, d25, d26}, [\sr2]! ++ mov \mx, \w ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++ vmovl.u8 q14, d26 ++ vmovl.u8 q13, d25 ++ vmovl.u8 q12, d24 ++ ++16: ++ vmul.s16 q1, q8, d0[0] ++ vmul.s16 q2, q9, d0[0] ++ vmul.s16 q3, q12, d0[0] ++ vmul.s16 q4, q13, d0[0] ++.irpc i, 1234567 ++ vext.8 q5, q8, q9, #(2*\i) ++ vext.8 q6, q9, q10, #(2*\i) ++ vext.8 q11, q12, q13, #(2*\i) ++ vext.8 q15, q13, q14, #(2*\i) ++.if \i < 4 ++ vmla.s16 q1, q5, d0[\i] ++ vmla.s16 q2, q6, d0[\i] ++ vmla.s16 q3, q11, d0[\i] ++ vmla.s16 q4, q15, d0[\i] ++.else ++ vmla.s16 q1, q5, d1[\i-4] ++ vmla.s16 q2, q6, d1[\i-4] ++ vmla.s16 q3, q11, d1[\i-4] ++ vmla.s16 q4, q15, d1[\i-4] ++.endif ++.endr ++ vrshr.s16 q1, q1, #2 ++ vrshr.s16 q2, q2, #2 ++ vrshr.s16 q3, q3, #2 ++ vrshr.s16 q4, q4, #2 ++ subs \mx, \mx, #16 ++.ifc \type, put ++ vqrshrun.s16 d2, q1, #4 ++ vqrshrun.s16 d3, q2, #4 ++ vqrshrun.s16 d4, q3, #4 ++ vqrshrun.s16 d5, q4, #4 ++ vst1.8 {q1}, [\dst, :128]! ++ vst1.8 {q2}, [\ds2, :128]! ++.else ++ vst1.16 {q1, q2}, [\dst, :128]! ++ vst1.16 {q3, q4}, [\ds2, :128]! ++.endif ++ ble 9f ++ ++ vmov q8, q10 ++ vmov q12, q14 ++ vld1.8 {d18, d19}, [\src]! ++ vld1.8 {d26, d27}, [\sr2]! 
++ vmovl.u8 q10, d19 ++ vmovl.u8 q9, d18 ++ vmovl.u8 q14, d27 ++ vmovl.u8 q13, d26 ++ b 16b ++ ++9: ++ add \dst, \dst, \d_strd ++ add \ds2, \ds2, \d_strd ++ add \src, \src, \s_strd ++ add \sr2, \sr2, \s_strd ++ ++ subs \h, \h, #2 ++ bgt 161b ++ vpop {q4-q6} ++ pop {r4-r11,pc} ++ ++L(\type\()_8tap_v): ++ cmp \h, #4 ++ ubfx r9, \my, #7, #7 ++ and \my, \my, #0x7f ++ it gt ++ movgt \my, r9 ++ add \my, r10, \my, lsl #3 ++ ++ adr r9, L(\type\()_8tap_v_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_8tap_v_tbl): ++ .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB ++ ++20: // 2xN v ++.ifc \type, put ++ bgt 28f ++ ++ cmp \h, #2 ++ add \my, \my, #2 ++ vld1.32 {d0[]}, [\my] ++ sub \src, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ // 2x2 v ++ load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 ++ interleave_1_16 d1, d2, d3, d4, d5 ++ bgt 24f ++ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 ++ mul_mla_4 d6, d16, d18, d20, d22 ++ vqrshrun_s16 6, q3, d6 ++ st_16 \d_strd, d6, 2 ++ pop {r4-r11,pc} ++ ++24: // 2x4 v ++ load_16 \sr2, \src, \s_strd, d6, d7 ++ interleave_1_16 d5, d6, d7 ++ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6 ++ vmov d17, d20 ++ vmov d19, d22 ++ vmov d21, d24 ++ vmov d23, d26 ++ mul_mla_4 q3, q8, q9, q10, q11 ++ vqrshrun_s16 6, q3, d6 ++ st_16 \d_strd, d6, 4 ++ pop {r4-r11,pc} ++ ++28: // 2x8, 2x16 v ++ vpush {q4-q7} ++ vld1.8 {d0}, [\my] ++ sub \sr2, \src, \s_strd, lsl #1 ++ add \ds2, \dst, \d_strd ++ sub \src, \sr2, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14 ++ interleave_1_16 d2, d4, d6, d8, d10 ++ interleave_1_16 d10, d12, d14 ++ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12 ++ vmov d3, d6 ++ vmov d5, d8 ++ vmov d7, d10 ++ vmov d9, d12 ++216: ++ subs \h, \h, #8 ++ load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 ++ load_16 \sr2, \src, \s_strd, d24, d26, d28, d30 ++ interleave_1_16 d14, d16, d18, d20, d22 ++ interleave_1_16 d22, d24, d26, d28, d30 ++ vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 ++ vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28 ++ vmov d11, d14 ++ vmov d13, d16 ++ vmov d15, d18 ++ vmov d17, d20 ++ vmov d19, d22 ++ vmov d21, d24 ++ vmov d23, d26 ++ vmov d25, d28 ++ mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12 ++ vqrshrun_s16 6, q1, d2, q2, d4 ++ st_16 \d_strd, d2, 4 ++ st_16 \d_strd, d4, 4 ++ ble 0f ++ vmov q1, q9 ++ vmov q2, q10 ++ vmov q3, q11 ++ vmov q4, q12 ++ vmov q5, q13 ++ vmov q6, q14 ++ vmov d14, d30 ++ b 216b ++0: ++ vpop {q4-q7} ++ pop {r4-r11,pc} ++.endif ++ ++40: ++ bgt 480f ++ ++ // 4x2, 4x4 v ++ cmp \h, #2 ++ add \my, \my, #2 ++ vld1.32 {d0[]}, [\my] ++ sub \src, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 ++ interleave_1_32 d1, d2, d3, d4, d5 ++ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 ++ mul_mla_4 q3, q8, q9, q10, q11 ++ shift_store_4 \type, \d_strd, q3, d6, d7 ++ ble 0f ++ load_32 \sr2, \src, 
\s_strd, d6, d7 ++ interleave_1_32 d5, d6, d7 ++ vmovl_u8 q12, d5, q13, d6 ++ mul_mla_4 q3, q10, q11, q12, q13 ++ shift_store_4 \type, \d_strd, q3, d6, d7 ++0: ++ pop {r4-r11,pc} ++ ++480: // 4x8, 4x16 v ++ vpush {q4} ++ vld1.8 {d0}, [\my] ++ sub \sr2, \src, \s_strd, lsl #1 ++ add \ds2, \dst, \d_strd ++ sub \src, \sr2, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 ++ interleave_1_32 d2, d4, d6 ++ interleave_1_32 d6, d8, d16, d18, d20 ++ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18 ++ ++48: ++ subs \h, \h, #4 ++ load_32 \sr2, \src, \s_strd, d22, d24, d26, d28 ++ interleave_1_32 d20, d22, d24, d26, d28 ++ vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26 ++ mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 ++ shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 ++ ble 0f ++ subs \h, \h, #4 ++ load_32 \sr2, \src, \s_strd, d30, d2, d4, d6 ++ interleave_1_32 d28, d30, d2, d4, d6 ++ vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4 ++ mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2 ++ shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19 ++ ble 0f ++ subs \h, \h, #4 ++ load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 ++ interleave_1_32 d6, d8, d16, d18, d20 ++ vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 ++ mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 ++ shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 ++ b 48b ++0: ++ vpop {q4} ++ pop {r4-r11,pc} ++ ++80: ++ bgt 880f ++ ++ // 8x2, 8x4 v ++ cmp \h, #2 ++ add \my, \my, #2 ++ vld1.32 {d0[]}, [\my] ++ sub \src, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 ++ vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5 ++ mul_mla_4 q1, q8, q9, q10, q11 ++ mul_mla_4 q2, q9, q10, q11, q12 ++ shift_store_8 \type, \d_strd, q1, d2, q2, d4 ++ ble 0f ++ load_reg \sr2, \src, \s_strd, d6, d7 ++ vmovl_u8 q13, d6, q14, d7 ++ mul_mla_4 q1, q10, q11, q12, q13 ++ mul_mla_4 q2, q11, q12, q13, q14 ++ shift_store_8 \type, \d_strd, q1, d2, q2, d4 ++0: ++ pop {r4-r11,pc} ++ ++880: // 8x8, 8x16, 8x32 v ++1680: // 16x8, 16x16, ... ++320: // 32x8, 32x16, ... 
++640: ++1280: ++ vpush {q4} ++ vld1.8 {d0}, [\my] ++ sub \src, \src, \s_strd ++ sub \src, \src, \s_strd, lsl #1 ++ vmovl.s8 q0, d0 ++ mov \my, \h ++168: ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 ++ vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20 ++ ++88: ++ subs \h, \h, #2 ++ load_reg \sr2, \src, \s_strd, d22, d24 ++ vmovl_u8 q11, d22, q12, d24 ++ mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12 ++ shift_store_8 \type, \d_strd, q1, d2, q2, d4 ++ ble 9f ++ subs \h, \h, #2 ++ load_reg \sr2, \src, \s_strd, d26, d28 ++ vmovl_u8 q13, d26, q14, d28 ++ mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 ++ shift_store_8 \type, \d_strd, q3, d6, q4, d8 ++ ble 9f ++ subs \h, \h, #4 ++ load_reg \sr2, \src, \s_strd, d30, d2, d4, d6 ++ vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6 ++ mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 ++ mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 ++ shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22 ++ ble 9f ++ subs \h, \h, #4 ++ load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 ++ vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20 ++ mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8 ++ mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10 ++ shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30 ++ bgt 88b ++9: ++ subs \w, \w, #8 ++ ble 0f ++ asr \s_strd, \s_strd, #1 ++ asr \d_strd, \d_strd, #1 ++ mls \src, \s_strd, \my, \src ++ mls \dst, \d_strd, \my, \dst ++ sub \src, \src, \s_strd, lsl #3 ++ mov \h, \my ++ add \src, \src, #8 ++.ifc \type, put ++ add \dst, \dst, #8 ++.else ++ add \dst, \dst, #16 ++.endif ++ b 168b ++0: ++ vpop {q4} ++ pop {r4-r11,pc} ++ ++160: ++ bgt 1680b ++ ++ // 16x2, 16x4 v ++ add \my, \my, #2 ++ vld1.32 {d0[]}, [\my] ++ sub \src, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ ++ cmp \h, #2 ++ load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15 ++ vmovl.u8 q1, d22 ++ vmovl.u8 q2, d24 ++ vmovl.u8 q3, d26 ++ vmovl.u8 q8, d28 ++ vmovl.u8 q9, d30 ++ vmovl.u8 q11, d23 ++ vmovl.u8 q12, d25 ++ vmovl.u8 q13, d27 ++ vmovl.u8 q14, d29 ++ vmovl.u8 q15, d31 ++ mul_mla_4 q1, q1, q2, q3, q8 ++ mul_mla_4 q10, q2, q3, q8, q9 ++ mul_mla_4 q2, q11, q12, q13, q14 ++ mul_mla_4 q11, q12, q13, q14, q15 ++ shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11 ++ ble 0f ++ load_reg \sr2, \src, \s_strd, q10, q11 ++ vmovl.u8 q1, d20 ++ vmovl.u8 q10, d21 ++ vmovl.u8 q12, d22 ++ vmovl.u8 q11, d23 ++ mul_mla_4 q2, q3, q8, q9, q1 ++ mul_mla_4 q3, q13, q14, q15, q10 ++ mul_mla_4 q13, q8, q9, q1, q12 ++ mul_mla_4 q14, q14, q15, q10, q11 ++ shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14 ++0: ++ pop {r4-r11,pc} ++ ++L(\type\()_8tap_hv): ++ cmp \h, #4 ++ ubfx r9, \my, #7, #7 ++ and \my, \my, #0x7f ++ it gt ++ movgt \my, r9 ++ add \my, r10, \my, lsl #3 ++ ++ adr r9, L(\type\()_8tap_hv_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_8tap_hv_tbl): ++ .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ .word 20f 
- L(\type\()_8tap_hv_tbl) + CONFIG_THUMB ++ ++20: ++.ifc \type, put ++ add \mx, \mx, #2 ++ vld1.32 {d0[]}, [\mx] ++ bgt 280f ++ add \my, \my, #2 ++ vld1.32 {d2[]}, [\my] ++ ++ // 2x2, 2x4 hv ++ sub \sr2, \src, #1 ++ sub \src, \sr2, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ ++ ++ vld1.8 {d26}, [\src], \s_strd ++ vmovl.u8 q13, d26 ++ vext.8 q14, q13, q13, #2 ++ vmul.s16 d26, d26, d0 ++ vmul.s16 d28, d28, d0 ++ vpadd.s16 d26, d26, d28 ++ vpadd.s16 d26, d26, d26 ++ vrshr.s16 d16, d26, #2 ++ bl L(\type\()_8tap_filter_2) ++ ++ vext.8 d16, d16, d16, #4 ++ vmov d17, d26 ++ vext.8 d16, d16, d26, #4 ++ ++2: ++ bl L(\type\()_8tap_filter_2) ++ ++ vext.8 d18, d17, d26, #4 ++ vmov d19, d26 ++ vmull.s16 q2, d16, d2[0] ++ vmlal.s16 q2, d17, d2[1] ++ vmlal.s16 q2, d18, d2[2] ++ vmlal.s16 q2, d19, d2[3] ++ ++ vqrshrn.s32 d4, q2, #\shift_hv ++ vqmovun.s16 d4, q2 ++ subs \h, \h, #2 ++ vst1.16 {d4[0]}, [\dst, :16], \d_strd ++ vst1.16 {d4[1]}, [\ds2, :16], \d_strd ++ ble 0f ++ vmov d16, d18 ++ vmov d17, d19 ++ b 2b ++ ++280: // 2x8, 2x16, 2x32 hv ++ vld1.8 {d2}, [\my] ++ sub \src, \src, #1 ++ sub \sr2, \src, \s_strd, lsl #1 ++ sub \src, \sr2, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ ++ vld1.8 {d26}, [\src], \s_strd ++ vmovl.u8 q13, d26 ++ vext.8 q14, q13, q13, #2 ++ vmul.s16 d26, d26, d0 ++ vmul.s16 d28, d28, d0 ++ vpadd.s16 d26, d26, d28 ++ vpadd.s16 d26, d26, d26 ++ vrshr.s16 d16, d26, #2 ++ ++ bl L(\type\()_8tap_filter_2) ++ vext.8 d16, d16, d16, #4 ++ vmov d17, d26 ++ vext.8 d16, d16, d26, #4 ++ bl L(\type\()_8tap_filter_2) ++ vext.8 d18, d17, d26, #4 ++ vmov d19, d26 ++ bl L(\type\()_8tap_filter_2) ++ vext.8 d20, d19, d26, #4 ++ vmov d21, d26 ++ ++28: ++ bl L(\type\()_8tap_filter_2) ++ vext.8 d22, d21, d26, #4 ++ vmov d23, d26 ++ vmull.s16 q2, d16, d2[0] ++ vmlal.s16 q2, d17, d2[1] ++ vmlal.s16 q2, d18, d2[2] ++ vmlal.s16 q2, d19, d2[3] ++ vmlal.s16 q2, d20, d3[0] ++ vmlal.s16 q2, d21, d3[1] ++ vmlal.s16 q2, d22, d3[2] ++ vmlal.s16 q2, d23, d3[3] ++ ++ vqrshrn.s32 d4, q2, #\shift_hv ++ vqmovun.s16 d4, q2 ++ subs \h, \h, #2 ++ vst1.16 {d4[0]}, [\dst, :16], \d_strd ++ vst1.16 {d4[1]}, [\ds2, :16], \d_strd ++ ble 0f ++ vmov d16, d18 ++ vmov d17, d19 ++ vmov d18, d20 ++ vmov d19, d21 ++ vmov d20, d22 ++ vmov d21, d23 ++ b 28b ++ ++0: ++ pop {r4-r11,pc} ++ ++L(\type\()_8tap_filter_2): ++ vld1.8 {d28}, [\sr2], \s_strd ++ vld1.8 {d30}, [\src], \s_strd ++ vext.8 d29, d28, d28, #1 ++ vext.8 d31, d30, d30, #1 ++ vmovl.u8 q13, d28 ++ vmovl.u8 q14, d29 ++ vmov d27, d28 ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vtrn.32 d26, d28 ++ vtrn.32 d27, d30 ++ vmul.s16 d26, d26, d0[0] ++ vmla.s16 d26, d27, d0[1] ++ vmla.s16 d26, d28, d0[2] ++ vmla.s16 d26, d30, d0[3] ++ vrshr.s16 d26, d26, #2 ++ vext.8 d27, d26, d26, #4 ++ bx lr ++.endif ++ ++40: ++ add \mx, \mx, #2 ++ vld1.32 {d0[]}, [\mx] ++ bgt 480f ++ add \my, \my, #2 ++ vld1.32 {d2[]}, [\my] ++ sub \sr2, \src, #1 ++ sub \src, \sr2, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ ++ // 4x2, 4x4 hv ++ vld1.8 {d30}, [\src], \s_strd ++ vmovl.u8 q14, d30 ++ vext.8 d27, d28, d29, #2 ++ vext.8 d30, d28, d29, #4 ++ vext.8 d31, d28, d29, #6 ++ vmul.s16 d26, d28, d0[0] ++ vmla.s16 d26, d27, d0[1] ++ vmla.s16 d26, d30, d0[2] ++ vmla.s16 d26, d31, d0[3] ++ vrshr.s16 d16, d26, #2 ++ ++ bl L(\type\()_8tap_filter_4) ++ vmov 
d17, d26 ++ vmov d18, d27 ++ ++4: ++ bl L(\type\()_8tap_filter_4) ++ vmull.s16 q2, d16, d2[0] ++ vmlal.s16 q2, d17, d2[1] ++ vmlal.s16 q2, d18, d2[2] ++ vmlal.s16 q2, d26, d2[3] ++ vmull.s16 q3, d17, d2[0] ++ vmlal.s16 q3, d18, d2[1] ++ vmlal.s16 q3, d26, d2[2] ++ vmlal.s16 q3, d27, d2[3] ++ vqrshrn.s32 d4, q2, #\shift_hv ++ vqrshrn.s32 d6, q3, #\shift_hv ++ subs \h, \h, #2 ++.ifc \type, put ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d6, q3 ++ vst1.32 {d4[0]}, [\dst, :32], \d_strd ++ vst1.32 {d6[0]}, [\ds2, :32], \d_strd ++.else ++ vst1.16 {d4}, [\dst, :64], \d_strd ++ vst1.16 {d6}, [\ds2, :64], \d_strd ++.endif ++ ble 0f ++ vmov d16, d18 ++ vmov d17, d26 ++ vmov d18, d27 ++ b 4b ++ ++480: // 4x8, 4x16, 4x32 hv ++ vld1.8 {d2}, [\my] ++ sub \src, \src, #1 ++ sub \sr2, \src, \s_strd, lsl #1 ++ sub \src, \sr2, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ ++ vld1.8 {d30}, [\src], \s_strd ++ vmovl.u8 q14, d30 ++ vext.8 d27, d28, d29, #2 ++ vext.8 d30, d28, d29, #4 ++ vext.8 d31, d28, d29, #6 ++ vmul.s16 d26, d28, d0[0] ++ vmla.s16 d26, d27, d0[1] ++ vmla.s16 d26, d30, d0[2] ++ vmla.s16 d26, d31, d0[3] ++ vrshr.s16 d16, d26, #2 ++ ++ bl L(\type\()_8tap_filter_4) ++ vmov d17, d26 ++ vmov d18, d27 ++ bl L(\type\()_8tap_filter_4) ++ vmov d19, d26 ++ vmov d20, d27 ++ bl L(\type\()_8tap_filter_4) ++ vmov d21, d26 ++ vmov d22, d27 ++ ++48: ++ bl L(\type\()_8tap_filter_4) ++ vmull.s16 q2, d16, d2[0] ++ vmlal.s16 q2, d17, d2[1] ++ vmlal.s16 q2, d18, d2[2] ++ vmlal.s16 q2, d19, d2[3] ++ vmlal.s16 q2, d20, d3[0] ++ vmlal.s16 q2, d21, d3[1] ++ vmlal.s16 q2, d22, d3[2] ++ vmlal.s16 q2, d26, d3[3] ++ vmull.s16 q3, d17, d2[0] ++ vmlal.s16 q3, d18, d2[1] ++ vmlal.s16 q3, d19, d2[2] ++ vmlal.s16 q3, d20, d2[3] ++ vmlal.s16 q3, d21, d3[0] ++ vmlal.s16 q3, d22, d3[1] ++ vmlal.s16 q3, d26, d3[2] ++ vmlal.s16 q3, d27, d3[3] ++ vqrshrn.s32 d4, q2, #\shift_hv ++ vqrshrn.s32 d6, q3, #\shift_hv ++ subs \h, \h, #2 ++.ifc \type, put ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d6, q3 ++ vst1.32 {d4[0]}, [\dst, :32], \d_strd ++ vst1.32 {d6[0]}, [\ds2, :32], \d_strd ++.else ++ vst1.16 {d4}, [\dst, :64], \d_strd ++ vst1.16 {d6}, [\ds2, :64], \d_strd ++.endif ++ ble 0f ++ vmov d16, d18 ++ vmov d17, d19 ++ vmov d18, d20 ++ vmov d19, d21 ++ vmov d20, d22 ++ vmov d21, d26 ++ vmov d22, d27 ++ b 48b ++0: ++ pop {r4-r11,pc} ++ ++L(\type\()_8tap_filter_4): ++ vld1.8 {d30}, [\sr2], \s_strd ++ vld1.8 {d31}, [\src], \s_strd ++ vmovl.u8 q14, d30 ++ vext.8 d27, d28, d29, #2 ++ vext.8 d30, d28, d29, #4 ++ vext.8 d1, d28, d29, #6 ++ vmul.s16 d26, d28, d0[0] ++ vmla.s16 d26, d27, d0[1] ++ vmla.s16 d26, d30, d0[2] ++ vmla.s16 d26, d1, d0[3] ++ ++ vmovl.u8 q14, d31 ++ vext.8 d30, d28, d29, #2 ++ vext.8 d31, d28, d29, #4 ++ vext.8 d1, d28, d29, #6 ++ vmul.s16 d27, d28, d0[0] ++ vmla.s16 d27, d30, d0[1] ++ vmla.s16 d27, d31, d0[2] ++ vmla.s16 d27, d1, d0[3] ++ vrshr.s16 d26, d26, #2 ++ vrshr.s16 d27, d27, #2 ++ bx lr ++ ++80: ++160: ++320: ++ bgt 880f ++ vpush {q4-q7} ++ add \my, \my, #2 ++ vld1.8 {d0}, [\mx] ++ vld1.32 {d2[]}, [\my] ++ sub \src, \src, #3 ++ sub \src, \src, \s_strd ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ mov \my, \h ++ ++164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ ++ vld1.8 {q14}, [\src], \s_strd ++ vmovl.u8 q12, d28 ++ vmovl.u8 q13, d29 ++ vmul.s16 q10, q12, d0[0] ++.irpc i, 123 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d0[\i] 
++.endr ++.irpc i, 4567 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d1[\i-4] ++.endr ++ vrshr.s16 q3, q10, #2 ++ ++ bl L(\type\()_8tap_filter_8) ++ vmov q4, q10 ++ vmov q5, q11 ++ ++8: ++ bl L(\type\()_8tap_filter_8) ++ vmull.s16 q12, d6, d2[0] ++ vmull.s16 q13, d7, d2[0] ++ vmull.s16 q14, d8, d2[0] ++ vmull.s16 q15, d9, d2[0] ++ vmlal.s16 q12, d8, d2[1] ++ vmlal.s16 q13, d9, d2[1] ++ vmlal.s16 q14, d10, d2[1] ++ vmlal.s16 q15, d11, d2[1] ++ vmlal.s16 q12, d10, d2[2] ++ vmlal.s16 q13, d11, d2[2] ++ vmlal.s16 q14, d20, d2[2] ++ vmlal.s16 q15, d21, d2[2] ++ vmlal.s16 q12, d20, d2[3] ++ vmlal.s16 q13, d21, d2[3] ++ vmlal.s16 q14, d22, d2[3] ++ vmlal.s16 q15, d23, d2[3] ++ vqrshrn.s32 d24, q12, #\shift_hv ++ vqrshrn.s32 d25, q13, #\shift_hv ++ vqrshrn.s32 d28, q14, #\shift_hv ++ vqrshrn.s32 d29, q15, #\shift_hv ++ subs \h, \h, #2 ++.ifc \type, put ++ vqmovun.s16 d24, q12 ++ vqmovun.s16 d28, q14 ++ vst1.8 {d24}, [\dst, :64], \d_strd ++ vst1.8 {d28}, [\ds2, :64], \d_strd ++.else ++ vst1.16 {q12}, [\dst, :128], \d_strd ++ vst1.16 {q14}, [\ds2, :128], \d_strd ++.endif ++ ble 9f ++ vmov q3, q5 ++ vmov q4, q10 ++ vmov q5, q11 ++ b 8b ++9: ++ subs \w, \w, #8 ++ ble 0f ++ asr \s_strd, \s_strd, #1 ++ asr \d_strd, \d_strd, #1 ++ mls \src, \s_strd, \my, \src ++ mls \dst, \d_strd, \my, \dst ++ sub \src, \src, \s_strd, lsl #2 ++ mov \h, \my ++ add \src, \src, #8 ++.ifc \type, put ++ add \dst, \dst, #8 ++.else ++ add \dst, \dst, #16 ++.endif ++ b 164b ++ ++880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv ++640: ++1280: ++ vpush {q4-q7} ++ vld1.8 {d0}, [\mx] ++ vld1.8 {d2}, [\my] ++ sub \src, \src, #3 ++ sub \src, \src, \s_strd ++ sub \src, \src, \s_strd, lsl #1 ++ vmovl.s8 q0, d0 ++ vmovl.s8 q1, d2 ++ mov \my, \h ++ ++168: ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++ ++ vld1.8 {q14}, [\src], \s_strd ++ vmovl.u8 q12, d28 ++ vmovl.u8 q13, d29 ++ vmul.s16 q10, q12, d0[0] ++.irpc i, 123 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d0[\i] ++.endr ++.irpc i, 4567 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d1[\i-4] ++.endr ++ vrshr.s16 q3, q10, #2 ++ ++ bl L(\type\()_8tap_filter_8) ++ vmov q4, q10 ++ vmov q5, q11 ++ bl L(\type\()_8tap_filter_8) ++ vmov q6, q10 ++ vmov q7, q11 ++ bl L(\type\()_8tap_filter_8) ++ vmov q8, q10 ++ vmov q9, q11 ++ ++88: ++ bl L(\type\()_8tap_filter_8) ++ vmull.s16 q12, d6, d2[0] ++ vmull.s16 q13, d7, d2[0] ++ vmull.s16 q14, d8, d2[0] ++ vmull.s16 q15, d9, d2[0] ++ vmlal.s16 q12, d8, d2[1] ++ vmlal.s16 q13, d9, d2[1] ++ vmlal.s16 q14, d10, d2[1] ++ vmlal.s16 q15, d11, d2[1] ++ vmlal.s16 q12, d10, d2[2] ++ vmlal.s16 q13, d11, d2[2] ++ vmlal.s16 q14, d12, d2[2] ++ vmlal.s16 q15, d13, d2[2] ++ vmlal.s16 q12, d12, d2[3] ++ vmlal.s16 q13, d13, d2[3] ++ vmlal.s16 q14, d14, d2[3] ++ vmlal.s16 q15, d15, d2[3] ++ vmlal.s16 q12, d14, d3[0] ++ vmlal.s16 q13, d15, d3[0] ++ vmlal.s16 q14, d16, d3[0] ++ vmlal.s16 q15, d17, d3[0] ++ vmlal.s16 q12, d16, d3[1] ++ vmlal.s16 q13, d17, d3[1] ++ vmlal.s16 q14, d18, d3[1] ++ vmlal.s16 q15, d19, d3[1] ++ vmlal.s16 q12, d18, d3[2] ++ vmlal.s16 q13, d19, d3[2] ++ vmlal.s16 q14, d20, d3[2] ++ vmlal.s16 q15, d21, d3[2] ++ vmlal.s16 q12, d20, d3[3] ++ vmlal.s16 q13, d21, d3[3] ++ vmlal.s16 q14, d22, d3[3] ++ vmlal.s16 q15, d23, d3[3] ++ vqrshrn.s32 d24, q12, #\shift_hv ++ vqrshrn.s32 d25, q13, #\shift_hv ++ vqrshrn.s32 d28, q14, #\shift_hv ++ vqrshrn.s32 d29, q15, #\shift_hv ++ subs \h, \h, #2 ++.ifc \type, put ++ vqmovun.s16 d24, q12 ++ vqmovun.s16 d28, q14 ++ 
vst1.8 {d24}, [\dst, :64], \d_strd ++ vst1.8 {d28}, [\ds2, :64], \d_strd ++.else ++ vst1.16 {q12}, [\dst, :128], \d_strd ++ vst1.16 {q14}, [\ds2, :128], \d_strd ++.endif ++ ble 9f ++ vmov q3, q5 ++ vmov q4, q6 ++ vmov q5, q7 ++ vmov q6, q8 ++ vmov q7, q9 ++ vmov q8, q10 ++ vmov q9, q11 ++ b 88b ++9: ++ subs \w, \w, #8 ++ ble 0f ++ asr \s_strd, \s_strd, #1 ++ asr \d_strd, \d_strd, #1 ++ mls \src, \s_strd, \my, \src ++ mls \dst, \d_strd, \my, \dst ++ sub \src, \src, \s_strd, lsl #3 ++ mov \h, \my ++ add \src, \src, #8 ++.ifc \type, put ++ add \dst, \dst, #8 ++.else ++ add \dst, \dst, #16 ++.endif ++ b 168b ++0: ++ vpop {q4-q7} ++ pop {r4-r11,pc} ++ ++L(\type\()_8tap_filter_8): ++ vld1.8 {q14}, [\sr2], \s_strd ++ vld1.8 {q15}, [\src], \s_strd ++ vmovl.u8 q12, d28 ++ vmovl.u8 q13, d29 ++ vmul.s16 q10, q12, d0[0] ++.irpc i, 123 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d0[\i] ++.endr ++.irpc i, 4567 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q10, q14, d1[\i-4] ++.endr ++ vmovl.u8 q12, d30 ++ vmovl.u8 q13, d31 ++ vmul.s16 q11, q12, d0[0] ++.irpc i, 123 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q11, q14, d0[\i] ++.endr ++.irpc i, 4567 ++ vext.8 q14, q12, q13, #(2*\i) ++ vmla.s16 q11, q14, d1[\i-4] ++.endr ++ vrshr.s16 q10, q10, #2 ++ vrshr.s16 q11, q11, #2 ++ bx lr ++endfunc ++ ++ ++function \type\()_bilin_8bpc_neon, export=1 ++ push {r4-r11,lr} ++ ldrd r4, r5, [sp, #36] ++ ldrd r6, r7, [sp, #44] ++ vdup.8 d1, \mx ++ vdup.8 d3, \my ++ rsb r8, \mx, #16 ++ rsb r9, \my, #16 ++ vdup.8 d0, r8 ++ vdup.8 d2, r9 ++.ifc \type, prep ++ lsl \d_strd, \w, #1 ++.endif ++ clz r8, \w ++ cmp \mx, #0 ++ sub r8, r8, #24 ++ bne L(\type\()_bilin_h) ++ cmp \my, #0 ++ bne L(\type\()_bilin_v) ++ b \type ++ ++L(\type\()_bilin_h): ++ cmp \my, #0 ++ bne L(\type\()_bilin_hv) ++ ++ adr r9, L(\type\()_bilin_h_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_bilin_h_tbl): ++ .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB ++ ++20: // 2xN h ++.ifc \type, put ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++2: ++ vld1.32 {d4[]}, [\src], \s_strd ++ vld1.32 {d6[]}, [\sr2], \s_strd ++ vext.8 d5, d4, d4, #1 ++ vext.8 d7, d6, d6, #1 ++ vtrn.16 q2, q3 ++ subs \h, \h, #2 ++ vmull.u8 q3, d4, d0 ++ vmlal.u8 q3, d5, d1 ++ vqrshrn.u16 d4, q3, #4 ++ vst1.16 {d4[0]}, [\dst, :16], \d_strd ++ vst1.16 {d4[1]}, [\ds2, :16], \d_strd ++ bgt 2b ++ pop {r4-r11,pc} ++.endif ++ ++40: // 4xN h ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++4: ++ vld1.8 {d4}, [\src], \s_strd ++ vld1.8 {d6}, [\sr2], \s_strd ++ vext.8 d5, d4, d4, #1 ++ vext.8 d7, d6, d6, #1 ++ vtrn.32 q2, q3 ++ subs \h, \h, #2 ++ vmull.u8 q3, d4, d0 ++ vmlal.u8 q3, d5, d1 ++.ifc \type, put ++ vqrshrn.u16 d4, q3, #4 ++ vst1.32 {d4[0]}, [\dst, :32], \d_strd ++ vst1.32 {d4[1]}, [\ds2, :32], \d_strd ++.else ++ vst1.16 {d6}, [\dst, :64], \d_strd ++ vst1.16 {d7}, [\ds2, :64], \d_strd ++.endif ++ bgt 4b ++ pop {r4-r11,pc} ++ ++80: // 8xN h ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \d_strd, \d_strd, #1 ++ lsl \s_strd, \s_strd, #1 ++8: ++ vld1.8 {q8}, [\src], \s_strd ++ 
vld1.8 {q10}, [\sr2], \s_strd ++ vext.8 q9, q8, q8, #1 ++ vext.8 q11, q10, q10, #1 ++ subs \h, \h, #2 ++ vmull.u8 q8, d16, d0 ++ vmull.u8 q10, d20, d0 ++ vmlal.u8 q8, d18, d1 ++ vmlal.u8 q10, d22, d1 ++.ifc \type, put ++ vqrshrn.u16 d16, q8, #4 ++ vqrshrn.u16 d18, q10, #4 ++ vst1.8 {d16}, [\dst, :64], \d_strd ++ vst1.8 {d18}, [\ds2, :64], \d_strd ++.else ++ vst1.16 {q8}, [\dst, :128], \d_strd ++ vst1.16 {q10}, [\ds2, :128], \d_strd ++.endif ++ bgt 8b ++ pop {r4-r11,pc} ++160: ++320: ++640: ++1280: // 16xN, 32xN, ... h ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ ++ sub \s_strd, \s_strd, \w ++ sub \s_strd, \s_strd, #8 ++.ifc \type, put ++ lsl \d_strd, \d_strd, #1 ++ sub \d_strd, \d_strd, \w ++.endif ++161: ++ vld1.8 {d16}, [\src]! ++ vld1.8 {d22}, [\sr2]! ++ mov \mx, \w ++ ++16: ++ vld1.8 {d17,d18}, [\src]! ++ vld1.8 {d23,d24}, [\sr2]! ++ vext.8 q10, q8, q9, #1 ++ vext.8 q13, q11, q12, #1 ++ vmull.u8 q2, d16, d0 ++ vmull.u8 q3, d17, d0 ++ vmull.u8 q14, d22, d0 ++ vmull.u8 q15, d23, d0 ++ vmlal.u8 q2, d20, d1 ++ vmlal.u8 q3, d21, d1 ++ vmlal.u8 q14, d26, d1 ++ vmlal.u8 q15, d27, d1 ++ subs \mx, \mx, #16 ++.ifc \type, put ++ vqrshrn.u16 d4, q2, #4 ++ vqrshrn.u16 d5, q3, #4 ++ vqrshrn.u16 d28, q14, #4 ++ vqrshrn.u16 d29, q15, #4 ++ vst1.8 {q2}, [\dst, :128]! ++ vst1.8 {q14}, [\ds2, :128]! ++.else ++ vst1.16 {q2, q3}, [\dst, :128]! ++ vst1.16 {q14, q15}, [\ds2, :128]! ++.endif ++ ble 9f ++ ++ vmov d16, d18 ++ vmov d22, d24 ++ b 16b ++ ++9: ++ add \dst, \dst, \d_strd ++ add \ds2, \ds2, \d_strd ++ add \src, \src, \s_strd ++ add \sr2, \sr2, \s_strd ++ ++ subs \h, \h, #2 ++ bgt 161b ++ pop {r4-r11,pc} ++ ++L(\type\()_bilin_v): ++ cmp \h, #4 ++ adr r9, L(\type\()_bilin_v_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_bilin_v_tbl): ++ .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB ++ ++20: // 2xN v ++.ifc \type, put ++ cmp \h, #2 ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ // 2x2 v ++ vld1.16 {d16[]}, [\src], \s_strd ++ bgt 24f ++ vld1.16 {d17[]}, [\sr2], \s_strd ++ vld1.16 {d18[]}, [\src], \s_strd ++ vext.8 d16, d16, d17, #6 ++ vext.8 d17, d17, d18, #6 ++ vmull.u8 q2, d16, d2 ++ vmlal.u8 q2, d17, d3 ++ vqrshrn.u16 d4, q2, #4 ++ vst1.16 {d4[0]}, [\dst, :16] ++ vst1.16 {d4[1]}, [\ds2, :16] ++ pop {r4-r11,pc} ++24: // 2x4, 2x8, ... 
v ++ vld1.16 {d17[]}, [\sr2], \s_strd ++ vld1.16 {d18[]}, [\src], \s_strd ++ vld1.16 {d19[]}, [\sr2], \s_strd ++ vld1.16 {d20[]}, [\src], \s_strd ++ vext.8 d16, d16, d17, #6 ++ vext.8 d17, d17, d18, #6 ++ vext.8 d18, d18, d19, #6 ++ vext.8 d19, d19, d20, #6 ++ vtrn.32 d16, d18 ++ vtrn.32 d17, d19 ++ vmull.u8 q2, d16, d2 ++ vmlal.u8 q2, d17, d3 ++ subs \h, \h, #4 ++ vqrshrn.u16 d4, q2, #4 ++ vst1.16 {d4[0]}, [\dst, :16], \d_strd ++ vst1.16 {d4[1]}, [\ds2, :16], \d_strd ++ vst1.16 {d4[2]}, [\dst, :16], \d_strd ++ vst1.16 {d4[3]}, [\ds2, :16], \d_strd ++ ble 0f ++ vmov d16, d20 ++ b 24b ++0: ++ pop {r4-r11,pc} ++.endif ++ ++40: // 4xN v ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vld1.32 {d16[]}, [\src], \s_strd ++4: ++ vld1.32 {d17[]}, [\sr2], \s_strd ++ vld1.32 {d18[]}, [\src], \s_strd ++ vext.8 d16, d16, d17, #4 ++ vext.8 d17, d17, d18, #4 ++ vmull.u8 q2, d16, d2 ++ vmlal.u8 q2, d17, d3 ++ subs \h, \h, #2 ++.ifc \type, put ++ vqrshrn.u16 d4, q2, #4 ++ vst1.32 {d4[0]}, [\dst, :32], \d_strd ++ vst1.32 {d4[1]}, [\ds2, :32], \d_strd ++.else ++ vst1.16 {d4}, [\dst, :64], \d_strd ++ vst1.16 {d5}, [\ds2, :64], \d_strd ++.endif ++ ble 0f ++ vmov d16, d18 ++ b 4b ++0: ++ pop {r4-r11,pc} ++ ++80: // 8xN v ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ vld1.8 {d16}, [\src], \s_strd ++8: ++ vld1.8 {d17}, [\sr2], \s_strd ++ vld1.8 {d18}, [\src], \s_strd ++ vmull.u8 q2, d16, d2 ++ vmull.u8 q3, d17, d2 ++ vmlal.u8 q2, d17, d3 ++ vmlal.u8 q3, d18, d3 ++ subs \h, \h, #2 ++.ifc \type, put ++ vqrshrn.u16 d4, q2, #4 ++ vqrshrn.u16 d6, q3, #4 ++ vst1.8 {d4}, [\dst, :64], \d_strd ++ vst1.8 {d6}, [\ds2, :64], \d_strd ++.else ++ vst1.16 {q2}, [\dst, :128], \d_strd ++ vst1.16 {q3}, [\ds2, :128], \d_strd ++.endif ++ ble 0f ++ vmov d16, d18 ++ b 8b ++0: ++ pop {r4-r11,pc} ++ ++160: // 16xN, 32xN, ... 
++320: ++640: ++1280: ++ mov \my, \h ++1: ++ add \ds2, \dst, \d_strd ++ add \sr2, \src, \s_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ vld1.8 {q8}, [\src], \s_strd ++2: ++ vld1.8 {q9}, [\sr2], \s_strd ++ vld1.8 {q10}, [\src], \s_strd ++ vmull.u8 q12, d16, d2 ++ vmull.u8 q13, d17, d2 ++ vmull.u8 q14, d18, d2 ++ vmull.u8 q15, d19, d2 ++ vmlal.u8 q12, d18, d3 ++ vmlal.u8 q13, d19, d3 ++ vmlal.u8 q14, d20, d3 ++ vmlal.u8 q15, d21, d3 ++ subs \h, \h, #2 ++.ifc \type, put ++ vqrshrn.u16 d24, q12, #4 ++ vqrshrn.u16 d25, q13, #4 ++ vqrshrn.u16 d28, q14, #4 ++ vqrshrn.u16 d29, q15, #4 ++ vst1.8 {q12}, [\dst, :128], \d_strd ++ vst1.8 {q14}, [\ds2, :128], \d_strd ++.else ++ vst1.16 {q12, q13}, [\dst, :128], \d_strd ++ vst1.16 {q14, q15}, [\ds2, :128], \d_strd ++.endif ++ ble 9f ++ vmov q8, q10 ++ b 2b ++9: ++ subs \w, \w, #16 ++ ble 0f ++ asr \s_strd, \s_strd, #1 ++ asr \d_strd, \d_strd, #1 ++ mls \src, \s_strd, \my, \src ++ mls \dst, \d_strd, \my, \dst ++ sub \src, \src, \s_strd, lsl #1 ++ mov \h, \my ++ add \src, \src, #16 ++.ifc \type, put ++ add \dst, \dst, #16 ++.else ++ add \dst, \dst, #32 ++.endif ++ b 1b ++0: ++ pop {r4-r11,pc} ++ ++L(\type\()_bilin_hv): ++ vmovl.u8 q2, d2 ++ vmovl.u8 q3, d3 ++ adr r9, L(\type\()_bilin_hv_tbl) ++ ldr r8, [r9, r8, lsl #2] ++ add r9, r9, r8 ++ bx r9 ++ ++ .align 2 ++L(\type\()_bilin_hv_tbl): ++ .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB ++ ++20: // 2xN hv ++.ifc \type, put ++ add \sr2, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ vld1.32 {d28[]}, [\src], \s_strd ++ vext.8 d29, d28, d28, #1 ++ vmull.u8 q8, d28, d0 ++ vmlal.u8 q8, d29, d1 ++ ++2: ++ vld1.32 {d28[]}, [\sr2], \s_strd ++ vld1.32 {d30[]}, [\src], \s_strd ++ vext.8 d29, d28, d28, #1 ++ vext.8 d31, d30, d30, #1 ++ vtrn.16 d28, d30 ++ vtrn.16 d29, d31 ++ vmull.u8 q9, d28, d0 ++ vmlal.u8 q9, d29, d1 ++ ++ vtrn.32 d16, d18 ++ ++ vmul.u16 d20, d16, d4 ++ vmla.u16 d20, d19, d6 ++ vqrshrn.u16 d20, q10, #8 ++ subs \h, \h, #2 ++ vst1.16 {d20[0]}, [\dst, :16], \d_strd ++ vst1.16 {d20[1]}, [\ds2, :16], \d_strd ++ ble 0f ++ vtrn.32 d19, d16 ++ b 2b ++0: ++ pop {r4-r11,pc} ++.endif ++ ++40: // 4xN hv ++ add \sr2, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ vld1.8 {d28}, [\src], \s_strd ++ vext.8 d29, d28, d28, #1 ++ vmull.u8 q8, d28, d0 ++ vmlal.u8 q8, d29, d1 ++ ++4: ++ vld1.8 {d28}, [\sr2], \s_strd ++ vld1.8 {d30}, [\src], \s_strd ++ vext.8 d29, d28, d28, #1 ++ vext.8 d31, d30, d30, #1 ++ vtrn.32 d28, d30 ++ vtrn.32 d29, d31 ++ vmull.u8 q9, d28, d0 ++ vmlal.u8 q9, d29, d1 ++ ++ vmov d17, d18 ++ ++ vmul.u16 q10, q8, q2 ++ vmla.u16 q10, q9, q3 ++ subs \h, \h, #2 ++.ifc \type, put ++ vqrshrn.u16 d20, q10, #8 ++ vst1.32 {d20[0]}, [\dst, :32], \d_strd ++ vst1.32 {d20[1]}, [\ds2, :32], \d_strd ++.else ++ vrshr.u16 q10, q10, #4 ++ vst1.16 {d20}, [\dst, :64], \d_strd ++ vst1.16 {d21}, [\ds2, :64], \d_strd ++.endif ++ ble 0f ++ vmov d16, d19 ++ b 4b ++0: ++ pop {r4-r11,pc} ++ ++80: // 8xN, 16xN, ... 
hv ++160: ++320: ++640: ++1280: ++ mov \my, \h ++ ++1: ++ add \sr2, \src, \s_strd ++ add \ds2, \dst, \d_strd ++ lsl \s_strd, \s_strd, #1 ++ lsl \d_strd, \d_strd, #1 ++ ++ vld1.8 {q12}, [\src], \s_strd ++ vext.8 q13, q12, q12, #1 ++ vmull.u8 q8, d24, d0 ++ vmlal.u8 q8, d26, d1 ++ ++2: ++ vld1.8 {q12}, [\sr2], \s_strd ++ vld1.8 {q14}, [\src], \s_strd ++ vext.8 q13, q12, q12, #1 ++ vext.8 q15, q14, q14, #1 ++ vmull.u8 q9, d24, d0 ++ vmlal.u8 q9, d26, d1 ++ vmull.u8 q10, d28, d0 ++ vmlal.u8 q10, d30, d1 ++ ++ vmul.u16 q8, q8, q2 ++ vmla.u16 q8, q9, q3 ++ vmul.u16 q9, q9, q2 ++ vmla.u16 q9, q10, q3 ++ subs \h, \h, #2 ++.ifc \type, put ++ vqrshrn.u16 d16, q8, #8 ++ vqrshrn.u16 d18, q9, #8 ++ vst1.8 {d16}, [\dst, :64], \d_strd ++ vst1.8 {d18}, [\ds2, :64], \d_strd ++.else ++ vrshr.u16 q8, q8, #4 ++ vrshr.u16 q9, q9, #4 ++ vst1.16 {q8}, [\dst, :128], \d_strd ++ vst1.16 {q9}, [\ds2, :128], \d_strd ++.endif ++ ble 9f ++ vmov q8, q10 ++ b 2b ++9: ++ subs \w, \w, #8 ++ ble 0f ++ asr \s_strd, \s_strd, #1 ++ asr \d_strd, \d_strd, #1 ++ mls \src, \s_strd, \my, \src ++ mls \dst, \d_strd, \my, \dst ++ sub \src, \src, \s_strd, lsl #1 ++ mov \h, \my ++ add \src, \src, #8 ++.ifc \type, put ++ add \dst, \dst, #8 ++.else ++ add \dst, \dst, #16 ++.endif ++ b 1b ++0: ++ pop {r4-r11,pc} ++endfunc ++.endm ++ ++filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10 ++filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 +diff --git third_party/dav1d/src/arm/32/util.S third_party/dav1d/src/arm/32/util.S +index 7f4405b0d513..02d763cc5c3e 100644 +--- third_party/dav1d/src/arm/32/util.S ++++ third_party/dav1d/src/arm/32/util.S +@@ -26,25 +26,41 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +-#ifndef __DAVID_SRC_ARM_32_UTIL_S__ +-#define __DAVID_SRC_ARM_32_UTIL_S__ ++#ifndef DAV1D_SRC_ARM_32_UTIL_S ++#define DAV1D_SRC_ARM_32_UTIL_S + + #include "config.h" + #include "src/arm/asm.S" + +-.macro movrel rd, val +-#if defined(PIC) ++.macro movrel rd, val, offset=0 ++#if defined(PIC) && defined(__APPLE__) + ldr \rd, 1f + b 2f + 1: +-@ FIXME: thumb +- .word \val - (2f + 8) ++ .word 3f - (2f + 8 - 4 * CONFIG_THUMB) ++2: ++ ldr \rd, [pc, \rd] ++.if \offset < 0 ++ sub \rd, \rd, #-(\offset) ++.elseif \offset > 0 ++ add \rd, \rd, #\offset ++.endif ++ .non_lazy_symbol_pointer ++3: ++ .indirect_symbol \val ++ .word 0 ++ .text ++#elif defined(PIC) ++ ldr \rd, 1f ++ b 2f ++1: ++ .word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB) + 2: + add \rd, \rd, pc + #else +- movw \rd, #:lower16:\val +- movt \rd, #:upper16:\val ++ movw \rd, #:lower16:\val+\offset ++ movt \rd, #:upper16:\val+\offset + #endif + .endm + +-#endif /* __DAVID_SRC_ARM_32_UTIL_S__ */ ++#endif /* DAV1D_SRC_ARM_32_UTIL_S */ +diff --git third_party/dav1d/src/arm/64/cdef.S third_party/dav1d/src/arm/64/cdef.S +new file mode 100644 +index 000000000000..333bddef6059 +--- /dev/null ++++ third_party/dav1d/src/arm/64/cdef.S +@@ -0,0 +1,603 @@ ++/* ++ * Copyright © 2018, VideoLAN and dav1d authors ++ * Copyright © 2019, Martin Storsjo ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. 
Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "src/arm/asm.S" ++#include "util.S" ++ ++.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret ++ tst w6, #1 // CDEF_HAVE_LEFT ++ b.eq 2f ++ // CDEF_HAVE_LEFT ++ sub \s1, \s1, #2 ++ sub \s2, \s2, #2 ++ tst w6, #2 // CDEF_HAVE_RIGHT ++ b.eq 1f ++ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ++ ldr \rn\()0, [\s1] ++ ldr s1, [\s1, #\w] ++ ldr \rn\()2, [\s2] ++ ldr s3, [\s2, #\w] ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ uxtl v2.8h, v2.8b ++ uxtl v3.8h, v3.8b ++ str \rw\()0, [x0] ++ str d1, [x0, #2*\w] ++ add x0, x0, #2*\stride ++ str \rw\()2, [x0] ++ str d3, [x0, #2*\w] ++.if \ret ++ ret ++.else ++ add x0, x0, #2*\stride ++ b 3f ++.endif ++ ++1: ++ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ++ ldr \rn\()0, [\s1] ++ ldr h1, [\s1, #\w] ++ ldr \rn\()2, [\s2] ++ ldr h3, [\s2, #\w] ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ uxtl v2.8h, v2.8b ++ uxtl v3.8h, v3.8b ++ str \rw\()0, [x0] ++ str s1, [x0, #2*\w] ++ str s31, [x0, #2*\w+4] ++ add x0, x0, #2*\stride ++ str \rw\()2, [x0] ++ str s3, [x0, #2*\w] ++ str s31, [x0, #2*\w+4] ++.if \ret ++ ret ++.else ++ add x0, x0, #2*\stride ++ b 3f ++.endif ++ ++2: ++ // !CDEF_HAVE_LEFT ++ tst w6, #2 // CDEF_HAVE_RIGHT ++ b.eq 1f ++ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ++ ldr \rn\()0, [\s1] ++ ldr h1, [\s1, #\w] ++ ldr \rn\()2, [\s2] ++ ldr h3, [\s2, #\w] ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ uxtl v2.8h, v2.8b ++ uxtl v3.8h, v3.8b ++ str s31, [x0] ++ stur \rw\()0, [x0, #4] ++ str s1, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ str s31, [x0] ++ stur \rw\()2, [x0, #4] ++ str s3, [x0, #4+2*\w] ++.if \ret ++ ret ++.else ++ add x0, x0, #2*\stride ++ b 3f ++.endif ++ ++1: ++ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ++ ldr \rn\()0, [\s1] ++ ldr \rn\()1, [\s2] ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ str s31, [x0] ++ stur \rw\()0, [x0, #4] ++ str s31, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ str s31, [x0] ++ stur \rw\()1, [x0, #4] ++ str s31, [x0, #4+2*\w] ++.if \ret ++ ret ++.else ++ add x0, x0, #2*\stride ++.endif ++3: ++.endm ++ ++// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src, ++// ptrdiff_t src_stride, const pixel (*left)[2], ++// /*const*/ pixel *const top[2], int h, ++// enum CdefEdgeFlags edges); ++ ++.macro padding_func w, stride, rn, rw ++function cdef_padding\w\()_neon, export=1 ++ movi v30.8h, #0x80, lsl #8 ++ mov v31.16b, v30.16b ++ sub x0, x0, #2*(2*\stride+2) ++ tst w6, #4 // CDEF_HAVE_TOP ++ b.ne 1f ++ // !CDEF_HAVE_TOP ++ st1 {v30.8h, v31.8h}, [x0], #32 ++.if \w == 8 ++ st1 {v30.8h, v31.8h}, [x0], #32 ++.endif ++ b 3f ++1: 
++ // CDEF_HAVE_TOP ++ ldr x8, [x4] ++ ldr x9, [x4, #8] ++ pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0 ++ ++ // Middle section ++3: ++ tst w6, #1 // CDEF_HAVE_LEFT ++ b.eq 2f ++ // CDEF_HAVE_LEFT ++ tst w6, #2 // CDEF_HAVE_RIGHT ++ b.eq 1f ++ // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ++0: ++ ld1 {v0.h}[0], [x3], #2 ++ ldr \rn\()1, [x1] ++ ldr h2, [x1, #\w] ++ add x1, x1, x2 ++ subs w5, w5, #1 ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ uxtl v2.8h, v2.8b ++ str s0, [x0] ++ stur \rw\()1, [x0, #4] ++ str s2, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ b.gt 0b ++ b 3f ++1: ++ // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ++ ld1 {v0.h}[0], [x3], #2 ++.if \w == 8 ++ ld1 {v1.8b}, [x1], x2 ++.else ++ ld1 {v1.s}[0], [x1], x2 ++.endif ++ subs w5, w5, #1 ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ str s0, [x0] ++ stur \rw\()1, [x0, #4] ++ str s31, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ b.gt 1b ++ b 3f ++2: ++ tst w6, #2 // CDEF_HAVE_RIGHT ++ b.eq 1f ++ // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ++0: ++ ldr \rn\()0, [x1] ++ ldr h1, [x1, #\w] ++ add x1, x1, x2 ++ subs w5, w5, #1 ++ uxtl v0.8h, v0.8b ++ uxtl v1.8h, v1.8b ++ str s31, [x0] ++ stur \rw\()0, [x0, #4] ++ str s1, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ b.gt 0b ++ b 3f ++1: ++ // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ++.if \w == 8 ++ ld1 {v0.8b}, [x1], x2 ++.else ++ ld1 {v0.s}[0], [x1], x2 ++.endif ++ subs w5, w5, #1 ++ uxtl v0.8h, v0.8b ++ str s31, [x0] ++ stur \rw\()0, [x0, #4] ++ str s31, [x0, #4+2*\w] ++ add x0, x0, #2*\stride ++ b.gt 1b ++ ++3: ++ tst w6, #8 // CDEF_HAVE_BOTTOM ++ b.ne 1f ++ // !CDEF_HAVE_BOTTOM ++ st1 {v30.8h, v31.8h}, [x0], #32 ++.if \w == 8 ++ st1 {v30.8h, v31.8h}, [x0], #32 ++.endif ++ ret ++1: ++ // CDEF_HAVE_BOTTOM ++ add x9, x1, x2 ++ pad_top_bottom x1, x9, \w, \stride, \rn, \rw, 1 ++endfunc ++.endm ++ ++padding_func 8, 16, d, q ++padding_func 4, 8, s, d ++ ++.macro dir_table w, stride ++const directions\w ++ .byte -1 * \stride + 1, -2 * \stride + 2 ++ .byte 0 * \stride + 1, -1 * \stride + 2 ++ .byte 0 * \stride + 1, 0 * \stride + 2 ++ .byte 0 * \stride + 1, 1 * \stride + 2 ++ .byte 1 * \stride + 1, 2 * \stride + 2 ++ .byte 1 * \stride + 0, 2 * \stride + 1 ++ .byte 1 * \stride + 0, 2 * \stride + 0 ++ .byte 1 * \stride + 0, 2 * \stride - 1 ++// Repeated, to avoid & 7 ++ .byte -1 * \stride + 1, -2 * \stride + 2 ++ .byte 0 * \stride + 1, -1 * \stride + 2 ++ .byte 0 * \stride + 1, 0 * \stride + 2 ++ .byte 0 * \stride + 1, 1 * \stride + 2 ++ .byte 1 * \stride + 1, 2 * \stride + 2 ++ .byte 1 * \stride + 0, 2 * \stride + 1 ++endconst ++.endm ++ ++dir_table 8, 16 ++dir_table 4, 8 ++ ++const pri_taps ++ .byte 4, 2, 3, 3 ++endconst ++ ++.macro load_px d1, d2, w ++.if \w == 8 ++ add x6, x2, w9, sxtb #1 // x + off ++ sub x9, x2, w9, sxtb #1 // x - off ++ ld1 {\d1\().8h}, [x6] // p0 ++ ld1 {\d2\().8h}, [x9] // p1 ++.else ++ add x6, x2, w9, sxtb #1 // x + off ++ sub x9, x2, w9, sxtb #1 // x - off ++ ld1 {\d1\().4h}, [x6] // p0 ++ add x6, x6, #2*8 // += stride ++ ld1 {\d2\().4h}, [x9] // p1 ++ add x9, x9, #2*8 // += stride ++ ld1 {\d1\().d}[1], [x6] // p0 ++ ld1 {\d2\().d}[1], [x9] // p1 ++.endif ++.endm ++.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap ++ umin v2.8h, v2.8h, \s1\().8h ++ smax v3.8h, v3.8h, \s1\().8h ++ umin v2.8h, v2.8h, \s2\().8h ++ smax v3.8h, v3.8h, \s2\().8h ++ ++ cbz \threshold, 3f ++ uabd v16.8h, v0.8h, \s1\().8h // abs(diff) ++ uabd v20.8h, v0.8h, \s2\().8h // abs(diff) ++ ushl v17.8h, v16.8h, \shift // abs(diff) >> shift ++ ushl v21.8h, v20.8h, \shift // abs(diff) >> shift ++ uqsub v17.8h, \thresh_vec, v17.8h 
// imax(0, threshold - (abs(diff) >> shift)) ++ uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift)) ++ cmhi v18.8h, v0.8h, \s1\().8h // px > p0 ++ cmhi v22.8h, v0.8h, \s2\().8h // px > p1 ++ umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax()) ++ umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax()) ++ dup v19.8h, \tap // taps[k] ++ neg v16.8h, v17.8h // -imin() ++ neg v20.8h, v21.8h // -imin() ++ bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() ++ bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() ++ mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain() ++ mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain() ++3: ++.endm ++ ++// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride, ++// const uint16_t *tmp, int pri_strength, ++// int sec_strength, int dir, int damping, int h); ++.macro filter w ++function cdef_filter\w\()_neon, export=1 ++ movrel x8, pri_taps ++ and w9, w3, #1 ++ add x8, x8, w9, uxtw #1 ++ movrel x9, directions\w ++ add x5, x9, w5, uxtw #1 ++ movi v30.8h, #15 ++ dup v28.8h, w6 // damping ++ ++ dup v25.8h, w3 // threshold ++ dup v27.8h, w4 // threshold ++ clz v24.8h, v25.8h // clz(threshold) ++ clz v26.8h, v27.8h // clz(threshold) ++ sub v24.8h, v30.8h, v24.8h // ulog2(threshold) ++ sub v26.8h, v30.8h, v26.8h // ulog2(threshold) ++ uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold)) ++ uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold)) ++ neg v24.8h, v24.8h // -shift ++ neg v26.8h, v26.8h // -shift ++ ++1: ++.if \w == 8 ++ ld1 {v0.8h}, [x2] // px ++.else ++ add x12, x2, #2*8 ++ ld1 {v0.4h}, [x2] // px ++ ld1 {v0.d}[1], [x12] // px ++.endif ++ ++ movi v1.8h, #0 // sum ++ mov v2.16b, v0.16b // min ++ mov v3.16b, v0.16b // max ++ ++ // Instead of loading sec_taps 2, 1 from memory, just set it ++ // to 2 initially and decrease for the second round. ++ mov w11, #2 // sec_taps[0] ++ ++2: ++ ldrb w9, [x5] // off1 ++ ++ load_px v4, v5, \w ++ ++ add x5, x5, #4 // +2*2 ++ ldrb w9, [x5] // off2 ++ load_px v6, v7, \w ++ ++ ldrb w10, [x8] // *pri_taps ++ ++ handle_pixel v4, v5, w3, v25.8h, v24.8h, w10 ++ ++ add x5, x5, #8 // +2*4 ++ ldrb w9, [x5] // off3 ++ load_px v4, v5, \w ++ ++ handle_pixel v6, v7, w4, v27.8h, v26.8h, w11 ++ ++ handle_pixel v4, v5, w4, v27.8h, v26.8h, w11 ++ ++ sub x5, x5, #11 // x8 -= 2*(2+4); x8 += 1; ++ subs w11, w11, #1 // sec_tap-- (value) ++ add x8, x8, #1 // pri_taps++ (pointer) ++ b.ne 2b ++ ++ sshr v4.8h, v1.8h, #15 // -(sum < 0) ++ add v1.8h, v1.8h, v4.8h // sum - (sum < 0) ++ srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 ++ add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) 
>> 4 ++ smin v0.8h, v0.8h, v3.8h ++ smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max) ++ xtn v0.8b, v0.8h ++.if \w == 8 ++ add x2, x2, #2*16 // tmp += tmp_stride ++ subs w7, w7, #1 // h-- ++ st1 {v0.8b}, [x0], x1 ++.else ++ st1 {v0.s}[0], [x0], x1 ++ add x2, x2, #2*16 // tmp += 2*tmp_stride ++ subs w7, w7, #2 // h -= 2 ++ st1 {v0.s}[1], [x0], x1 ++.endif ++ ++ // Reset pri_taps/sec_taps back to the original point ++ sub x5, x5, #2 ++ sub x8, x8, #2 ++ ++ b.gt 1b ++ ret ++endfunc ++.endm ++ ++filter 8 ++filter 4 ++ ++const div_table ++ .short 840, 420, 280, 210, 168, 140, 120, 105 ++endconst ++ ++const alt_fact ++ .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 ++endconst ++ ++// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride, ++// unsigned *const var) ++function cdef_find_dir_neon, export=1 ++ sub sp, sp, #32 // cost ++ mov w3, #8 ++ movi v31.16b, #128 ++ movi v30.16b, #0 ++ movi v1.8h, #0 // v0-v1 sum_diag[0] ++ movi v3.8h, #0 // v2-v3 sum_diag[1] ++ movi v5.8h, #0 // v4-v5 sum_hv[0-1] ++ movi v7.8h, #0 // v6-v7 sum_alt[0] ++ movi v17.8h, #0 // v16-v17 sum_alt[1] ++ movi v18.8h, #0 // v18-v19 sum_alt[2] ++ movi v19.8h, #0 ++ movi v21.8h, #0 // v20-v21 sum_alt[3] ++ ++.irpc i, 01234567 ++ ld1 {v26.8b}, [x0], x1 ++ usubl v26.8h, v26.8b, v31.8b ++ ++ addv h25, v26.8h // [y] ++ rev64 v27.8h, v26.8h ++ addp v28.8h, v26.8h, v30.8h // [(x >> 1)] ++ add v5.8h, v5.8h, v26.8h // sum_hv[1] ++ ext v27.16b, v27.16b, v27.16b, #8 // [-x] ++ rev64 v29.4h, v28.4h // [-(x >> 1)] ++ ins v4.h[\i], v25.h[0] // sum_hv[0] ++ ++.if \i == 0 ++ mov v0.16b, v26.16b // sum_diag[0] ++ mov v2.16b, v27.16b // sum_diag[1] ++ mov v6.16b, v28.16b // sum_alt[0] ++ mov v16.16b, v29.16b // sum_alt[1] ++.else ++ ext v22.16b, v30.16b, v26.16b, #(16-2*\i) ++ ext v23.16b, v26.16b, v30.16b, #(16-2*\i) ++ ext v24.16b, v30.16b, v27.16b, #(16-2*\i) ++ ext v25.16b, v27.16b, v30.16b, #(16-2*\i) ++ add v0.8h, v0.8h, v22.8h // sum_diag[0] ++ add v1.8h, v1.8h, v23.8h // sum_diag[0] ++ add v2.8h, v2.8h, v24.8h // sum_diag[1] ++ add v3.8h, v3.8h, v25.8h // sum_diag[1] ++ ext v22.16b, v30.16b, v28.16b, #(16-2*\i) ++ ext v23.16b, v28.16b, v30.16b, #(16-2*\i) ++ ext v24.16b, v30.16b, v29.16b, #(16-2*\i) ++ ext v25.16b, v29.16b, v30.16b, #(16-2*\i) ++ add v6.8h, v6.8h, v22.8h // sum_alt[0] ++ add v7.8h, v7.8h, v23.8h // sum_alt[0] ++ add v16.8h, v16.8h, v24.8h // sum_alt[1] ++ add v17.8h, v17.8h, v25.8h // sum_alt[1] ++.endif ++.if \i < 6 ++ ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) ++ ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) ++ add v18.8h, v18.8h, v22.8h // sum_alt[2] ++ add v19.8h, v19.8h, v23.8h // sum_alt[2] ++.else ++ add v18.8h, v18.8h, v26.8h // sum_alt[2] ++.endif ++.if \i == 0 ++ mov v20.16b, v26.16b // sum_alt[3] ++.elseif \i == 1 ++ add v20.8h, v20.8h, v26.8h // sum_alt[3] ++.else ++ ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) ++ ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) ++ add v20.8h, v20.8h, v24.8h // sum_alt[3] ++ add v21.8h, v21.8h, v25.8h // sum_alt[3] ++.endif ++.endr ++ ++ movi v31.4s, #105 ++ ++ smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0] ++ smlal2 v26.4s, v4.8h, v4.8h ++ smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1] ++ smlal2 v27.4s, v5.8h, v5.8h ++ mul v26.4s, v26.4s, v31.4s // cost[2] *= 105 ++ mul v27.4s, v27.4s, v31.4s // cost[6] *= 105 ++ addv s4, v26.4s // cost[2] ++ addv s5, v27.4s // cost[6] ++ ++ rev64 v1.8h, v1.8h ++ rev64 v3.8h, v3.8h ++ ext v1.16b, v1.16b, v1.16b, #8 // sum_diag[0][15-n] ++ ext v3.16b, v3.16b, v3.16b, #8 // 
sum_diag[1][15-n] ++ ext v1.16b, v1.16b, v1.16b, #2 // sum_diag[0][14-n] ++ ext v3.16b, v3.16b, v3.16b, #2 // sum_diag[1][14-n] ++ ++ str s4, [sp, #2*4] // cost[2] ++ str s5, [sp, #6*4] // cost[6] ++ ++ movrel x4, div_table ++ ld1 {v31.8h}, [x4] ++ ++ smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0] ++ smull2 v23.4s, v0.8h, v0.8h ++ smlal v22.4s, v1.4h, v1.4h ++ smlal2 v23.4s, v1.8h, v1.8h ++ smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1] ++ smull2 v25.4s, v2.8h, v2.8h ++ smlal v24.4s, v3.4h, v3.4h ++ smlal2 v25.4s, v3.8h, v3.8h ++ uxtl v30.4s, v31.4h // div_table ++ uxtl2 v31.4s, v31.8h ++ mul v22.4s, v22.4s, v30.4s // cost[0] ++ mla v22.4s, v23.4s, v31.4s // cost[0] ++ mul v24.4s, v24.4s, v30.4s // cost[4] ++ mla v24.4s, v25.4s, v31.4s // cost[4] ++ addv s0, v22.4s // cost[0] ++ addv s2, v24.4s // cost[4] ++ ++ movrel x5, alt_fact ++ ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105 ++ ++ str s0, [sp, #0*4] // cost[0] ++ str s2, [sp, #4*4] // cost[4] ++ ++ uxtl v29.4s, v29.4h // div_table[2*m+1] + 105 ++ uxtl v30.4s, v30.4h ++ uxtl v31.4s, v31.4h ++ ++.macro cost_alt d1, d2, s1, s2, s3, s4 ++ smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n] ++ smull2 v23.4s, \s1\().8h, \s1\().8h ++ smull v24.4s, \s2\().4h, \s2\().4h ++ smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n] ++ smull2 v26.4s, \s3\().8h, \s3\().8h ++ smull v27.4s, \s4\().4h, \s4\().4h ++ mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact ++ mla v22.4s, v23.4s, v30.4s ++ mla v22.4s, v24.4s, v31.4s ++ mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact ++ mla v25.4s, v26.4s, v30.4s ++ mla v25.4s, v27.4s, v31.4s ++ addv \d1, v22.4s // *cost_ptr ++ addv \d2, v25.4s // *cost_ptr ++.endm ++ cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3] ++ str s6, [sp, #1*4] // cost[1] ++ str s16, [sp, #3*4] // cost[3] ++ cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7] ++ str s18, [sp, #5*4] // cost[5] ++ str s20, [sp, #7*4] // cost[7] ++ ++ mov w0, #0 // best_dir ++ mov w1, v0.s[0] // best_cost ++ mov w3, #1 // n ++ ++ mov w4, v6.s[0] ++ ++.macro find_best s1, s2, s3 ++.ifnb \s2 ++ mov w5, \s2\().s[0] ++.endif ++ cmp w4, w1 // cost[n] > best_cost ++ csel w0, w3, w0, gt // best_dir = n ++ csel w1, w4, w1, gt // best_cost = cost[n] ++.ifnb \s2 ++ add w3, w3, #1 // n++ ++ cmp w5, w1 // cost[n] > best_cost ++ mov w4, \s3\().s[0] ++ csel w0, w3, w0, gt // best_dir = n ++ csel w1, w5, w1, gt // best_cost = cost[n] ++ add w3, w3, #1 // n++ ++.endif ++.endm ++ find_best v6, v4, v16 ++ find_best v16, v2, v18 ++ find_best v18, v5, v20 ++ find_best v20 ++ ++ eor w3, w0, #4 // best_dir ^4 ++ ldr w4, [sp, w3, uxtw #2] ++ sub w1, w1, w4 // best_cost - cost[best_dir ^ 4] ++ lsr w1, w1, #10 ++ str w1, [x2] // *var ++ ++ add sp, sp, #32 ++ ret ++endfunc +diff --git third_party/dav1d/src/arm/64/looprestoration.S third_party/dav1d/src/arm/64/looprestoration.S +index 6badfe0018f2..7fc34d98874c 100644 +--- third_party/dav1d/src/arm/64/looprestoration.S ++++ third_party/dav1d/src/arm/64/looprestoration.S +@@ -52,7 +52,7 @@ function wiener_filter_h_neon, export=1 + add x13, x2, x3 + lsl x3, x3, #1 + +- // Subtract the width from mid_strid3 ++ // Subtract the width from mid_stride + sub x10, x10, w5, uxtw #1 + + // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. +@@ -224,31 +224,25 @@ function wiener_filter_h_neon, export=1 + mov v3.16b, v28.16b + mov v5.16b, v29.16b + br x11 ++44: // 4 pixels valid in v2/v4, fill the high half with padding. 
++ ins v2.d[1], v3.d[0] ++ ins v4.d[1], v5.d[0] ++ b 88f + // Shift v2 right, shifting out invalid pixels, + // shift v2 left to the original offset, shifting in padding pixels. +-44: // 4 pixels valid +- ext v2.16b, v2.16b, v2.16b, #8 +- ext v2.16b, v2.16b, v3.16b, #8 +- ext v4.16b, v4.16b, v4.16b, #8 +- ext v4.16b, v4.16b, v5.16b, #8 +- b 88f + 55: // 5 pixels valid + ext v2.16b, v2.16b, v2.16b, #10 + ext v2.16b, v2.16b, v3.16b, #6 + ext v4.16b, v4.16b, v4.16b, #10 + ext v4.16b, v4.16b, v5.16b, #6 + b 88f +-66: // 6 pixels valid +- ext v2.16b, v2.16b, v2.16b, #12 +- ext v2.16b, v2.16b, v3.16b, #4 +- ext v4.16b, v4.16b, v4.16b, #12 +- ext v4.16b, v4.16b, v5.16b, #4 ++66: // 6 pixels valid, fill the upper 2 pixels with padding. ++ ins v2.s[3], v3.s[0] ++ ins v4.s[3], v5.s[0] + b 88f +-77: // 7 pixels valid +- ext v2.16b, v2.16b, v2.16b, #14 +- ext v2.16b, v2.16b, v3.16b, #2 +- ext v4.16b, v4.16b, v4.16b, #14 +- ext v4.16b, v4.16b, v5.16b, #2 ++77: // 7 pixels valid, fill the last pixel with padding. ++ ins v2.h[7], v3.h[0] ++ ins v4.h[7], v5.h[0] + b 88f + + L(variable_shift_tbl): +@@ -282,19 +276,15 @@ L(variable_shift_tbl): + addv h6, v6.8h + addv h7, v7.8h + dup v16.4h, v2.h[3] +- dup v17.4h, v4.h[3] ++ ins v16.h[1], v4.h[3] ++ ins v6.h[1], v7.h[0] + shl v16.4h, v16.4h, #7 +- shl v17.4h, v17.4h, #7 + sub v16.4h, v16.4h, v30.4h +- sub v17.4h, v17.4h, v30.4h + sqadd v6.4h, v6.4h, v16.4h +- sqadd v7.4h, v7.4h, v17.4h + sshr v6.4h, v6.4h, #3 +- sshr v7.4h, v7.4h, #3 + add v6.4h, v6.4h, v31.4h +- add v7.4h, v7.4h, v31.4h + st1 {v6.h}[0], [x0], #2 +- st1 {v7.h}[0], [x12], #2 ++ st1 {v6.h}[1], [x12], #2 + subs w5, w5, #1 + ext v2.16b, v2.16b, v3.16b, #2 + ext v4.16b, v4.16b, v5.16b, #2 +@@ -322,8 +312,7 @@ endfunc + function wiener_filter_v_neon, export=1 + mov w8, w4 + ld1 {v0.8h}, [x5] +- mov w9, #128 +- dup v1.8h, w9 ++ movi v1.8h, #128 + add v1.8h, v1.8h, v0.8h + + // Calculate the number of rows to move back when looping vertically +@@ -437,7 +426,7 @@ function wiener_filter_v_neon, export=1 + mov v21.16b, v19.16b + mov v22.16b, v19.16b + b 8f +-62: // 2 rows in total, v19 already loaded, load v20 and pad that into v20-v23. ++62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. 
+ ld1 {v20.8h}, [x2], x7 + mov v21.16b, v20.16b + mov v22.16b, v20.16b +@@ -503,9 +492,8 @@ function copy_narrow_neon, export=1 + add x7, x0, x1 + lsl x1, x1, #1 + 18: +- cmp w4, #8 +- b.lt 110f + subs w4, w4, #8 ++ b.lt 110f + ld1 {v0.8b}, [x2], #8 + st1 {v0.b}[0], [x0], x1 + st1 {v0.b}[1], [x7], x1 +@@ -518,12 +506,13 @@ function copy_narrow_neon, export=1 + b.le 0f + b 18b + 110: ++ add w4, w4, #8 + asr x1, x1, #1 + 11: + subs w4, w4, #1 + ld1 {v0.b}[0], [x2], #1 + st1 {v0.b}[0], [x0], x1 +- b.ge 11b ++ b.gt 11b + 0: + ret + +@@ -531,9 +520,8 @@ function copy_narrow_neon, export=1 + add x7, x0, x1 + lsl x1, x1, #1 + 24: +- cmp w4, #4 +- b.lt 210f + subs w4, w4, #4 ++ b.lt 210f + ld1 {v0.4h}, [x2], #8 + st1 {v0.h}[0], [x0], x1 + st1 {v0.h}[1], [x7], x1 +@@ -542,12 +530,13 @@ function copy_narrow_neon, export=1 + b.le 0f + b 24b + 210: ++ add w4, w4, #4 + asr x1, x1, #1 + 22: + subs w4, w4, #1 + ld1 {v0.h}[0], [x2], #2 + st1 {v0.h}[0], [x0], x1 +- b.ge 22b ++ b.gt 22b + 0: + ret + +@@ -566,9 +555,8 @@ function copy_narrow_neon, export=1 + add x7, x0, x1 + lsl x1, x1, #1 + 42: +- cmp w4, #2 +- b.lt 41f + subs w4, w4, #2 ++ b.lt 41f + ld1 {v0.2s}, [x2], #8 + st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[1], [x7], x1 +diff --git third_party/dav1d/src/arm/64/mc.S third_party/dav1d/src/arm/64/mc.S +index ecdbbcce2ec5..839b189919b0 100644 +--- third_party/dav1d/src/arm/64/mc.S ++++ third_party/dav1d/src/arm/64/mc.S +@@ -546,58 +546,61 @@ endfunc + mla \d\wd, \s2\wd, v0.h[2] + mla \d\wd, \s3\wd, v0.h[3] + .endm ++// Interleaving the mul/mla chains actually hurts performance ++// significantly on Cortex A53, thus keeping mul/mla tightly ++// chained like this. + .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 + mul \d0\().8h, \s0\().8h, v0.h[0] +- mul \d1\().8h, \s1\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] +- mla \d1\().8h, \s2\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] +- mla \d1\().8h, \s3\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] +- mla \d1\().8h, \s4\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] +- mla \d1\().8h, \s5\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] +- mla \d1\().8h, \s6\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +- mla \d1\().8h, \s7\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] ++ mul \d1\().8h, \s1\().8h, v0.h[0] ++ mla \d1\().8h, \s2\().8h, v0.h[1] ++ mla \d1\().8h, \s3\().8h, v0.h[2] ++ mla \d1\().8h, \s4\().8h, v0.h[3] ++ mla \d1\().8h, \s5\().8h, v0.h[4] ++ mla \d1\().8h, \s6\().8h, v0.h[5] ++ mla \d1\().8h, \s7\().8h, v0.h[6] + mla \d1\().8h, \s8\().8h, v0.h[7] + .endm + .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 + mul \d0\().8h, \s0\().8h, v0.h[0] +- mul \d1\().8h, \s2\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] +- mla \d1\().8h, \s3\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] +- mla \d1\().8h, \s4\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] +- mla \d1\().8h, \s5\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] +- mla \d1\().8h, \s6\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] +- mla \d1\().8h, \s7\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +- mla \d1\().8h, \s8\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] ++ mul \d1\().8h, \s2\().8h, v0.h[0] ++ mla \d1\().8h, \s3\().8h, v0.h[1] ++ mla \d1\().8h, \s4\().8h, v0.h[2] ++ mla \d1\().8h, \s5\().8h, v0.h[3] ++ mla \d1\().8h, \s6\().8h, v0.h[4] ++ mla \d1\().8h, \s7\().8h, v0.h[5] ++ mla \d1\().8h, \s8\().8h, v0.h[6] + mla \d1\().8h, \s9\().8h, v0.h[7] + .endm + .macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, 
s5, s6, s7, s8, s9, s10, s11 + mul \d0\().8h, \s0\().8h, v0.h[0] +- mul \d1\().8h, \s4\().8h, v0.h[0] + mla \d0\().8h, \s1\().8h, v0.h[1] +- mla \d1\().8h, \s5\().8h, v0.h[1] + mla \d0\().8h, \s2\().8h, v0.h[2] +- mla \d1\().8h, \s6\().8h, v0.h[2] + mla \d0\().8h, \s3\().8h, v0.h[3] +- mla \d1\().8h, \s7\().8h, v0.h[3] + mla \d0\().8h, \s4\().8h, v0.h[4] +- mla \d1\().8h, \s8\().8h, v0.h[4] + mla \d0\().8h, \s5\().8h, v0.h[5] +- mla \d1\().8h, \s9\().8h, v0.h[5] + mla \d0\().8h, \s6\().8h, v0.h[6] +- mla \d1\().8h, \s10\().8h, v0.h[6] + mla \d0\().8h, \s7\().8h, v0.h[7] ++ mul \d1\().8h, \s4\().8h, v0.h[0] ++ mla \d1\().8h, \s5\().8h, v0.h[1] ++ mla \d1\().8h, \s6\().8h, v0.h[2] ++ mla \d1\().8h, \s7\().8h, v0.h[3] ++ mla \d1\().8h, \s8\().8h, v0.h[4] ++ mla \d1\().8h, \s9\().8h, v0.h[5] ++ mla \d1\().8h, \s10\().8h, v0.h[6] + mla \d1\().8h, \s11\().8h, v0.h[7] + .endm + .macro sqrshrun_b shift, r0, r1, r2, r3 +@@ -628,7 +631,7 @@ endfunc + st1 {\reg\().h}[3], [x8], \strd + .endif + .endm +-.macro st_s strd, r0, r1, r2, r3 ++.macro st_s strd, r0, r1 + st1 {\r0\().s}[0], [x0], \strd + st1 {\r0\().s}[1], [x8], \strd + .ifnb \r1 +@@ -636,7 +639,7 @@ endfunc + st1 {\r1\().s}[1], [x8], \strd + .endif + .endm +-.macro st_d strd, r0, r1, r2, r3 ++.macro st_d strd, r0, r1 + st1 {\r0\().d}[0], [x0], \strd + st1 {\r0\().d}[1], [x8], \strd + .ifnb \r1 +@@ -644,13 +647,13 @@ endfunc + st1 {\r1\().d}[1], [x8], \strd + .endif + .endm +-.macro shift_store_4 type, strd, r0, r1, r2, r3 ++.macro shift_store_4 type, strd, r0, r1 + .ifc \type, put +- sqrshrun_b 6, \r0, \r1, \r2, \r3 +- st_s \strd, \r0, \r1, \r2, \r3 ++ sqrshrun_b 6, \r0, \r1 ++ st_s \strd, \r0, \r1 + .else +- srshr_h 2, \r0, \r1, \r2, \r3 +- st_d \strd, \r0, \r1, \r2, \r3 ++ srshr_h 2, \r0, \r1 ++ st_d \strd, \r0, \r1 + .endif + .endm + .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 +@@ -742,7 +745,7 @@ function \type\()_8tap + + L(\type\()_8tap_h): + cmp \w, #4 +- ubfm w9, \mx, #7, #13 ++ ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7f + b.le 4f + mov \mx, w9 +@@ -965,7 +968,7 @@ L(\type\()_8tap_h_tbl): + + L(\type\()_8tap_v): + cmp \h, #4 +- ubfm w9, \my, #7, #13 ++ ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +@@ -1216,7 +1219,7 @@ L(\type\()_8tap_v): + 160: + b.gt 1680b + +- // 16x4 v ++ // 16x2, 16x4 v + add \xmy, \xmy, #2 + ld1 {v0.s}[0], [\xmy] + sub \src, \src, \s_strd +@@ -1269,7 +1272,7 @@ L(\type\()_8tap_v_tbl): + + L(\type\()_8tap_hv): + cmp \h, #4 +- ubfm w9, \my, #7, #13 ++ ubfx w9, \my, #7, #7 + and \my, \my, #0x7f + b.le 4f + mov \my, w9 +@@ -1304,21 +1307,19 @@ L(\type\()_8tap_hv): + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + mul v29.4h, v29.4h, v0.4h +- addv h28, v28.4h +- addv h29, v29.4h +- trn1 v16.4h, v28.4h, v29.4h +- srshr v16.4h, v16.4h, #2 ++ addp v28.4h, v28.4h, v29.4h ++ addp v16.4h, v28.4h, v28.4h ++ srshr v16.4h, v16.4h, #2 + bl L(\type\()_8tap_filter_2) + + trn1 v16.2s, v16.2s, v28.2s +- trn1 v17.2s, v28.2s, v30.2s +- mov v18.8b, v30.8b ++ mov v17.8b, v28.8b + + 2: + bl L(\type\()_8tap_filter_2) + +- trn1 v18.2s, v18.2s, v28.2s +- trn1 v19.2s, v28.2s, v30.2s ++ ext v18.8b, v17.8b, v28.8b, #4 ++ mov v19.8b, v28.8b + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] +@@ -1332,7 +1333,6 @@ L(\type\()_8tap_hv): + b.le 0f + mov v16.8b, v18.8b + mov v17.8b, v19.8b +- mov v18.8b, v30.8b + b 2b + + 280: // 2x8, 2x16, 2x32 hv +@@ -1352,28 +1352,24 @@ L(\type\()_8tap_hv): + ext v29.16b, v28.16b, v28.16b, #2 + mul v28.4h, v28.4h, v0.4h + 
mul v29.4h, v29.4h, v0.4h +- addv h28, v28.4h +- addv h29, v29.4h +- trn1 v16.4h, v28.4h, v29.4h +- srshr v16.4h, v16.4h, #2 ++ addp v28.4h, v28.4h, v29.4h ++ addp v16.4h, v28.4h, v28.4h ++ srshr v16.4h, v16.4h, #2 + + bl L(\type\()_8tap_filter_2) + trn1 v16.2s, v16.2s, v28.2s +- trn1 v17.2s, v28.2s, v30.2s +- mov v18.8b, v30.8b ++ mov v17.8b, v28.8b + bl L(\type\()_8tap_filter_2) +- trn1 v18.2s, v18.2s, v28.2s +- trn1 v19.2s, v28.2s, v30.2s +- mov v20.8b, v30.8b ++ ext v18.8b, v17.8b, v28.8b, #4 ++ mov v19.8b, v28.8b + bl L(\type\()_8tap_filter_2) +- trn1 v20.2s, v20.2s, v28.2s +- trn1 v21.2s, v28.2s, v30.2s +- mov v22.8b, v30.8b ++ ext v20.8b, v19.8b, v28.8b, #4 ++ mov v21.8b, v28.8b + + 28: + bl L(\type\()_8tap_filter_2) +- trn1 v22.2s, v22.2s, v28.2s +- trn1 v23.2s, v28.2s, v30.2s ++ ext v22.8b, v21.8b, v28.8b, #4 ++ mov v23.8b, v28.8b + smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] +@@ -1395,7 +1391,6 @@ L(\type\()_8tap_hv): + mov v19.8b, v21.8b + mov v20.8b, v22.8b + mov v21.8b, v23.8b +- mov v22.8b, v30.8b + b 28b + + 0: +@@ -1417,7 +1412,6 @@ L(\type\()_8tap_filter_2): + mla v27.4h, v30.4h, v0.h[2] + mla v27.4h, v31.4h, v0.h[3] + srshr v28.4h, v27.4h, #2 +- trn2 v30.2s, v28.2s, v28.2s + ret + .endif + +@@ -1453,14 +1447,17 @@ L(\type\()_8tap_filter_2): + mov v18.8b, v29.8b + + 4: +- smull v2.4s, v16.4h, v1.h[0] + bl L(\type\()_8tap_filter_4) +- smull v3.4s, v17.4h, v1.h[0] ++ // Interleaving the mul/mla chains actually hurts performance ++ // significantly on Cortex A53, thus keeping mul/mla tightly ++ // chained like this. ++ smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] +- smlal v3.4s, v18.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] +- smlal v3.4s, v28.4h, v1.h[2] + smlal v2.4s, v28.4h, v1.h[3] ++ smull v3.4s, v17.4h, v1.h[0] ++ smlal v3.4s, v18.4h, v1.h[1] ++ smlal v3.4s, v28.4h, v1.h[2] + smlal v3.4s, v29.4h, v1.h[3] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv +@@ -1514,22 +1511,22 @@ L(\type\()_8tap_filter_2): + mov v22.8b, v29.8b + + 48: +- smull v2.4s, v16.4h, v1.h[0] + bl L(\type\()_8tap_filter_4) +- smull v3.4s, v17.4h, v1.h[0] ++ smull v2.4s, v16.4h, v1.h[0] + smlal v2.4s, v17.4h, v1.h[1] +- smlal v3.4s, v18.4h, v1.h[1] + smlal v2.4s, v18.4h, v1.h[2] +- smlal v3.4s, v19.4h, v1.h[2] + smlal v2.4s, v19.4h, v1.h[3] +- smlal v3.4s, v20.4h, v1.h[3] + smlal v2.4s, v20.4h, v1.h[4] +- smlal v3.4s, v21.4h, v1.h[4] + smlal v2.4s, v21.4h, v1.h[5] +- smlal v3.4s, v22.4h, v1.h[5] + smlal v2.4s, v22.4h, v1.h[6] +- smlal v3.4s, v28.4h, v1.h[6] + smlal v2.4s, v28.4h, v1.h[7] ++ smull v3.4s, v17.4h, v1.h[0] ++ smlal v3.4s, v18.4h, v1.h[1] ++ smlal v3.4s, v19.4h, v1.h[2] ++ smlal v3.4s, v20.4h, v1.h[3] ++ smlal v3.4s, v21.4h, v1.h[4] ++ smlal v3.4s, v22.4h, v1.h[5] ++ smlal v3.4s, v28.4h, v1.h[6] + smlal v3.4s, v29.4h, v1.h[7] + sqrshrn v2.4h, v2.4s, #\shift_hv + sqrshrn v3.4h, v3.4s, #\shift_hv +@@ -2331,3 +2328,191 @@ endfunc + + filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 + filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 ++ ++.macro load_filter_row dst, src, inc ++ asr w13, \src, #10 ++ ldr \dst, [x11, w13, sxtw #3] ++ add \src, \src, \inc ++.endm ++ ++function warp_filter_horz ++ add w12, w5, #512 ++ ++ ld1 {v16.8b, v17.8b}, [x2], x3 ++ ++ load_filter_row d0, w12, w7 ++ load_filter_row d1, w12, w7 ++ load_filter_row d2, w12, w7 ++ sxtl v0.8h, v0.8b ++ load_filter_row d3, w12, w7 ++ sxtl v1.8h, v1.8b ++ load_filter_row d4, w12, w7 ++ sxtl v2.8h, v2.8b ++ 
load_filter_row d5, w12, w7 ++ sxtl v3.8h, v3.8b ++ load_filter_row d6, w12, w7 ++ sxtl v4.8h, v4.8b ++ load_filter_row d7, w12, w7 ++ sxtl v5.8h, v5.8b ++ sxtl v6.8h, v6.8b ++ sxtl v7.8h, v7.8b ++ ++ uxtl v16.8h, v16.8b ++ uxtl v17.8h, v17.8b ++ ++ ext v18.16b, v16.16b, v17.16b, #2*1 ++ mul v23.8h, v16.8h, v0.8h ++ ext v19.16b, v16.16b, v17.16b, #2*2 ++ mul v18.8h, v18.8h, v1.8h ++ ext v20.16b, v16.16b, v17.16b, #2*3 ++ mul v19.8h, v19.8h, v2.8h ++ ext v21.16b, v16.16b, v17.16b, #2*4 ++ saddlp v23.4s, v23.8h ++ mul v20.8h, v20.8h, v3.8h ++ ext v22.16b, v16.16b, v17.16b, #2*5 ++ saddlp v18.4s, v18.8h ++ mul v21.8h, v21.8h, v4.8h ++ saddlp v19.4s, v19.8h ++ mul v22.8h, v22.8h, v5.8h ++ saddlp v20.4s, v20.8h ++ addv s23, v23.4s ++ saddlp v21.4s, v21.8h ++ addv s18, v18.4s ++ saddlp v22.4s, v22.8h ++ addv s19, v19.4s ++ trn1 v18.2s, v23.2s, v18.2s ++ addv s20, v20.4s ++ ext v23.16b, v16.16b, v17.16b, #2*6 ++ trn1 v19.2s, v19.2s, v20.2s ++ addv s21, v21.4s ++ mul v23.8h, v23.8h, v6.8h ++ ext v20.16b, v16.16b, v17.16b, #2*7 ++ addv s22, v22.4s ++ mul v20.8h, v20.8h, v7.8h ++ saddlp v23.4s, v23.8h ++ trn1 v21.2s, v21.2s, v22.2s ++ saddlp v20.4s, v20.8h ++ addv s23, v23.4s ++ addv s20, v20.4s ++ trn1 v20.2s, v23.2s, v20.2s ++ trn1 v18.2d, v18.2d, v19.2d ++ trn1 v20.2d, v21.2d, v20.2d ++ ++ add w5, w5, w8 ++ ++ rshrn v16.4h, v18.4s, #3 ++ rshrn2 v16.8h, v20.4s, #3 ++ ++ ret ++endfunc ++ ++// void dav1d_warp_affine_8x8_8bpc_neon( ++// pixel *dst, const ptrdiff_t dst_stride, ++// const pixel *src, const ptrdiff_t src_stride, ++// const int16_t *const abcd, int mx, int my) ++.macro warp t, shift ++function warp_affine_8x8\t\()_8bpc_neon, export=1 ++ ldr x4, [x4] ++ ubfx x7, x4, #0, #16 ++ ubfx x8, x4, #16, #16 ++ ubfx x9, x4, #32, #16 ++ ubfx x4, x4, #48, #16 ++ sxth w7, w7 ++ sxth w8, w8 ++ sxth w9, w9 ++ sxth w4, w4 ++ mov w10, #8 ++ sub x2, x2, x3, lsl #1 ++ sub x2, x2, x3 ++ sub x2, x2, #3 ++ movrel x11, X(mc_warp_filter), 64*8 ++ mov x15, x30 ++.ifnb \t ++ lsl x1, x1, #1 ++.endif ++ ++ bl warp_filter_horz ++ mov v24.16b, v16.16b ++ bl warp_filter_horz ++ mov v25.16b, v16.16b ++ bl warp_filter_horz ++ mov v26.16b, v16.16b ++ bl warp_filter_horz ++ mov v27.16b, v16.16b ++ bl warp_filter_horz ++ mov v28.16b, v16.16b ++ bl warp_filter_horz ++ mov v29.16b, v16.16b ++ bl warp_filter_horz ++ mov v30.16b, v16.16b ++ ++1: ++ add w14, w6, #512 ++ bl warp_filter_horz ++ mov v31.16b, v16.16b ++ ++ load_filter_row d0, w14, w9 ++ load_filter_row d1, w14, w9 ++ load_filter_row d2, w14, w9 ++ load_filter_row d3, w14, w9 ++ load_filter_row d4, w14, w9 ++ load_filter_row d5, w14, w9 ++ load_filter_row d6, w14, w9 ++ load_filter_row d7, w14, w9 ++ transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 ++ sxtl v0.8h, v0.8b ++ sxtl v1.8h, v1.8b ++ sxtl v2.8h, v2.8b ++ sxtl v3.8h, v3.8b ++ sxtl v4.8h, v4.8b ++ sxtl v5.8h, v5.8b ++ sxtl v6.8h, v6.8b ++ sxtl v7.8h, v7.8b ++ ++ // This ordering of smull/smlal/smull2/smlal2 is highly ++ // beneficial for Cortex A53 here. 
++ smull v16.4s, v24.4h, v0.4h ++ smlal v16.4s, v25.4h, v1.4h ++ smlal v16.4s, v26.4h, v2.4h ++ smlal v16.4s, v27.4h, v3.4h ++ smlal v16.4s, v28.4h, v4.4h ++ smlal v16.4s, v29.4h, v5.4h ++ smlal v16.4s, v30.4h, v6.4h ++ smlal v16.4s, v31.4h, v7.4h ++ smull2 v17.4s, v24.8h, v0.8h ++ smlal2 v17.4s, v25.8h, v1.8h ++ smlal2 v17.4s, v26.8h, v2.8h ++ smlal2 v17.4s, v27.8h, v3.8h ++ smlal2 v17.4s, v28.8h, v4.8h ++ smlal2 v17.4s, v29.8h, v5.8h ++ smlal2 v17.4s, v30.8h, v6.8h ++ smlal2 v17.4s, v31.8h, v7.8h ++ ++ mov v24.16b, v25.16b ++ mov v25.16b, v26.16b ++ sqrshrn v16.4h, v16.4s, #\shift ++ mov v26.16b, v27.16b ++ sqrshrn2 v16.8h, v17.4s, #\shift ++ mov v27.16b, v28.16b ++ mov v28.16b, v29.16b ++.ifb \t ++ sqxtun v16.8b, v16.8h ++.endif ++ mov v29.16b, v30.16b ++ mov v30.16b, v31.16b ++ subs w10, w10, #1 ++.ifnb \t ++ st1 {v16.8h}, [x0], x1 ++.else ++ st1 {v16.8b}, [x0], x1 ++.endif ++ ++ add w6, w6, w4 ++ b.gt 1b ++ ++ br x15 ++endfunc ++.endm ++ ++warp , 11 ++warp t, 7 +diff --git third_party/dav1d/src/arm/64/util.S third_party/dav1d/src/arm/64/util.S +index fa8f0bea2a55..d7857be4e039 100644 +--- third_party/dav1d/src/arm/64/util.S ++++ third_party/dav1d/src/arm/64/util.S +@@ -26,8 +26,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +-#ifndef __DAVID_SRC_ARM_64_UTIL_S__ +-#define __DAVID_SRC_ARM_64_UTIL_S__ ++#ifndef DAV1D_SRC_ARM_64_UTIL_S ++#define DAV1D_SRC_ARM_64_UTIL_S + + #include "config.h" + #include "src/arm/asm.S" +@@ -59,4 +59,33 @@ + #endif + .endm + +-#endif /* __DAVID_SRC_ARM_64_UTIL_S__ */ ++.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 ++ trn1 \r8\().8b, \r0\().8b, \r1\().8b ++ trn2 \r9\().8b, \r0\().8b, \r1\().8b ++ trn1 \r1\().8b, \r2\().8b, \r3\().8b ++ trn2 \r3\().8b, \r2\().8b, \r3\().8b ++ trn1 \r0\().8b, \r4\().8b, \r5\().8b ++ trn2 \r5\().8b, \r4\().8b, \r5\().8b ++ trn1 \r2\().8b, \r6\().8b, \r7\().8b ++ trn2 \r7\().8b, \r6\().8b, \r7\().8b ++ ++ trn1 \r4\().4h, \r0\().4h, \r2\().4h ++ trn2 \r2\().4h, \r0\().4h, \r2\().4h ++ trn1 \r6\().4h, \r5\().4h, \r7\().4h ++ trn2 \r7\().4h, \r5\().4h, \r7\().4h ++ trn1 \r5\().4h, \r9\().4h, \r3\().4h ++ trn2 \r9\().4h, \r9\().4h, \r3\().4h ++ trn1 \r3\().4h, \r8\().4h, \r1\().4h ++ trn2 \r8\().4h, \r8\().4h, \r1\().4h ++ ++ trn1 \r0\().2s, \r3\().2s, \r4\().2s ++ trn2 \r4\().2s, \r3\().2s, \r4\().2s ++ trn1 \r1\().2s, \r5\().2s, \r6\().2s ++ trn2 \r5\().2s, \r5\().2s, \r6\().2s ++ trn2 \r6\().2s, \r8\().2s, \r2\().2s ++ trn1 \r2\().2s, \r8\().2s, \r2\().2s ++ trn1 \r3\().2s, \r9\().2s, \r7\().2s ++ trn2 \r7\().2s, \r9\().2s, \r7\().2s ++.endm ++ ++#endif /* DAV1D_SRC_ARM_64_UTIL_S */ +diff --git third_party/dav1d/src/arm/asm.S third_party/dav1d/src/arm/asm.S +index e5722cf2ff2e..682e6ad805e2 100644 +--- third_party/dav1d/src/arm/asm.S ++++ third_party/dav1d/src/arm/asm.S +@@ -25,14 +25,15 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_ARM_ASM_S__ +-#define __DAV1D_SRC_ARM_ASM_S__ ++#ifndef DAV1D_SRC_ARM_ASM_S ++#define DAV1D_SRC_ARM_ASM_S + + #include "config.h" + + #if ARCH_ARM + .syntax unified + #ifdef __ELF__ ++ .arch armv7-a + .fpu neon + .eabi_attribute 10, 0 // suppress Tag_FP_arch + .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch +@@ -114,7 +115,9 @@ EXTERN\name: + #endif + .purgem endconst + .endm +-#if !defined(__MACH__) ++#if defined(_WIN32) ++ .section .rdata ++#elif !defined(__MACH__) + .section .rodata + #else + .const_data +@@ -131,4 +134,4 @@ EXTERN\name: + + #define X(x) CONCAT(EXTERN, x) + +-#endif /* __DAV1D_SRC_ARM_ASM_S__ */ ++#endif /* DAV1D_SRC_ARM_ASM_S */ +diff --git third_party/dav1d/src/arm/cdef_init_tmpl.c third_party/dav1d/src/arm/cdef_init_tmpl.c +new file mode 100644 +index 000000000000..a7d58ff8fa35 +--- /dev/null ++++ third_party/dav1d/src/arm/cdef_init_tmpl.c +@@ -0,0 +1,86 @@ ++/* ++ * Copyright © 2018, VideoLAN and dav1d authors ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include "common/attributes.h" ++#include "src/cpu.h" ++#include "src/cdef.h" ++ ++#if BITDEPTH == 8 && ARCH_AARCH64 ++decl_cdef_dir_fn(dav1d_cdef_find_dir_neon); ++ ++void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src, ++ ptrdiff_t src_stride, const pixel (*left)[2], ++ /*const*/ pixel *const top[2], int h, ++ enum CdefEdgeFlags edges); ++void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src, ++ ptrdiff_t src_stride, const pixel (*left)[2], ++ /*const*/ pixel *const top[2], int h, ++ enum CdefEdgeFlags edges); ++ ++void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride, ++ const uint16_t *tmp, int pri_strength, ++ int sec_strength, int dir, int damping, int h); ++void dav1d_cdef_filter8_neon(pixel *dst, ptrdiff_t dst_stride, ++ const uint16_t *tmp, int pri_strength, ++ int sec_strength, int dir, int damping, int h); ++ ++#define DEFINE_FILTER(w, h, tmp_stride) \ ++static void \ ++cdef_filter_##w##x##h##_neon(pixel *dst, \ ++ const ptrdiff_t stride, \ ++ const pixel (*left)[2], \ ++ /*const*/ pixel *const top[2], \ ++ const int pri_strength, \ ++ const int sec_strength, \ ++ const int dir, \ ++ const int damping, \ ++ const enum CdefEdgeFlags edges) \ ++{ \ ++ ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride,); \ ++ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ ++ dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \ ++ dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \ ++ sec_strength, dir, damping, h); \ ++} ++ ++DEFINE_FILTER(8, 8, 16) ++DEFINE_FILTER(4, 8, 8) ++DEFINE_FILTER(4, 4, 8) ++#endif ++ ++ ++void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { ++ const unsigned flags = dav1d_get_cpu_flags(); ++ ++ if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; ++ ++#if BITDEPTH == 8 && ARCH_AARCH64 ++ c->dir = dav1d_cdef_find_dir_neon; ++ c->fb[0] = cdef_filter_8x8_neon; ++ c->fb[1] = cdef_filter_4x8_neon; ++ c->fb[2] = cdef_filter_4x4_neon; ++#endif ++} +diff --git third_party/dav1d/src/arm/cpu.c third_party/dav1d/src/arm/cpu.c +index a78e347f0dd2..e2767a04d65e 100644 +--- third_party/dav1d/src/arm/cpu.c ++++ third_party/dav1d/src/arm/cpu.c +@@ -62,7 +62,7 @@ static unsigned parse_proc_cpuinfo(const char *flag) { + // if line is incomplete seek back to avoid splitting the search + // string into two buffers + if (!strchr(line, '\n') && strlen(line) > strlen(flag)) { +- if (fseek(file, -strlen(flag), SEEK_CUR)) ++ if (fseeko(file, -strlen(flag), SEEK_CUR)) + break; + } + } +diff --git third_party/dav1d/src/arm/cpu.h third_party/dav1d/src/arm/cpu.h +index 4788901501f9..8c10a1b6b04b 100644 +--- third_party/dav1d/src/arm/cpu.h ++++ third_party/dav1d/src/arm/cpu.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_ARM_CPU_H__ +-#define __DAV1D_SRC_ARM_CPU_H__ ++#ifndef DAV1D_SRC_ARM_CPU_H ++#define DAV1D_SRC_ARM_CPU_H + + enum CpuFlags { + DAV1D_ARM_CPU_FLAG_NEON = 1 << 0, +@@ -34,4 +34,4 @@ enum CpuFlags { + + unsigned dav1d_get_cpu_flags_arm(void); + +-#endif /* __DAV1D_SRC_ARM_CPU_H__ */ ++#endif /* DAV1D_SRC_ARM_CPU_H */ +diff --git third_party/dav1d/src/arm/looprestoration_init_tmpl.c third_party/dav1d/src/arm/looprestoration_init_tmpl.c +index e4f27d37bfc1..baaad3c0c46f 100644 +--- third_party/dav1d/src/arm/looprestoration_init_tmpl.c ++++ third_party/dav1d/src/arm/looprestoration_init_tmpl.c +@@ -29,10 +29,8 @@ + #include "src/looprestoration.h" + + #include "common/attributes.h" +-#include "common/intops.h" +-#include "src/tables.h" + +-#if BITDEPTH == 8 && ARCH_AARCH64 ++#if BITDEPTH == 8 + // This calculates things slightly differently than the reference C version. + // This version calculates roughly this: + // int16_t sum = 0; +@@ -66,7 +64,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride, + const int w, const int h, const int16_t fh[7], + const int16_t fv[7], const enum LrEdgeFlags edges) + { +- ALIGN_STK_32(int16_t, mid, 68 * 384,); ++ ALIGN_STK_16(int16_t, mid, 68 * 384,); + int mid_stride = (w + 7) & ~7; + + // Horizontal filter +@@ -100,7 +98,7 @@ void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext * + + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + +-#if BITDEPTH == 8 && ARCH_AARCH64 ++#if BITDEPTH == 8 + c->wiener = wiener_filter_neon; + #endif + } +diff --git third_party/dav1d/src/arm/mc_init_tmpl.c third_party/dav1d/src/arm/mc_init_tmpl.c +index 0e348540b937..0e9faca97fa9 100644 +--- third_party/dav1d/src/arm/mc_init_tmpl.c ++++ third_party/dav1d/src/arm/mc_init_tmpl.c +@@ -56,6 +56,9 @@ decl_avg_fn(dav1d_avg_8bpc_neon); + decl_w_avg_fn(dav1d_w_avg_8bpc_neon); + decl_mask_fn(dav1d_mask_8bpc_neon); + ++decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon); ++decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon); ++ + void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { + #define init_mc_fn(type, name, suffix) \ + c->mc[type] = dav1d_put_##name##_8bpc_##suffix +@@ -66,7 +69,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { + if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + + #if BITDEPTH == 8 +-#if ARCH_AARCH64 + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); +@@ -88,10 +90,13 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); +-#endif + + c->avg = dav1d_avg_8bpc_neon; + c->w_avg = dav1d_w_avg_8bpc_neon; + c->mask = dav1d_mask_8bpc_neon; ++#if ARCH_AARCH64 ++ c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon; ++ c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon; ++#endif + #endif + } +diff --git third_party/dav1d/src/cdef.h third_party/dav1d/src/cdef.h +index 4e7f6bf5daa6..238d9810ed90 100644 +--- third_party/dav1d/src/cdef.h ++++ third_party/dav1d/src/cdef.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_CDEF_H__ +-#define __DAV1D_SRC_CDEF_H__ ++#ifndef DAV1D_SRC_CDEF_H ++#define DAV1D_SRC_CDEF_H + + #include + #include +@@ -34,10 +34,10 @@ + #include "common/bitdepth.h" + + enum CdefEdgeFlags { +- HAVE_LEFT = 1 << 0, +- HAVE_RIGHT = 1 << 1, +- HAVE_TOP = 1 << 2, +- HAVE_BOTTOM = 1 << 3, ++ CDEF_HAVE_LEFT = 1 << 0, ++ CDEF_HAVE_RIGHT = 1 << 1, ++ CDEF_HAVE_TOP = 1 << 2, ++ CDEF_HAVE_BOTTOM = 1 << 3, + }; + + #ifdef BITDEPTH +@@ -66,6 +66,7 @@ typedef struct Dav1dCdefDSPContext { + } Dav1dCdefDSPContext; + + bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c); ++bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c); + bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c); + +-#endif /* __DAV1D_SRC_CDEF_H__ */ ++#endif /* DAV1D_SRC_CDEF_H */ +diff --git third_party/dav1d/src/cdef_apply.h third_party/dav1d/src/cdef_apply.h +index 912812d71280..ffdffba055ba 100644 +--- third_party/dav1d/src/cdef_apply.h ++++ third_party/dav1d/src/cdef_apply.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_CDEF_APPLY_H__ +-#define __DAV1D_SRC_CDEF_APPLY_H__ ++#ifndef DAV1D_SRC_CDEF_APPLY_H ++#define DAV1D_SRC_CDEF_APPLY_H + + #include "common/bitdepth.h" + +@@ -35,4 +35,4 @@ + void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3], + const Av1Filter *lflvl, int by_start, int by_end); + +-#endif /* __DAV1D_SRC_CDEF_APPLY_H__ */ ++#endif /* DAV1D_SRC_CDEF_APPLY_H */ +diff --git third_party/dav1d/src/cdef_apply_tmpl.c third_party/dav1d/src/cdef_apply_tmpl.c +index 63f5b1ca6c11..7fe269e758a4 100644 +--- third_party/dav1d/src/cdef_apply_tmpl.c ++++ third_party/dav1d/src/cdef_apply_tmpl.c +@@ -58,7 +58,8 @@ static void backup2x8(pixel dst[3][8][2], + const ptrdiff_t src_stride[2], int x_off, + const enum Dav1dPixelLayout layout) + { +- for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0])) ++ ptrdiff_t y_off = 0; ++ for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0])) + pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2); + + if (layout == DAV1D_PIXEL_LAYOUT_I400) return; +@@ -66,7 +67,8 @@ static void backup2x8(pixel dst[3][8][2], + const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + + x_off >>= ss_hor; +- for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) { ++ y_off = 0; ++ for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) { + pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2); + pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2); + } +@@ -85,7 +87,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + { + const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8; + const Dav1dDSPContext *const dsp = f->dsp; +- enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0); ++ enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0); + pixel *ptrs[3] = { p[0], p[1], p[2] }; + const int sbsz = 16; + const int sb64w = f->sb128w << 1; +@@ -101,11 +103,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + // the backup of pre-filter data is empty, and the restore is therefore + // unnecessary as well. 
+ +- for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) { ++ for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) { + const int tf = f->lf.top_pre_cdef_toggle; +- if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM; ++ if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM; + +- if (edges & HAVE_BOTTOM) { ++ if (edges & CDEF_HAVE_BOTTOM) { + // backup pre-filter data for next iteration + backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride, + 8, f->bw * 4, layout); +@@ -113,9 +115,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + + pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */]; + pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] }; +- edges &= ~HAVE_LEFT; +- edges |= HAVE_RIGHT; +- for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) { ++ edges &= ~CDEF_HAVE_LEFT; ++ edges |= CDEF_HAVE_RIGHT; ++ for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) { + const int sb128x = sbx >>1; + const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1); + const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx]; +@@ -131,9 +133,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx]; + pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] }; + for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw); +- bx += 2, edges |= HAVE_LEFT) ++ bx += 2, edges |= CDEF_HAVE_LEFT) + { +- if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT; ++ if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT; + + // check if this 8x8 block had any coded coefficients; if not, + // go to the next block +@@ -146,12 +148,12 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, + goto next_b; + } + +- if (last_skip && edges & HAVE_LEFT) { ++ if (last_skip && edges & CDEF_HAVE_LEFT) { + // we didn't backup the prefilter data because it wasn't + // there, so do it here instead + backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout); + } +- if (edges & HAVE_RIGHT) { ++ if (edges & CDEF_HAVE_RIGHT) { + // backup pre-filter data for next iteration + backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout); + } +diff --git third_party/dav1d/src/cdef_tmpl.c third_party/dav1d/src/cdef_tmpl.c +index ce6493fdd20c..d7fde6abdf28 100644 +--- third_party/dav1d/src/cdef_tmpl.c ++++ third_party/dav1d/src/cdef_tmpl.c +@@ -61,19 +61,19 @@ static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride, + { + // fill extended input buffer + int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2; +- if (!(edges & HAVE_TOP)) { ++ if (!(edges & CDEF_HAVE_TOP)) { + fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2); + y_start = 0; + } +- if (!(edges & HAVE_BOTTOM)) { ++ if (!(edges & CDEF_HAVE_BOTTOM)) { + fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2); + y_end -= 2; + } +- if (!(edges & HAVE_LEFT)) { ++ if (!(edges & CDEF_HAVE_LEFT)) { + fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start); + x_start = 0; + } +- if (!(edges & HAVE_RIGHT)) { ++ if (!(edges & CDEF_HAVE_RIGHT)) { + fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start); + x_end -= 2; + } +@@ -110,14 +110,12 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, + { 1 * 12 + 0, 2 * 12 + 0 }, + { 1 * 12 + 0, 2 * 12 - 1 } + }; +- static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; +- static const uint8_t sec_taps[2] = { 2, 1 }; + const ptrdiff_t tmp_stride = 12; + assert((w == 4 || w == 8) && (h == 4 || h == 8)); + uint16_t tmp_buf[144]; // 12*12 is the maximum value of 
tmp_stride * (h + 4) + uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; + const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; +- const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> bitdepth_min_8) & 1]; ++ const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1); + + padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges); + +@@ -127,12 +125,15 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, + int sum = 0; + const int px = dst[x]; + int max = px, min = px; ++ int pri_tap_k = pri_tap; + for (int k = 0; k < 2; k++) { + const int off1 = cdef_directions[dir][k]; + const int p0 = tmp[x + off1]; + const int p1 = tmp[x - off1]; +- sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping); +- sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping); ++ sum += pri_tap_k * constrain(p0 - px, pri_strength, damping); ++ sum += pri_tap_k * constrain(p1 - px, pri_strength, damping); ++ // if pri_tap_k == 4 then it becomes 2 else it remains 3 ++ pri_tap_k -= (pri_tap_k << 1) - 6; + if (p0 != INT16_MAX) max = imax(p0, max); + if (p1 != INT16_MAX) max = imax(p1, max); + min = imin(p0, min); +@@ -151,10 +152,12 @@ cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride, + min = imin(s1, min); + min = imin(s2, min); + min = imin(s3, min); +- sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping); +- sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping); +- sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping); +- sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping); ++ // sec_tap starts at 2 and becomes 1 ++ const int sec_tap = 2 - k; ++ sum += sec_tap * constrain(s0 - px, sec_strength, damping); ++ sum += sec_tap * constrain(s1 - px, sec_strength, damping); ++ sum += sec_tap * constrain(s2 - px, sec_strength, damping); ++ sum += sec_tap * constrain(s3 - px, sec_strength, damping); + } + dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max); + } +@@ -257,7 +260,11 @@ void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { + c->fb[1] = cdef_filter_block_4x8_c; + c->fb[2] = cdef_filter_block_4x4_c; + +-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8 ++#if HAVE_ASM ++#if ARCH_AARCH64 || ARCH_ARM ++ bitfn(dav1d_cdef_dsp_init_arm)(c); ++#elif ARCH_X86 + bitfn(dav1d_cdef_dsp_init_x86)(c); + #endif ++#endif + } +diff --git third_party/dav1d/src/cdf.h third_party/dav1d/src/cdf.h +index 40fae019afef..6d957711e6e3 100644 +--- third_party/dav1d/src/cdf.h ++++ third_party/dav1d/src/cdf.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __AV1_CDF_H__ +-#define __AV1_CDF_H__ ++#ifndef DAV1D_SRC_CDF_H ++#define DAV1D_SRC_CDF_H + + #include + +@@ -148,4 +148,4 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *hdr, CdfContext *dst, + void dav1d_cdf_thread_wait(CdfThreadContext *cdf); + void dav1d_cdf_thread_signal(CdfThreadContext *cdf); + +-#endif /* __AV1_CDF_H__ */ ++#endif /* DAV1D_SRC_CDF_H */ +diff --git third_party/dav1d/src/cpu.h third_party/dav1d/src/cpu.h +index c36c18e364f7..89a64a0db716 100644 +--- third_party/dav1d/src/cpu.h ++++ third_party/dav1d/src/cpu.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_CPU_H__ +-#define __DAV1D_SRC_CPU_H__ ++#ifndef DAV1D_SRC_CPU_H ++#define DAV1D_SRC_CPU_H + + #include "config.h" + +@@ -41,4 +41,4 @@ + unsigned dav1d_get_cpu_flags(void); + DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask); + +-#endif /* __DAV1D_SRC_CPU_H__ */ ++#endif /* DAV1D_SRC_CPU_H */ +diff --git third_party/dav1d/src/ctx.h third_party/dav1d/src/ctx.h +index f6818bf9f2f4..d0e1f310ae2d 100644 +--- third_party/dav1d/src/ctx.h ++++ third_party/dav1d/src/ctx.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_CTX_H__ +-#define __DAV1D_SRC_CTX_H__ ++#ifndef DAV1D_SRC_CTX_H ++#define DAV1D_SRC_CTX_H + + #include + +@@ -88,4 +88,4 @@ union alias8 { uint8_t u8; } ATTR_ALIAS; + default: default_memset(dir, diridx, off, var); break; \ + } + +-#endif /* __DAV1D_SRC_CTX_H__ */ ++#endif /* DAV1D_SRC_CTX_H */ +diff --git third_party/dav1d/src/data.c third_party/dav1d/src/data.c +index 726f7de0d99b..d930b599dce2 100644 +--- third_party/dav1d/src/data.c ++++ third_party/dav1d/src/data.c +@@ -47,11 +47,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { + if (!buf->ref) return NULL; + buf->data = buf->ref->const_data; + buf->sz = buf->m.size = sz; +- buf->m.timestamp = INT64_MIN; +- buf->m.duration = 0; +- buf->m.offset = -1; +- buf->m.user_data.data = NULL; +- buf->m.user_data.ref = NULL; ++ dav1d_data_props_set_defaults(&buf->m); + + return buf->ref->data; + } +@@ -70,11 +66,7 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr, + if (!buf->ref) return -ENOMEM; + buf->data = ptr; + buf->sz = buf->m.size = sz; +- buf->m.timestamp = INT64_MIN; +- buf->m.duration = 0; +- buf->m.offset = -1; +- buf->m.user_data.data = NULL; +- buf->m.user_data.ref = NULL; ++ dav1d_data_props_set_defaults(&buf->m); + + return 0; + } +@@ -132,6 +124,16 @@ void dav1d_data_props_copy(Dav1dDataProps *const dst, + if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref); + } + ++void dav1d_data_props_set_defaults(Dav1dDataProps *const props) { ++ assert(props != NULL); ++ ++ props->timestamp = INT64_MIN; ++ props->duration = 0; ++ props->offset = -1; ++ props->user_data.data = NULL; ++ props->user_data.ref = NULL; ++} ++ + void dav1d_data_unref_internal(Dav1dData *const buf) { + validate_input(buf != NULL); + +diff --git third_party/dav1d/src/data.h third_party/dav1d/src/data.h +index 65f24d6f12d6..6ebb551076ba 100644 +--- third_party/dav1d/src/data.h ++++ third_party/dav1d/src/data.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_DATA_H__ +-#define __DAV1D_SRC_DATA_H__ ++#ifndef DAV1D_SRC_DATA_H ++#define DAV1D_SRC_DATA_H + + #include "dav1d/data.h" + +@@ -43,6 +43,8 @@ void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src); + */ + void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src); + ++void dav1d_data_props_set_defaults(Dav1dDataProps *props); ++ + uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz); + int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz, + void (*free_callback)(const uint8_t *data, +@@ -55,4 +57,4 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *buf, + void *cookie); + void dav1d_data_unref_internal(Dav1dData *buf); + +-#endif /* __DAV1D_SRC_DATA_H__ */ ++#endif /* DAV1D_SRC_DATA_H */ +diff --git third_party/dav1d/src/dav1d.rc.in third_party/dav1d/src/dav1d.rc.in +index cdefd5af3e75..ad6aab481de4 100644 +--- third_party/dav1d/src/dav1d.rc.in ++++ third_party/dav1d/src/dav1d.rc.in +@@ -1,13 +1,15 @@ +-#define VERSION_NUMBER @VERSION_MAJOR@,@VERSION_MINOR@,@VERSION_REVISION@,@VERSION_EXTRA@ +-#define VERSION_NUMBER_STR "@VERSION_MAJOR@.@VERSION_MINOR@.@VERSION_REVISION@.@VERSION_EXTRA@" ++#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0 ++#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@" ++#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0 ++#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@" + + #include + + 1 VERSIONINFO + FILETYPE VFT_DLL + FILEOS VOS_NT_WINDOWS32 +-PRODUCTVERSION VERSION_NUMBER +-FILEVERSION VERSION_NUMBER ++PRODUCTVERSION PROJECT_VERSION_NUMBER ++FILEVERSION API_VERSION_NUMBER + BEGIN + BLOCK "StringFileInfo" + BEGIN +@@ -15,9 +17,9 @@ BEGIN + BEGIN + VALUE "CompanyName", "VideoLAN" + VALUE "ProductName", "dav1d" +- VALUE "ProductVersion", VERSION_NUMBER_STR +- VALUE "FileVersion", VERSION_NUMBER_STR +- VALUE "FileDescription", "dav1d AV1 decoder" ++ VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR ++ VALUE "FileVersion", API_VERSION_NUMBER_STR ++ VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder" + VALUE "InternalName", "dav1d" + VALUE "OriginalFilename", "libdav1d.dll" + VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors" +diff --git third_party/dav1d/src/decode.c third_party/dav1d/src/decode.c +index 3cafdc73da4e..25deffc9de9b 100644 +--- third_party/dav1d/src/decode.c ++++ third_party/dav1d/src/decode.c +@@ -42,6 +42,7 @@ + #include "src/decode.h" + #include "src/dequant_tables.h" + #include "src/env.h" ++#include "src/log.h" + #include "src/qm.h" + #include "src/recon.h" + #include "src/ref.h" +@@ -78,15 +79,18 @@ static int read_mv_component_diff(Dav1dTileContext *const t, + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + const int have_hp = f->frame_hdr->hp; +- const int sign = msac_decode_bool_adapt(&ts->msac, mv_comp->sign); +- const int cl = msac_decode_symbol_adapt(&ts->msac, mv_comp->classes, 11); ++ const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign); ++ const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ mv_comp->classes, 11); + int up, fp, hp; + + if (!cl) { +- up = msac_decode_bool_adapt(&ts->msac, mv_comp->class0); ++ up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0); + if (have_fp) { +- fp = msac_decode_symbol_adapt(&ts->msac, 
mv_comp->class0_fp[up], 4); +- hp = have_hp ? msac_decode_bool_adapt(&ts->msac, mv_comp->class0_hp) : 1; ++ fp = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ mv_comp->class0_fp[up], 4); ++ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, ++ mv_comp->class0_hp) : 1; + } else { + fp = 3; + hp = 1; +@@ -94,10 +98,13 @@ static int read_mv_component_diff(Dav1dTileContext *const t, + } else { + up = 1 << cl; + for (int n = 0; n < cl; n++) +- up |= msac_decode_bool_adapt(&ts->msac, mv_comp->classN[n]) << n; ++ up |= dav1d_msac_decode_bool_adapt(&ts->msac, ++ mv_comp->classN[n]) << n; + if (have_fp) { +- fp = msac_decode_symbol_adapt(&ts->msac, mv_comp->classN_fp, 4); +- hp = have_hp ? msac_decode_bool_adapt(&ts->msac, mv_comp->classN_hp) : 1; ++ fp = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ mv_comp->classN_fp, 4); ++ hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, ++ mv_comp->classN_hp) : 1; + } else { + fp = 3; + hp = 1; +@@ -112,7 +119,9 @@ static int read_mv_component_diff(Dav1dTileContext *const t, + static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, + CdfMvContext *const mv_cdf, const int have_fp) + { +- switch (msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint, N_MV_JOINTS)) { ++ switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint, ++ N_MV_JOINTS)) ++ { + case MV_JOINT_HV: + ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); + ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp); +@@ -144,7 +153,8 @@ static void read_tx_tree(Dav1dTileContext *const t, + const int a = t->a->tx[bx4] < txw; + const int l = t->l.tx[by4] < txh; + +- is_split = msac_decode_bool_adapt(&t->ts->msac, t->ts->cdf.m.txpart[cat][a + l]); ++ is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac, ++ t->ts->cdf.m.txpart[cat][a + l]); + if (is_split) + masks[depth] |= 1 << (y_off * 4 + x_off); + } else { +@@ -302,7 +312,7 @@ static void derive_warpmv(const Dav1dTileContext *const t, + if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) { + const int off = t->bx & (bs(&r[-b4_stride])[0] - 1); + add_sample(-off, 0, 1, -1, &r[-b4_stride]); +- } else for (unsigned off = 0, xmask = masks[0]; np < 8 && xmask;) { // top ++ } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top + const int tz = ctz(xmask); + off += tz; + xmask >>= tz; +@@ -312,7 +322,7 @@ static void derive_warpmv(const Dav1dTileContext *const t, + if (np < 8 && masks[1] == 1) { + const int off = t->by & (bs(&r[-1])[1] - 1); + add_sample(0, -off, -1, 1, &r[-1 - off * b4_stride]); +- } else for (unsigned off = 0, ymask = masks[1]; np < 8 && ymask;) { // left ++ } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left + const int tz = ctz(ymask); + off += tz; + ymask >>= tz; +@@ -330,8 +340,8 @@ static void derive_warpmv(const Dav1dTileContext *const t, + int mvd[8], ret = 0; + const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28); + for (int i = 0; i < np; i++) { +- mvd[i] = labs(pts[i][1][0] - pts[i][0][0] - mv.x) + +- labs(pts[i][1][1] - pts[i][0][1] - mv.y); ++ mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) + ++ abs(pts[i][1][1] - pts[i][0][1] - mv.y); + if (mvd[i] > thresh) + mvd[i] = -1; + else +@@ -369,8 +379,8 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, + { + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; +- const int pal_sz = b->pal_sz[pl] = 2 + msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.pal_sz[pl][sz_ctx], 7); ++ const int pal_sz = 
b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2; + uint16_t cache[16], used_cache[8]; + int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; + int n_cache = 0; +@@ -413,7 +423,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, + // find reused cache entries + int i = 0; + for (int n = 0; n < n_cache && i < pal_sz; n++) +- if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) ++ if (dav1d_msac_decode_bool_equi(&ts->msac)) + used_cache[i++] = cache[n]; + const int n_used_cache = i; + +@@ -422,14 +432,14 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl]; + if (i < pal_sz) { +- int prev = pal[i++] = msac_decode_bools(&ts->msac, f->cur.p.bpc); ++ int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); + + if (i < pal_sz) { +- int bits = f->cur.p.bpc - 3 + msac_decode_bools(&ts->msac, 2); ++ int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); + const int max = (1 << f->cur.p.bpc) - 1; + + do { +- const int delta = msac_decode_bools(&ts->msac, bits); ++ const int delta = dav1d_msac_decode_bools(&ts->msac, bits); + prev = pal[i++] = imin(prev + delta + !pl, max); + if (prev + !pl >= max) { + for (; i < pal_sz; i++) +@@ -477,18 +487,19 @@ static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b, + uint16_t *const pal = f->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2]; +- if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) { +- const int bits = f->cur.p.bpc - 4 + msac_decode_bools(&ts->msac, 2); +- int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.bpc); ++ if (dav1d_msac_decode_bool_equi(&ts->msac)) { ++ const int bits = f->cur.p.bpc - 4 + ++ dav1d_msac_decode_bools(&ts->msac, 2); ++ int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); + const int max = (1 << f->cur.p.bpc) - 1; + for (int i = 1; i < b->pal_sz[1]; i++) { +- int delta = msac_decode_bools(&ts->msac, bits); +- if (delta && msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta = -delta; ++ int delta = dav1d_msac_decode_bools(&ts->msac, bits); ++ if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; + prev = pal[i] = (prev + delta) & max; + } + } else { + for (int i = 0; i < b->pal_sz[1]; i++) +- pal[i] = msac_decode_bools(&ts->msac, f->cur.p.bpc); ++ pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); + } + if (DEBUG_BLOCK_INFO) { + printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); +@@ -574,7 +585,7 @@ static void read_pal_indices(Dav1dTileContext *const t, + { + Dav1dTileState *const ts = t->ts; + const ptrdiff_t stride = bw4 * 4; +- pal_idx[0] = msac_decode_uniform(&ts->msac, b->pal_sz[pl]); ++ pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]); + uint16_t (*const color_map_cdf)[8 + 1] = + ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2]; + for (int i = 1; i < 4 * (w4 + h4) - 1; i++) { +@@ -584,9 +595,8 @@ static void read_pal_indices(Dav1dTileContext *const t, + const int last = imax(0, i - h4 * 4 + 1); + order_palette(pal_idx, stride, i, first, last, order, ctx); + for (int j = first, m = 0; j >= last; j--, m++) { +- const int color_idx = +- msac_decode_symbol_adapt(&ts->msac, color_map_cdf[ctx[m]], +- b->pal_sz[pl]); ++ const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ color_map_cdf[ctx[m]], b->pal_sz[pl]); + 
pal_idx[(i - j) * stride + j] = order[m][color_idx]; + } + } +@@ -781,9 +791,9 @@ static int decode_b(Dav1dTileContext *const t, + seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id]; + } else if (f->frame_hdr->segmentation.seg_data.preskip) { + if (f->frame_hdr->segmentation.temporal && +- (seg_pred = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + +- t->l.seg_pred[by4]]))) ++ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + ++ t->l.seg_pred[by4]]))) + { + // temporal predicted seg_id + if (f->prev_segmap) { +@@ -801,9 +811,9 @@ static int decode_b(Dav1dTileContext *const t, + const unsigned pred_seg_id = + get_cur_frame_segid(t->by, t->bx, have_top, have_left, + &seg_ctx, f->cur_segmap, f->b4_stride); +- const unsigned diff = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.seg_id[seg_ctx], +- DAV1D_MAX_SEGMENTS); ++ const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.seg_id[seg_ctx], ++ DAV1D_MAX_SEGMENTS); + const unsigned last_active_seg_id = + f->frame_hdr->segmentation.seg_data.last_active_segid; + b->seg_id = neg_deinterleave(diff, pred_seg_id, +@@ -827,8 +837,8 @@ static int decode_b(Dav1dTileContext *const t, + f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1) + { + const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4]; +- b->skip_mode = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.skip_mode[smctx]); ++ b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.skip_mode[smctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng); + } else { +@@ -840,7 +850,7 @@ static int decode_b(Dav1dTileContext *const t, + b->skip = 1; + } else { + const int sctx = t->a->skip[bx4] + t->l.skip[by4]; +- b->skip = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]); ++ b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng); + } +@@ -851,9 +861,9 @@ static int decode_b(Dav1dTileContext *const t, + !f->frame_hdr->segmentation.seg_data.preskip) + { + if (!b->skip && f->frame_hdr->segmentation.temporal && +- (seg_pred = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + +- t->l.seg_pred[by4]]))) ++ (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.seg_pred[t->a->seg_pred[bx4] + ++ t->l.seg_pred[by4]]))) + { + // temporal predicted seg_id + if (f->prev_segmap) { +@@ -873,9 +883,9 @@ static int decode_b(Dav1dTileContext *const t, + if (b->skip) { + b->seg_id = pred_seg_id; + } else { +- const unsigned diff = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.seg_id[seg_ctx], +- DAV1D_MAX_SEGMENTS); ++ const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.seg_id[seg_ctx], ++ DAV1D_MAX_SEGMENTS); + const unsigned last_active_seg_id = + f->frame_hdr->segmentation.seg_data.last_active_segid; + b->seg_id = neg_deinterleave(diff, pred_seg_id, +@@ -897,7 +907,8 @@ static int decode_b(Dav1dTileContext *const t, + const int idx = f->seq_hdr->sb128 ? 
((t->bx & 16) >> 4) + + ((t->by & 16) >> 3) : 0; + if (t->cur_sb_cdef_idx_ptr[idx] == -1) { +- const int v = msac_decode_bools(&ts->msac, f->frame_hdr->cdef.n_bits); ++ const int v = dav1d_msac_decode_bools(&ts->msac, ++ f->frame_hdr->cdef.n_bits); + t->cur_sb_cdef_idx_ptr[idx] = v; + if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v; + if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v; +@@ -921,13 +932,15 @@ static int decode_b(Dav1dTileContext *const t, + memcpy(prev_delta_lf, ts->last_delta_lf, 4); + + if (have_delta_q) { +- int delta_q = msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.delta_q, 4); ++ int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.delta_q, 4); + if (delta_q == 3) { +- const int n_bits = 1 + msac_decode_bools(&ts->msac, 3); +- delta_q = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits); ++ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); ++ delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) + ++ 1 + (1 << n_bits); + } + if (delta_q) { +- if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta_q = -delta_q; ++ if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q; + delta_q *= 1 << f->frame_hdr->delta.q.res_log2; + } + ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255); +@@ -940,16 +953,15 @@ static int decode_b(Dav1dTileContext *const t, + f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1; + + for (int i = 0; i < n_lfs; i++) { +- int delta_lf = +- msac_decode_symbol_adapt(&ts->msac, ++ int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac, + ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4); + if (delta_lf == 3) { +- const int n_bits = 1 + msac_decode_bools(&ts->msac, 3); +- delta_lf = msac_decode_bools(&ts->msac, n_bits) + ++ const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); ++ delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) + + 1 + (1 << n_bits); + } + if (delta_lf) { +- if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) ++ if (dav1d_msac_decode_bool_equi(&ts->msac)) + delta_lf = -delta_lf; + delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2; + } +@@ -987,12 +999,13 @@ static int decode_b(Dav1dTileContext *const t, + } else { + const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]); ++ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.intra[ictx]); + if (DEBUG_BLOCK_INFO) + printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng); + } + } else if (f->frame_hdr->allow_intrabc) { +- b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc); ++ b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc); + if (DEBUG_BLOCK_INFO) + printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng); + } else { +@@ -1005,8 +1018,8 @@ static int decode_b(Dav1dTileContext *const t, + ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] : + ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]] + [dav1d_intra_mode_context[t->l.mode[by4]]]; +- b->y_mode = msac_decode_symbol_adapt(&ts->msac, ymode_cdf, +- N_INTRA_PRED_MODES); ++ b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf, ++ N_INTRA_PRED_MODES); + if (DEBUG_BLOCK_INFO) + printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng); + +@@ -1015,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t, + b->y_mode <= VERT_LEFT_PRED) + { + uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED]; +- const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7); ++ const int angle = 
dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7); + b->y_angle = angle - 3; + } else { + b->y_angle = 0; +@@ -1025,29 +1038,29 @@ static int decode_b(Dav1dTileContext *const t, + const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ? + cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs)); + uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode]; +- b->uv_mode = msac_decode_symbol_adapt(&ts->msac, uvmode_cdf, +- N_UV_INTRA_PRED_MODES - !cfl_allowed); ++ b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf, ++ N_UV_INTRA_PRED_MODES - !cfl_allowed); + if (DEBUG_BLOCK_INFO) + printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng); + + if (b->uv_mode == CFL_PRED) { + #define SIGN(a) (!!(a) + ((a) > 0)) +- const int sign = +- msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.cfl_sign, 8) + 1; ++ const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.cfl_sign, 8) + 1; + const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3; + assert(sign_u == sign / 3); + if (sign_u) { + const int ctx = (sign_u == 2) * 3 + sign_v; +- b->cfl_alpha[0] = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.cfl_alpha[ctx], 16) + 1; ++ b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.cfl_alpha[ctx], 16) + 1; + if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0]; + } else { + b->cfl_alpha[0] = 0; + } + if (sign_v) { + const int ctx = (sign_v == 2) * 3 + sign_u; +- b->cfl_alpha[1] = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.cfl_alpha[ctx], 16) + 1; ++ b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.cfl_alpha[ctx], 16) + 1; + if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1]; + } else { + b->cfl_alpha[1] = 0; +@@ -1060,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t, + b->uv_mode <= VERT_LEFT_PRED) + { + uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED]; +- const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7); ++ const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7); + b->uv_angle = angle - 3; + } else { + b->uv_angle = 0; +@@ -1074,8 +1087,8 @@ static int decode_b(Dav1dTileContext *const t, + const int sz_ctx = b_dim[2] + b_dim[3] - 2; + if (b->y_mode == DC_PRED) { + const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0); +- const int use_y_pal = +- msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_y[sz_ctx][pal_ctx]); ++ const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.pal_y[sz_ctx][pal_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng); + if (use_y_pal) +@@ -1084,8 +1097,8 @@ static int decode_b(Dav1dTileContext *const t, + + if (has_chroma && b->uv_mode == DC_PRED) { + const int pal_ctx = b->pal_sz[0] > 0; +- const int use_uv_pal = +- msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_uv[pal_ctx]); ++ const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.pal_uv[pal_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng); + if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates +@@ -1096,12 +1109,12 @@ static int decode_b(Dav1dTileContext *const t, + if (b->y_mode == DC_PRED && !b->pal_sz[0] && + imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra) + { +- const int is_filter = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.use_filter_intra[bs]); ++ const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.use_filter_intra[bs]); + if (is_filter) { + b->y_mode = 
FILTER_PRED; +- b->y_angle = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.filter_intra, 5); ++ b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.filter_intra, 5); + } + if (DEBUG_BLOCK_INFO) + printf("Post-filterintramode[%d/%d]: r=%d\n", +@@ -1143,8 +1156,8 @@ static int decode_b(Dav1dTileContext *const t, + if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) { + const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4); + uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx]; +- int depth = msac_decode_symbol_adapt(&ts->msac, tx_cdf, +- imin(t_dim->max + 1, 3)); ++ int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf, ++ imin(t_dim->max + 1, 3)); + + while (depth--) { + b->tx = t_dim->sub; +@@ -1356,7 +1369,8 @@ static int decode_b(Dav1dTileContext *const t, + { + const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- is_comp = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp[ctx]); ++ is_comp = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp[ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng); + } else { +@@ -1391,43 +1405,45 @@ static int decode_b(Dav1dTileContext *const t, + } else if (is_comp) { + const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_dir[dir_ctx])) { ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_dir[dir_ctx])) ++ { + // bidir - first reference (fw) + const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_fwd_ref[0][ctx1])) ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_fwd_ref[0][ctx1])) + { + const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_fwd_ref[2][ctx2]); ++ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_fwd_ref[2][ctx2]); + } else { + const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[0] = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_fwd_ref[1][ctx2]); ++ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_fwd_ref[1][ctx2]); + } + + // second reference (bw) + const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_bwd_ref[0][ctx3])) ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_bwd_ref[0][ctx3])) + { + b->ref[1] = 6; + } else { + const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[1] = 4 + msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_bwd_ref[1][ctx4]); ++ b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_bwd_ref[1][ctx4]); + } + } else { + // unidir + const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_uni_ref[0][uctx_p])) ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_uni_ref[0][uctx_p])) + { + b->ref[0] = 4; + b->ref[1] = 6; +@@ -1435,13 +1451,13 @@ static int decode_b(Dav1dTileContext *const t, + const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); + b->ref[0] = 0; +- b->ref[1] = 1 + msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_uni_ref[1][uctx_p1]); ++ b->ref[1] = 1 + 
dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_uni_ref[1][uctx_p1]); + if (b->ref[1] == 2) { + const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[1] += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.comp_uni_ref[2][uctx_p2]); ++ b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.comp_uni_ref[2][uctx_p2]); + } + } + } +@@ -1458,9 +1474,9 @@ static int decode_b(Dav1dTileContext *const t, + ts->tiling.col_end, ts->tiling.row_start, + ts->tiling.row_end, f->libaom_cm); + +- b->inter_mode = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.comp_inter_mode[ctx], +- N_COMP_INTER_PRED_MODES); ++ b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.comp_inter_mode[ctx], ++ N_COMP_INTER_PRED_MODES); + if (DEBUG_BLOCK_INFO) + printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n", + b->inter_mode, ctx, n_mvs, ts->msac.rng); +@@ -1470,12 +1486,12 @@ static int decode_b(Dav1dTileContext *const t, + if (b->inter_mode == NEWMV_NEWMV) { + if (n_mvs > 1) { + const int drl_ctx_v1 = get_drl_context(mvstack, 0); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v1]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v1]); + if (b->drl_idx == 1 && n_mvs > 2) { + const int drl_ctx_v2 = get_drl_context(mvstack, 1); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v2]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v2]); + } + if (DEBUG_BLOCK_INFO) + printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n", +@@ -1485,12 +1501,12 @@ static int decode_b(Dav1dTileContext *const t, + b->drl_idx = 1; + if (n_mvs > 2) { + const int drl_ctx_v2 = get_drl_context(mvstack, 1); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v2]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v2]); + if (b->drl_idx == 2 && n_mvs > 3) { + const int drl_ctx_v3 = get_drl_context(mvstack, 2); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v3]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v3]); + } + if (DEBUG_BLOCK_INFO) + printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n", +@@ -1533,8 +1549,8 @@ static int decode_b(Dav1dTileContext *const t, + if (f->seq_hdr->masked_compound) { + const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4); + +- is_segwedge = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.mask_comp[mask_ctx]); ++ is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.mask_comp[mask_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n", + is_segwedge, mask_ctx, ts->msac.rng); +@@ -1549,8 +1565,8 @@ static int decode_b(Dav1dTileContext *const t, + f->refp[b->ref[1]].p.frame_hdr->frame_offset, + t->a, &t->l, by4, bx4); + b->comp_type = COMP_INTER_WEIGHTED_AVG + +- msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.jnt_comp[jnt_ctx]); ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.jnt_comp[jnt_ctx]); + if (DEBUG_BLOCK_INFO) + printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n", + b->comp_type == COMP_INTER_AVG, +@@ -1564,15 +1580,15 @@ static int decode_b(Dav1dTileContext *const t, + if (wedge_allowed_mask & (1 << bs)) { + const int ctx = dav1d_wedge_ctx_lut[bs]; + b->comp_type = COMP_INTER_WEDGE - +- msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.wedge_comp[ctx]); ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ 
ts->cdf.m.wedge_comp[ctx]); + if (b->comp_type == COMP_INTER_WEDGE) +- b->wedge_idx = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.wedge_idx[ctx], 16); ++ b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.wedge_idx[ctx], 16); + } else { + b->comp_type = COMP_INTER_SEG; + } +- b->mask_sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB); ++ b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac); + if (DEBUG_BLOCK_INFO) + printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n", + b->comp_type == COMP_INTER_WEDGE, +@@ -1589,30 +1605,36 @@ static int decode_b(Dav1dTileContext *const t, + } else { + const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[0][ctx1])) { ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[0][ctx1])) ++ { + const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[1][ctx2])) { ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[1][ctx2])) ++ { + b->ref[0] = 6; + } else { + const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[0] = 4 + msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.ref[5][ctx3]); ++ b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[5][ctx3]); + } + } else { + const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[2][ctx2])) { ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[2][ctx2])) ++ { + const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.ref[4][ctx3]); ++ b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[4][ctx3]); + } else { + const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4, + have_top, have_left); +- b->ref[0] = msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.ref[3][ctx3]); ++ b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.ref[3][ctx3]); + } + } + if (DEBUG_BLOCK_INFO) +@@ -1631,11 +1653,12 @@ static int decode_b(Dav1dTileContext *const t, + + // mode parsing and mv derivation from ref_mvs + if ((seg && (seg->skip || seg->globalmv)) || +- msac_decode_bool_adapt(&ts->msac, ts->cdf.m.newmv_mode[ctx & 7])) ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.newmv_mode[ctx & 7])) + { + if ((seg && (seg->skip || seg->globalmv)) || +- !msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.globalmv_mode[(ctx >> 3) & 1])) ++ !dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.globalmv_mode[(ctx >> 3) & 1])) + { + b->inter_mode = GLOBALMV; + b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]], +@@ -1645,20 +1668,20 @@ static int decode_b(Dav1dTileContext *const t, + f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION; + } else { + has_subpel_filter = 1; +- if (msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.refmv_mode[(ctx >> 4) & 15])) ++ if (dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.refmv_mode[(ctx >> 4) & 15])) + { + b->inter_mode = NEARMV; + b->drl_idx = 1; + if (n_mvs > 2) { + const int drl_ctx_v2 = get_drl_context(mvstack, 1); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v2]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v2]); + if (b->drl_idx == 2 && n_mvs > 3) { + const int drl_ctx_v3 = + get_drl_context(mvstack, 2); +- b->drl_idx += 
msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v3]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v3]); + } + } + } else { +@@ -1683,12 +1706,12 @@ static int decode_b(Dav1dTileContext *const t, + b->drl_idx = 0; + if (n_mvs > 1) { + const int drl_ctx_v1 = get_drl_context(mvstack, 0); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v1]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v1]); + if (b->drl_idx == 1 && n_mvs > 2) { + const int drl_ctx_v2 = get_drl_context(mvstack, 1); +- b->drl_idx += msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.drl_bit[drl_ctx_v2]); ++ b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.drl_bit[drl_ctx_v2]); + } + } + if (n_mvs > 1) { +@@ -1711,18 +1734,19 @@ static int decode_b(Dav1dTileContext *const t, + const int ii_sz_grp = dav1d_ymode_size_context[bs]; + if (f->seq_hdr->inter_intra && + interintra_allowed_mask & (1 << bs) && +- msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra[ii_sz_grp])) ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.interintra[ii_sz_grp])) + { +- b->interintra_mode = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.interintra_mode[ii_sz_grp], +- N_INTER_INTRA_PRED_MODES); ++ b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.interintra_mode[ii_sz_grp], ++ N_INTER_INTRA_PRED_MODES); + const int wedge_ctx = dav1d_wedge_ctx_lut[bs]; + b->interintra_type = INTER_INTRA_BLEND + +- msac_decode_bool_adapt(&ts->msac, +- ts->cdf.m.interintra_wedge[wedge_ctx]); ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.m.interintra_wedge[wedge_ctx]); + if (b->interintra_type == INTER_INTRA_WEDGE) +- b->wedge_idx = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.wedge_idx[wedge_ctx], 16); ++ b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.wedge_idx[wedge_ctx], 16); + } else { + b->interintra_type = INTER_INTRA_NONE; + } +@@ -1754,8 +1778,9 @@ static int decode_b(Dav1dTileContext *const t, + f->frame_hdr->warp_motion && (mask[0] | mask[1]); + + b->motion_mode = allow_warp ? 
+- msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.motion_mode[bs], 3) : +- msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]); ++ dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.motion_mode[bs], 3) : ++ dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]); + if (b->motion_mode == MM_WARP) { + has_subpel_filter = 0; + derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv); +@@ -1792,16 +1817,18 @@ static int decode_b(Dav1dTileContext *const t, + const int comp = b->comp_type != COMP_INTER_NONE; + const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0], + by4, bx4); +- filter[0] = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.filter[0][ctx1], DAV1D_N_SWITCHABLE_FILTERS); ++ filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.filter[0][ctx1], ++ DAV1D_N_SWITCHABLE_FILTERS); + if (f->seq_hdr->dual_filter) { + const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1, + b->ref[0], by4, bx4); + if (DEBUG_BLOCK_INFO) + printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n", + filter[0], ctx1, ts->msac.rng); +- filter[1] = msac_decode_symbol_adapt(&ts->msac, +- ts->cdf.m.filter[1][ctx2], DAV1D_N_SWITCHABLE_FILTERS); ++ filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.filter[1][ctx2], ++ DAV1D_N_SWITCHABLE_FILTERS); + if (DEBUG_BLOCK_INFO) + printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n", + filter[1], ctx2, ts->msac.rng); +@@ -1994,7 +2021,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, + } else { + const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS : + bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS; +- bp = msac_decode_symbol_adapt(&t->ts->msac, pc, n_part); ++ bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part); + if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && + (bp == PARTITION_V || bp == PARTITION_V4 || + bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT)) +@@ -2165,7 +2192,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, + const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; + is_split = b->bl != bl; + } else { +- is_split = msac_decode_bool(&t->ts->msac, gather_top_partition_prob(pc, bl) >> EC_PROB_SHIFT); ++ is_split = dav1d_msac_decode_bool(&t->ts->msac, ++ gather_top_partition_prob(pc, bl)); + if (DEBUG_BLOCK_INFO) + printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n", + f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, +@@ -2193,7 +2221,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, + const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; + is_split = b->bl != bl; + } else { +- is_split = msac_decode_bool(&t->ts->msac, gather_left_partition_prob(pc, bl) >> EC_PROB_SHIFT); ++ is_split = dav1d_msac_decode_bool(&t->ts->msac, ++ gather_left_partition_prob(pc, bl)); + if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split) + return 1; + if (DEBUG_BLOCK_INFO) +@@ -2275,7 +2304,7 @@ static void setup_tile(Dav1dTileState *const ts, + ts->last_qidx = f->frame_hdr->quant.yac; + memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf)); + +- msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); ++ dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update); + + ts->tiling.row = tile_row; + ts->tiling.col = tile_col; +@@ -2336,49 +2365,39 @@ static void read_restoration_info(Dav1dTileContext *const t, + Dav1dTileState *const ts = t->ts; + + if (frame_type == DAV1D_RESTORATION_SWITCHABLE) { +- const int filter = +- msac_decode_symbol_adapt(&ts->msac, +- 
ts->cdf.m.restore_switchable, 3); ++ const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ ts->cdf.m.restore_switchable, 3); + lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ : + DAV1D_RESTORATION_WIENER : +- DAV1D_RESTORATION_NONE; ++ DAV1D_RESTORATION_NONE; + } else { + const unsigned type = +- msac_decode_bool_adapt(&ts->msac, +- frame_type == DAV1D_RESTORATION_WIENER ? +- ts->cdf.m.restore_wiener : +- ts->cdf.m.restore_sgrproj); ++ dav1d_msac_decode_bool_adapt(&ts->msac, ++ frame_type == DAV1D_RESTORATION_WIENER ? ++ ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj); + lr->type = type ? frame_type : DAV1D_RESTORATION_NONE; + } + + if (lr->type == DAV1D_RESTORATION_WIENER) { +- lr->filter_v[0] = +- !p ? msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_v[0] + 5, 16, +- 1) - 5: +- 0; ++ lr->filter_v[0] = p ? 0 : ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5; + lr->filter_v[1] = +- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_v[1] + 23, 32, +- 2) - 23; ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23; + lr->filter_v[2] = +- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_v[2] + 17, 64, +- 3) - 17; +- +- lr->filter_h[0] = +- !p ? msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_h[0] + 5, 16, +- 1) - 5: +- 0; ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17; ++ ++ lr->filter_h[0] = p ? 0 : ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5; + lr->filter_h[1] = +- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_h[1] + 23, 32, +- 2) - 23; ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23; + lr->filter_h[2] = +- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->filter_h[2] + 17, 64, +- 3) - 17; ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17; + memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights)); + ts->lr_ref[p] = lr; + if (DEBUG_BLOCK_INFO) +@@ -2387,18 +2406,16 @@ static void read_restoration_info(Dav1dTileContext *const t, + lr->filter_v[2], lr->filter_h[0], + lr->filter_h[1], lr->filter_h[2], ts->msac.rng); + } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) { +- const unsigned idx = msac_decode_bools(&ts->msac, 4); ++ const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4); + lr->sgr_idx = idx; + lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ? +- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->sgr_weights[0] + 96, 128, +- 4) - 96 : ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : + 0; + lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ? 
+- msac_decode_subexp(&ts->msac, +- ts->lr_ref[p]->sgr_weights[1] + 32, 128, +- 4) - 32 : +- iclip(128 - lr->sgr_weights[0], -32, 95); ++ dav1d_msac_decode_subexp(&ts->msac, ++ ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 : ++ 95; + memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v)); + memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h)); + ts->lr_ref[p] = lr; +@@ -2593,8 +2610,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { + n < f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; n++) + { + Dav1dTileState *const ts = &f->ts[n]; +- pthread_mutex_init(&ts->tile_thread.lock, NULL); +- pthread_cond_init(&ts->tile_thread.cond, NULL); ++ if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error; ++ if (pthread_cond_init(&ts->tile_thread.cond, NULL)) { ++ pthread_mutex_destroy(&ts->tile_thread.lock); ++ goto error; ++ } ++ f->n_ts = n + 1; + } + if (c->n_fc > 1) { + freep(&f->frame_thread.tile_start_off); +@@ -2662,7 +2683,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { + lr_ptr += lr_stride * 12; + } + +- f->lf.lr_line_sz = lr_stride; ++ f->lf.lr_line_sz = (int) lr_stride; + } + + // update allocation for loopfilter masks +@@ -2679,11 +2700,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { + freep(&f->frame_thread.cbi); + dav1d_freep_aligned(&f->frame_thread.cf); + dav1d_freep_aligned(&f->frame_thread.pal_idx); +- freep(&f->frame_thread.pal); ++ dav1d_freep_aligned(&f->frame_thread.pal); + f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) * + f->sb128w * f->sb128h * 32 * 32); +- f->frame_thread.pal = malloc(sizeof(*f->frame_thread.pal) * +- f->sb128w * f->sb128h * 16 * 16); ++ f->frame_thread.pal = ++ dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) * ++ f->sb128w * f->sb128h * 16 * 16, 32); + f->frame_thread.pal_idx = + dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) * + f->sb128w * f->sb128h * 128 * 128 * 2, 32); +@@ -3089,7 +3111,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { + #endif + #undef assign_bitdepth_case + default: +- fprintf(stderr, "Compiled without support for %d-bit decoding\n", ++ dav1d_log(c, "Compiled without support for %d-bit decoding\n", + 8 + 2 * f->seq_hdr->hbd); + res = -ENOPROTOOPT; + goto error; +@@ -3172,23 +3194,24 @@ int dav1d_submit_frame(Dav1dContext *const c) { + } + + // FIXME qsort so tiles are in order (for frame threading) ++ if (f->n_tile_data_alloc < c->n_tile_data) { ++ freep(&f->tile); ++ assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile)); ++ f->tile = malloc(c->n_tile_data * sizeof(*f->tile)); ++ if (!f->tile) goto error; ++ f->n_tile_data_alloc = c->n_tile_data; ++ } + memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile)); + memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile)); + f->n_tile_data = c->n_tile_data; + c->n_tile_data = 0; + + // allocate frame +- res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1], +- f->frame_hdr->height, +- f->seq_hdr, f->seq_hdr_ref, +- f->frame_hdr, f->frame_hdr_ref, +- bpc, &f->tile[0].data.m, +- c->n_fc > 1 ? 
&f->frame_thread.td : NULL, +- f->frame_hdr->show_frame, &c->allocator); ++ res = dav1d_thread_picture_alloc(c, f, bpc); + if (res < 0) goto error; + + if (f->frame_hdr->super_res.enabled) { +- res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p); ++ res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p); + if (res < 0) goto error; + } else { + dav1d_picture_ref(&f->cur, &f->sr_cur.p); +diff --git third_party/dav1d/src/decode.h third_party/dav1d/src/decode.h +index 82b4d512210d..1eae5850a520 100644 +--- third_party/dav1d/src/decode.h ++++ third_party/dav1d/src/decode.h +@@ -25,11 +25,11 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_DECODE_H__ +-#define __DAV1D_SRC_DECODE_H__ ++#ifndef DAV1D_SRC_DECODE_H ++#define DAV1D_SRC_DECODE_H + + #include "src/internal.h" + + int dav1d_submit_frame(Dav1dContext *c); + +-#endif /* __DAV1D_SRC_DECODE_H__ */ ++#endif /* DAV1D_SRC_DECODE_H */ +diff --git third_party/dav1d/src/dequant_tables.h third_party/dav1d/src/dequant_tables.h +index 82b4fdc9ff23..66bb3b53a4d2 100644 +--- third_party/dav1d/src/dequant_tables.h ++++ third_party/dav1d/src/dequant_tables.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_DEQUANT_TABLES_H__ +-#define __DAV1D_SRC_DEQUANT_TABLES_H__ ++#ifndef DAV1D_SRC_DEQUANT_TABLES_H ++#define DAV1D_SRC_DEQUANT_TABLES_H + + #include + +@@ -34,4 +34,4 @@ + + extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2]; + +-#endif /* __DAV1D_SRC_DEQUANT_TABLES_H__ */ ++#endif /* DAV1D_SRC_DEQUANT_TABLES_H */ +diff --git third_party/dav1d/src/env.h third_party/dav1d/src/env.h +index b048100d00b5..2d4cc267df7c 100644 +--- third_party/dav1d/src/env.h ++++ third_party/dav1d/src/env.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_ENV_H__ +-#define __DAV1D_SRC_ENV_H__ ++#ifndef DAV1D_SRC_ENV_H ++#define DAV1D_SRC_ENV_H + + #include + #include +@@ -602,8 +602,8 @@ static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim, + } + #undef MERGE_CTX + +- const int max = imin(la | ll, 4); +- const int min = imin(imin(la, ll), 4); ++ const int max = imin((int) (la | ll), 4); ++ const int min = imin(imin((int) la, (int) ll), 4); + + return skip_contexts[min][max]; + } +@@ -754,4 +754,4 @@ static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv, + } + } + +-#endif /* __DAV1D_SRC_ENV_H__ */ ++#endif /* DAV1D_SRC_ENV_H */ +diff --git third_party/dav1d/src/ext/x86/x86inc.asm third_party/dav1d/src/ext/x86/x86inc.asm +index b249f2a792dd..8d39aa77474c 100644 +--- third_party/dav1d/src/ext/x86/x86inc.asm ++++ third_party/dav1d/src/ext/x86/x86inc.asm +@@ -1,7 +1,7 @@ + ;***************************************************************************** + ;* x86inc.asm: x264asm abstraction layer + ;***************************************************************************** +-;* Copyright (C) 2005-2018 x264 project ++;* Copyright (C) 2005-2019 x264 project + ;* + ;* Authors: Loren Merritt + ;* Henrik Gramner +@@ -65,12 +65,19 @@ + %endif + + %define FORMAT_ELF 0 ++%define FORMAT_MACHO 0 + %ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 + %elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 + %elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 ++%elifidn __OUTPUT_FORMAT__,macho ++ %define FORMAT_MACHO 1 ++%elifidn __OUTPUT_FORMAT__,macho32 ++ %define FORMAT_MACHO 1 ++%elifidn __OUTPUT_FORMAT__,macho64 ++ %define FORMAT_MACHO 1 + %endif + + %ifdef PREFIX +@@ -98,8 +105,12 @@ + %define PIC 0 + %endif + ++%define HAVE_PRIVATE_EXTERN 1 + %ifdef __NASM_VER__ + %use smartalign ++ %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 ++ %define HAVE_PRIVATE_EXTERN 0 ++ %endif + %endif + + ; Macros to eliminate most code duplication between x86_32 and x86_64: +@@ -712,22 +723,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, + %endmacro + %macro cglobal_internal 2-3+ + annotate_function_size +- %if %1 +- %xdefine %%FUNCTION_PREFIX private_prefix +- %xdefine %%VISIBILITY hidden +- %else +- %xdefine %%FUNCTION_PREFIX public_prefix +- %xdefine %%VISIBILITY +- %endif + %ifndef cglobaled_%2 +- %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) ++ %if %1 ++ %xdefine %2 mangle(private_prefix %+ _ %+ %2) ++ %else ++ %xdefine %2 mangle(public_prefix %+ _ %+ %2) ++ %endif + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF +- global %2:function %%VISIBILITY ++ %if %1 ++ global %2:function hidden ++ %else ++ global %2:function ++ %endif ++ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 ++ global %2:private_extern + %else + global %2 + %endif +@@ -748,6 +762,8 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, + %macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function hidden ++ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN ++ global current_function %+ %1:private_extern + %else + global current_function %+ %1 + %endif +@@ -773,6 +789,8 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden ++ %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN ++ global %1:private_extern + %else + global %1 + 
%endif +@@ -817,19 +835,20 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, + %assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 + %assign cpuflags_sse42 (1<<11)| cpuflags_sse4 + %assign cpuflags_aesni (1<<12)| cpuflags_sse42 +-%assign cpuflags_avx (1<<13)| cpuflags_sse42 +-%assign cpuflags_xop (1<<14)| cpuflags_avx +-%assign cpuflags_fma4 (1<<15)| cpuflags_avx +-%assign cpuflags_fma3 (1<<16)| cpuflags_avx +-%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +-%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +-%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +-%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL +- +-%assign cpuflags_cache32 (1<<21) +-%assign cpuflags_cache64 (1<<22) +-%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +-%assign cpuflags_atom (1<<24) ++%assign cpuflags_gfni (1<<13)| cpuflags_sse42 ++%assign cpuflags_avx (1<<14)| cpuflags_sse42 ++%assign cpuflags_xop (1<<15)| cpuflags_avx ++%assign cpuflags_fma4 (1<<16)| cpuflags_avx ++%assign cpuflags_fma3 (1<<17)| cpuflags_avx ++%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt ++%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1 ++%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2 ++%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL ++ ++%assign cpuflags_cache32 (1<<22) ++%assign cpuflags_cache64 (1<<23) ++%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant ++%assign cpuflags_atom (1<<25) + + ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. + %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +@@ -1221,8 +1240,16 @@ INIT_XMM + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function +- %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 ++ %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) + %error use of ``%1'' sse2 instruction in cpuname function: current_function ++ %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) ++ %error use of ``%1'' avx2 instruction in cpuname function: current_function ++ %elifidn %1, pextrw ; special case because the base instruction is mmx2, ++ %ifnid %6 ; but sse4 is required for memory operands ++ %if notcpuflag(sse4) ++ %error use of ``%1'' sse4 instruction in cpuname function: current_function ++ %endif ++ %endif + %endif + %endif + %endif +@@ -1384,38 +1411,41 @@ AVX_INSTR cmpunordpd, sse2, 1, 0, 1 + AVX_INSTR cmpunordps, sse, 1, 0, 1 + AVX_INSTR cmpunordsd, sse2, 1, 0, 0 + AVX_INSTR cmpunordss, sse, 1, 0, 0 +-AVX_INSTR comisd, sse2 +-AVX_INSTR comiss, sse +-AVX_INSTR cvtdq2pd, sse2 +-AVX_INSTR cvtdq2ps, sse2 +-AVX_INSTR cvtpd2dq, sse2 +-AVX_INSTR cvtpd2ps, sse2 +-AVX_INSTR cvtps2dq, sse2 +-AVX_INSTR cvtps2pd, sse2 +-AVX_INSTR cvtsd2si, sse2 ++AVX_INSTR comisd, sse2, 1 ++AVX_INSTR comiss, sse, 1 ++AVX_INSTR cvtdq2pd, sse2, 1 ++AVX_INSTR cvtdq2ps, sse2, 1 ++AVX_INSTR cvtpd2dq, sse2, 1 ++AVX_INSTR cvtpd2ps, sse2, 1 ++AVX_INSTR cvtps2dq, sse2, 1 ++AVX_INSTR cvtps2pd, sse2, 1 ++AVX_INSTR cvtsd2si, sse2, 1 + AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 + AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 + AVX_INSTR cvtsi2ss, sse, 1, 0, 0 + AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +-AVX_INSTR cvtss2si, sse +-AVX_INSTR cvttpd2dq, sse2 +-AVX_INSTR cvttps2dq, sse2 +-AVX_INSTR cvttsd2si, sse2 +-AVX_INSTR cvttss2si, sse ++AVX_INSTR cvtss2si, sse, 1 ++AVX_INSTR cvttpd2dq, sse2, 1 ++AVX_INSTR cvttps2dq, sse2, 1 ++AVX_INSTR cvttsd2si, sse2, 1 
++AVX_INSTR cvttss2si, sse, 1 + AVX_INSTR divpd, sse2, 1, 0, 0 + AVX_INSTR divps, sse, 1, 0, 0 + AVX_INSTR divsd, sse2, 1, 0, 0 + AVX_INSTR divss, sse, 1, 0, 0 + AVX_INSTR dppd, sse4, 1, 1, 0 + AVX_INSTR dpps, sse4, 1, 1, 0 +-AVX_INSTR extractps, sse4 ++AVX_INSTR extractps, sse4, 1 ++AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 ++AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 ++AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 + AVX_INSTR haddpd, sse3, 1, 0, 0 + AVX_INSTR haddps, sse3, 1, 0, 0 + AVX_INSTR hsubpd, sse3, 1, 0, 0 + AVX_INSTR hsubps, sse3, 1, 0, 0 + AVX_INSTR insertps, sse4, 1, 1, 0 + AVX_INSTR lddqu, sse3 +-AVX_INSTR ldmxcsr, sse ++AVX_INSTR ldmxcsr, sse, 1 + AVX_INSTR maskmovdqu, sse2 + AVX_INSTR maxpd, sse2, 1, 0, 1 + AVX_INSTR maxps, sse, 1, 0, 1 +@@ -1425,10 +1455,10 @@ AVX_INSTR minpd, sse2, 1, 0, 1 + AVX_INSTR minps, sse, 1, 0, 1 + AVX_INSTR minsd, sse2, 1, 0, 0 + AVX_INSTR minss, sse, 1, 0, 0 +-AVX_INSTR movapd, sse2 +-AVX_INSTR movaps, sse ++AVX_INSTR movapd, sse2, 1 ++AVX_INSTR movaps, sse, 1 + AVX_INSTR movd, mmx +-AVX_INSTR movddup, sse3 ++AVX_INSTR movddup, sse3, 1 + AVX_INSTR movdqa, sse2 + AVX_INSTR movdqu, sse2 + AVX_INSTR movhlps, sse, 1, 0, 0 +@@ -1437,19 +1467,19 @@ AVX_INSTR movhps, sse, 1, 0, 0 + AVX_INSTR movlhps, sse, 1, 0, 0 + AVX_INSTR movlpd, sse2, 1, 0, 0 + AVX_INSTR movlps, sse, 1, 0, 0 +-AVX_INSTR movmskpd, sse2 +-AVX_INSTR movmskps, sse ++AVX_INSTR movmskpd, sse2, 1 ++AVX_INSTR movmskps, sse, 1 + AVX_INSTR movntdq, sse2 + AVX_INSTR movntdqa, sse4 +-AVX_INSTR movntpd, sse2 +-AVX_INSTR movntps, sse ++AVX_INSTR movntpd, sse2, 1 ++AVX_INSTR movntps, sse, 1 + AVX_INSTR movq, mmx + AVX_INSTR movsd, sse2, 1, 0, 0 +-AVX_INSTR movshdup, sse3 +-AVX_INSTR movsldup, sse3 ++AVX_INSTR movshdup, sse3, 1 ++AVX_INSTR movsldup, sse3, 1 + AVX_INSTR movss, sse, 1, 0, 0 +-AVX_INSTR movupd, sse2 +-AVX_INSTR movups, sse ++AVX_INSTR movupd, sse2, 1 ++AVX_INSTR movups, sse, 1 + AVX_INSTR mpsadbw, sse4, 0, 1, 0 + AVX_INSTR mulpd, sse2, 1, 0, 1 + AVX_INSTR mulps, sse, 1, 0, 1 +@@ -1582,27 +1612,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0 + AVX_INSTR punpckldq, mmx, 0, 0, 0 + AVX_INSTR punpcklqdq, sse2, 0, 0, 0 + AVX_INSTR pxor, mmx, 0, 0, 1 +-AVX_INSTR rcpps, sse ++AVX_INSTR rcpps, sse, 1 + AVX_INSTR rcpss, sse, 1, 0, 0 +-AVX_INSTR roundpd, sse4 +-AVX_INSTR roundps, sse4 ++AVX_INSTR roundpd, sse4, 1 ++AVX_INSTR roundps, sse4, 1 + AVX_INSTR roundsd, sse4, 1, 1, 0 + AVX_INSTR roundss, sse4, 1, 1, 0 +-AVX_INSTR rsqrtps, sse ++AVX_INSTR rsqrtps, sse, 1 + AVX_INSTR rsqrtss, sse, 1, 0, 0 + AVX_INSTR shufpd, sse2, 1, 1, 0 + AVX_INSTR shufps, sse, 1, 1, 0 +-AVX_INSTR sqrtpd, sse2 +-AVX_INSTR sqrtps, sse ++AVX_INSTR sqrtpd, sse2, 1 ++AVX_INSTR sqrtps, sse, 1 + AVX_INSTR sqrtsd, sse2, 1, 0, 0 + AVX_INSTR sqrtss, sse, 1, 0, 0 +-AVX_INSTR stmxcsr, sse ++AVX_INSTR stmxcsr, sse, 1 + AVX_INSTR subpd, sse2, 1, 0, 0 + AVX_INSTR subps, sse, 1, 0, 0 + AVX_INSTR subsd, sse2, 1, 0, 0 + AVX_INSTR subss, sse, 1, 0, 0 +-AVX_INSTR ucomisd, sse2 +-AVX_INSTR ucomiss, sse ++AVX_INSTR ucomisd, sse2, 1 ++AVX_INSTR ucomiss, sse, 1 + AVX_INSTR unpckhpd, sse2, 1, 0, 0 + AVX_INSTR unpckhps, sse, 1, 0, 0 + AVX_INSTR unpcklpd, sse2, 1, 0, 0 +@@ -1615,6 +1645,37 @@ AVX_INSTR pfadd, 3dnow, 1, 0, 1 + AVX_INSTR pfsub, 3dnow, 1, 0, 0 + AVX_INSTR pfmul, 3dnow, 1, 0, 1 + ++;%1 == instruction ++;%2 == minimal instruction set ++%macro GPR_INSTR 2 ++ %macro %1 2-5 fnord, %1, %2 ++ %ifdef cpuname ++ %if notcpuflag(%5) ++ %error use of ``%4'' %5 instruction in cpuname function: current_function ++ %endif ++ %endif ++ %ifidn %3, fnord ++ %4 %1, %2 ++ 
%else ++ %4 %1, %2, %3 ++ %endif ++ %endmacro ++%endmacro ++ ++GPR_INSTR andn, bmi1 ++GPR_INSTR bextr, bmi1 ++GPR_INSTR blsi, bmi1 ++GPR_INSTR blsmsk, bmi1 ++GPR_INSTR bzhi, bmi2 ++GPR_INSTR mulx, bmi2 ++GPR_INSTR pdep, bmi2 ++GPR_INSTR pext, bmi2 ++GPR_INSTR popcnt, sse42 ++GPR_INSTR rorx, bmi2 ++GPR_INSTR sarx, bmi2 ++GPR_INSTR shlx, bmi2 ++GPR_INSTR shrx, bmi2 ++ + ; base-4 constants for shuffles + %assign i 0 + %rep 256 +diff --git third_party/dav1d/src/film_grain.h third_party/dav1d/src/film_grain.h +index 1519f12fde76..61e61d3937b8 100644 +--- third_party/dav1d/src/film_grain.h ++++ third_party/dav1d/src/film_grain.h +@@ -25,12 +25,12 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_FILM_GRAIN_H__ +-#define __DAV1D_SRC_FILM_GRAIN_H__ ++#ifndef DAV1D_SRC_FILM_GRAIN_H ++#define DAV1D_SRC_FILM_GRAIN_H + + #include "dav1d/dav1d.h" + + bitfn_decls(void dav1d_apply_grain, Dav1dPicture *const out, + const Dav1dPicture *const in); + +-#endif /* __DAV1D_SRC_FILM_GRAIN_H__ */ ++#endif /* DAV1D_SRC_FILM_GRAIN_H */ +diff --git third_party/dav1d/src/getbits.c third_party/dav1d/src/getbits.c +index 0a34601fa9c6..770fd33cf1ea 100644 +--- third_party/dav1d/src/getbits.c ++++ third_party/dav1d/src/getbits.c +@@ -70,7 +70,7 @@ unsigned dav1d_get_bits(GetBits *const c, const unsigned n) { + c->bits_left -= n; + c->state <<= n; + +- return state >> (64 - n); ++ return (unsigned) (state >> (64 - n)); + } + + int dav1d_get_sbits(GetBits *const c, const unsigned n) { +@@ -79,6 +79,27 @@ int dav1d_get_sbits(GetBits *const c, const unsigned n) { + return res >> shift; + } + ++unsigned dav1d_get_uleb128(GetBits *c) { ++ unsigned val = 0, more, i = 0; ++ ++ do { ++ more = dav1d_get_bits(c, 1); ++ unsigned bits = dav1d_get_bits(c, 7); ++ if (i <= 3 || (i == 4 && bits < (1 << 4))) ++ val |= bits << (i * 7); ++ else if (bits) { ++ c->error = 1; ++ return 0; ++ } ++ if (more && ++i == 8) { ++ c->error = 1; ++ return 0; ++ } ++ } while (more); ++ ++ return val; ++} ++ + unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { + // Output in range [0..max-1] + // max must be > 1, or else nothing is read from the bitstream +diff --git third_party/dav1d/src/getbits.h third_party/dav1d/src/getbits.h +index d96810ae5334..fc382148b2e0 100644 +--- third_party/dav1d/src/getbits.h ++++ third_party/dav1d/src/getbits.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_GETBITS_H__ +-#define __DAV1D_SRC_GETBITS_H__ ++#ifndef DAV1D_SRC_GETBITS_H ++#define DAV1D_SRC_GETBITS_H + + #include + #include +@@ -41,6 +41,7 @@ typedef struct GetBits { + void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz); + unsigned dav1d_get_bits(GetBits *c, unsigned n); + int dav1d_get_sbits(GetBits *c, unsigned n); ++unsigned dav1d_get_uleb128(GetBits *c); + + // Output in range 0..max-1 + unsigned dav1d_get_uniform(GetBits *c, unsigned max); +@@ -52,7 +53,7 @@ void dav1d_bytealign_get_bits(GetBits *c); + + // Return the current bit position relative to the start of the buffer. 
+ static inline unsigned dav1d_get_bits_pos(const GetBits *c) { +- return (c->ptr - c->ptr_start) * 8 - c->bits_left; ++ return (unsigned) (c->ptr - c->ptr_start) * 8 - c->bits_left; + } + +-#endif /* __DAV1D_SRC_GETBITS_H__ */ ++#endif /* DAV1D_SRC_GETBITS_H */ +diff --git third_party/dav1d/src/internal.h third_party/dav1d/src/internal.h +index 728cea18ed42..f0fe5efed899 100644 +--- third_party/dav1d/src/internal.h ++++ third_party/dav1d/src/internal.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_INTERNAL_H__ +-#define __DAV1D_SRC_INTERNAL_H__ ++#ifndef DAV1D_SRC_INTERNAL_H ++#define DAV1D_SRC_INTERNAL_H + + #include + +@@ -65,16 +65,19 @@ typedef struct Dav1dDSPContext { + Dav1dLoopRestorationDSPContext lr; + } Dav1dDSPContext; + ++struct Dav1dTileGroup { ++ Dav1dData data; ++ int start, end; ++}; ++ + struct Dav1dContext { + Dav1dFrameContext *fc; + unsigned n_fc; + + // cache of OBUs that make up a single frame before we submit them + // to a frame worker to be decoded +- struct { +- Dav1dData data; +- int start, end; +- } tile[256]; ++ struct Dav1dTileGroup *tile; ++ int n_tile_data_alloc; + int n_tile_data; + int n_tiles; + Dav1dRef *seq_hdr_ref; +@@ -82,6 +85,11 @@ struct Dav1dContext { + Dav1dRef *frame_hdr_ref; + Dav1dFrameHeader *frame_hdr; + ++ Dav1dRef *content_light_ref; ++ Dav1dContentLightLevel *content_light; ++ Dav1dRef *mastering_display_ref; ++ Dav1dMasteringDisplay *mastering_display; ++ + // decoded output picture queue + Dav1dData in; + Dav1dPicture out; +@@ -120,6 +128,8 @@ struct Dav1dContext { + unsigned operating_point_idc; + int all_layers; + int drain; ++ ++ Dav1dLogger logger; + }; + + struct Dav1dFrameContext { +@@ -139,10 +149,8 @@ struct Dav1dFrameContext { + unsigned refpoc[7], refrefpoc[7][7]; + uint8_t gmv_warp_allowed[7]; + CdfThreadContext in_cdf, out_cdf; +- struct { +- Dav1dData data; +- int start, end; +- } tile[256]; ++ struct Dav1dTileGroup *tile; ++ int n_tile_data_alloc; + int n_tile_data; + + // for scalable references +@@ -227,6 +235,7 @@ struct Dav1dFrameContext { + int tasks_left, num_tasks; + int (*task_idx_to_sby_and_tile_idx)[2]; + int titsati_sz, titsati_init[3]; ++ int inited; + } tile_thread; + }; + +@@ -270,7 +279,7 @@ struct Dav1dTileContext { + // FIXME types can be changed to pixel (and dynamically allocated) + // which would make copy/assign operations slightly faster? + uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; +- uint16_t pal[3 /* plane */][8 /* palette_idx */]; ++ ALIGN(uint16_t pal[3 /* plane */][8 /* palette_idx */], 16); + uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */]; + uint8_t txtp_map[32 * 32]; // inter-only + Dav1dWarpedMotionParams warpmv; +@@ -297,4 +306,4 @@ struct Dav1dTileContext { + } tile_thread; + }; + +-#endif /* __DAV1D_SRC_INTERNAL_H__ */ ++#endif /* DAV1D_SRC_INTERNAL_H */ +diff --git third_party/dav1d/src/intra_edge.h third_party/dav1d/src/intra_edge.h +index 8b0c6eebd536..8b4e15018188 100644 +--- third_party/dav1d/src/intra_edge.h ++++ third_party/dav1d/src/intra_edge.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_INTRA_EDGE_H__ +-#define __DAV1D_SRC_INTRA_EDGE_H__ ++#ifndef DAV1D_SRC_INTRA_EDGE_H ++#define DAV1D_SRC_INTRA_EDGE_H + + enum EdgeFlags { + EDGE_I444_TOP_HAS_RIGHT = 1 << 0, +@@ -54,4 +54,4 @@ typedef struct EdgeBranch { + void dav1d_init_mode_tree(EdgeNode *const root, EdgeTip *const nt, + const int allow_sb128); + +-#endif /* __DAV1D_SRC_INTRA_EDGE_H__ */ ++#endif /* DAV1D_SRC_INTRA_EDGE_H */ +diff --git third_party/dav1d/src/ipred.h third_party/dav1d/src/ipred.h +index 50fa8261d131..c980c22c5477 100644 +--- third_party/dav1d/src/ipred.h ++++ third_party/dav1d/src/ipred.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_IPRED_H__ +-#define __DAV1D_SRC_IPRED_H__ ++#ifndef DAV1D_SRC_IPRED_H ++#define DAV1D_SRC_IPRED_H + + #include + +@@ -91,4 +91,4 @@ typedef struct Dav1dIntraPredDSPContext { + bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c); + bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c); + +-#endif /* __DAV1D_SRC_IPRED_H__ */ ++#endif /* DAV1D_SRC_IPRED_H */ +diff --git third_party/dav1d/src/ipred_prepare.h third_party/dav1d/src/ipred_prepare.h +index e46e4d820b95..dbc88e5c95a4 100644 +--- third_party/dav1d/src/ipred_prepare.h ++++ third_party/dav1d/src/ipred_prepare.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_IPRED_PREPARE_H__ +-#define __DAV1D_SRC_IPRED_PREPARE_H__ ++#ifndef DAV1D_SRC_IPRED_PREPARE_H ++#define DAV1D_SRC_IPRED_PREPARE_H + + #include + #include +@@ -81,8 +81,8 @@ enum IntraPredMode + const pixel *dst, ptrdiff_t stride, + const pixel *prefilter_toplevel_sb_edge, + enum IntraPredMode mode, int *angle, +- int tw, int th, pixel *topleft_out +- HIGHBD_DECL_SUFFIX); ++ int tw, int th, int filter_edge, ++ pixel *topleft_out HIGHBD_DECL_SUFFIX); + + // These flags are OR'd with the angle argument into intra predictors. + // ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved +@@ -105,4 +105,4 @@ static inline int sm_uv_flag(const BlockContext *const b, const int idx) { + m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0; + } + +-#endif /* __DAV1D_SRC_IPRED_PREPARE_H__ */ ++#endif /* DAV1D_SRC_IPRED_PREPARE_H */ +diff --git third_party/dav1d/src/ipred_prepare_tmpl.c third_party/dav1d/src/ipred_prepare_tmpl.c +index 88d3cdd9b9da..fb4c74a099ee 100644 +--- third_party/dav1d/src/ipred_prepare_tmpl.c ++++ third_party/dav1d/src/ipred_prepare_tmpl.c +@@ -82,7 +82,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left, + const ptrdiff_t stride, + const pixel *prefilter_toplevel_sb_edge, + enum IntraPredMode mode, int *const angle, +- const int tw, const int th, ++ const int tw, const int th, const int filter_edge, + pixel *const topleft_out HIGHBD_DECL_SUFFIX) + { + const int bitdepth = bitdepth_from_max(bitdepth_max); +@@ -201,7 +201,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left, + } else { + *topleft_out = have_top ? 
*dst_top : (1 << bitdepth) >> 1; + } +- if (mode == Z2_PRED && tw + th >= 6) ++ if (mode == Z2_PRED && tw + th >= 6 && filter_edge) + *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 + + topleft_out[1] * 5 + 8) >> 4; + } +diff --git third_party/dav1d/src/ipred_tmpl.c third_party/dav1d/src/ipred_tmpl.c +index dff3ec92bef4..a6eb999da359 100644 +--- third_party/dav1d/src/ipred_tmpl.c ++++ third_party/dav1d/src/ipred_tmpl.c +@@ -719,7 +719,7 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride, + { + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) +- dst[x] = pal[idx[x]]; ++ dst[x] = (pixel) pal[idx[x]]; + idx += w; + dst += PXSTRIDE(stride); + } +diff --git third_party/dav1d/src/itx.h third_party/dav1d/src/itx.h +index 9be1dd579883..ddb47aed0372 100644 +--- third_party/dav1d/src/itx.h ++++ third_party/dav1d/src/itx.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_ITX_H__ +-#define __DAV1D_SRC_ITX_H__ ++#ifndef DAV1D_SRC_ITX_H ++#define DAV1D_SRC_ITX_H + + #include + +@@ -46,4 +46,4 @@ typedef struct Dav1dInvTxfmDSPContext { + bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c); + bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c); + +-#endif /* __DAV1D_SRC_ITX_H__ */ ++#endif /* DAV1D_SRC_ITX_H */ +diff --git third_party/dav1d/src/itx_tmpl.c third_party/dav1d/src/itx_tmpl.c +index bec9b1b3e149..0b1bf1c4cc1f 100644 +--- third_party/dav1d/src/itx_tmpl.c ++++ third_party/dav1d/src/itx_tmpl.c +@@ -46,21 +46,40 @@ static void NOINLINE + inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, + coef *const coeff, const int eob, + const int w, const int h, const int shift1, const int shift2, +- const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn +- HIGHBD_DECL_SUFFIX) ++ const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn, ++ const int has_dconly HIGHBD_DECL_SUFFIX) + { + int i, j; +- const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32); + assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64)); +- // Maximum value for h and w is 64 +- coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */]; + const int is_rect2 = w * 2 == h || h * 2 == w; + const int bitdepth = bitdepth_from_max(bitdepth_max); ++ const int rnd1 = (1 << shift1) >> 1; ++ const int rnd2 = (1 << shift2) >> 1; ++ ++ if (has_dconly && eob == 0) { ++ int dc = coeff[0]; ++ coeff[0] = 0; ++ if (is_rect2) ++ dc = (dc * 2896 + 2048) >> 12; ++ dc = (dc * 2896 + 2048) >> 12; ++ dc = (dc + rnd1) >> shift1; ++ dc = (dc * 2896 + 2048) >> 12; ++ dc = (dc + rnd2) >> shift2; ++ for (j = 0; j < h; j++) ++ for (i = 0; i < w; i++) ++ dst[i + j * PXSTRIDE(stride)] = ++ iclip_pixel(dst[i + j * PXSTRIDE(stride)] + dc); ++ return; ++ } ++ assert(eob > 0 || (eob == 0 && !has_dconly)); ++ ++ const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32); ++ // Maximum value for h and w is 64 ++ coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */]; + const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1; + const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1; + + if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem)); +- const int rnd1 = (1 << shift1) >> 1; + for (i = 0; i < sh; i++) { + if (w != sw || is_rect2) { + for (j = 0; j < sw; j++) { +@@ -82,7 +101,6 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, + } + + if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp)); +- const int rnd2 = (1 << shift2) >> 1; + for (i = 0; i < w; i++) { + second_1d_fn(&tmp[i], w, out, 1, col_clip_max); + for (j = 
0; j < h; j++) +@@ -93,7 +111,7 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride, + memset(coeff, 0, sizeof(*coeff) * sh * sw); + } + +-#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \ ++#define inv_txfm_fn(type1, type2, w, h, shift1, shift2, has_dconly) \ + static void \ + inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ + const ptrdiff_t stride, \ +@@ -102,36 +120,36 @@ inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \ + HIGHBD_DECL_SUFFIX) \ + { \ + inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \ +- inv_##type1##w##_1d, inv_##type2##h##_1d \ ++ inv_##type1##w##_1d, inv_##type2##h##_1d, has_dconly \ + HIGHBD_TAIL_SUFFIX); \ + } + + #define inv_txfm_fn64(w, h, shift1, shift2) \ +-inv_txfm_fn(dct, dct, w, h, shift1, shift2) ++inv_txfm_fn(dct, dct, w, h, shift1, shift2, 1) + + #define inv_txfm_fn32(w, h, shift1, shift2) \ + inv_txfm_fn64(w, h, shift1, shift2) \ +-inv_txfm_fn(identity, identity, w, h, shift1, shift2) ++inv_txfm_fn(identity, identity, w, h, shift1, shift2, 0) + + #define inv_txfm_fn16(w, h, shift1, shift2) \ + inv_txfm_fn32(w, h, shift1, shift2) \ +-inv_txfm_fn(adst, dct, w, h, shift1, shift2) \ +-inv_txfm_fn(dct, adst, w, h, shift1, shift2) \ +-inv_txfm_fn(adst, adst, w, h, shift1, shift2) \ +-inv_txfm_fn(dct, flipadst, w, h, shift1, shift2) \ +-inv_txfm_fn(flipadst, dct, w, h, shift1, shift2) \ +-inv_txfm_fn(adst, flipadst, w, h, shift1, shift2) \ +-inv_txfm_fn(flipadst, adst, w, h, shift1, shift2) \ +-inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \ +-inv_txfm_fn(identity, dct, w, h, shift1, shift2) \ +-inv_txfm_fn(dct, identity, w, h, shift1, shift2) \ ++inv_txfm_fn(adst, dct, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(dct, adst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(adst, adst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(dct, flipadst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(flipadst, dct, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(adst, flipadst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(flipadst, adst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(identity, dct, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(dct, identity, w, h, shift1, shift2, 0) \ + + #define inv_txfm_fn84(w, h, shift1, shift2) \ + inv_txfm_fn16(w, h, shift1, shift2) \ +-inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \ +-inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \ +-inv_txfm_fn(identity, adst, w, h, shift1, shift2) \ +-inv_txfm_fn(adst, identity, w, h, shift1, shift2) \ ++inv_txfm_fn(identity, flipadst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(flipadst, identity, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(identity, adst, w, h, shift1, shift2, 0) \ ++inv_txfm_fn(adst, identity, w, h, shift1, shift2, 0) \ + + inv_txfm_fn84( 4, 4, 0, 4) + inv_txfm_fn84( 4, 8, 0, 4) +diff --git third_party/dav1d/src/levels.h third_party/dav1d/src/levels.h +index 5f762f024138..58d8233188a7 100644 +--- third_party/dav1d/src/levels.h ++++ third_party/dav1d/src/levels.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_LEVELS_H__ +-#define __DAV1D_SRC_LEVELS_H__ ++#ifndef DAV1D_SRC_LEVELS_H ++#define DAV1D_SRC_LEVELS_H + + #include + +@@ -43,6 +43,14 @@ enum ObuType { + OBU_PADDING = 15, + }; + ++enum ObuMetaType { ++ OBU_META_HDR_CLL = 1, ++ OBU_META_HDR_MDCV = 2, ++ OBU_META_SCALABILITY = 3, ++ OBU_META_ITUT_T35 = 4, ++ OBU_META_TIMECODE = 5, ++}; ++ + enum TxfmSize { + TX_4X4, + TX_8X8, +@@ -281,4 +289,4 @@ typedef struct Av1Block { + }; + } Av1Block; + +-#endif /* __DAV1D_SRC_LEVELS_H__ */ ++#endif /* DAV1D_SRC_LEVELS_H */ +diff --git third_party/dav1d/src/lf_apply.h third_party/dav1d/src/lf_apply.h +index 580aafb3649c..6b63b62a49c5 100644 +--- third_party/dav1d/src/lf_apply.h ++++ third_party/dav1d/src/lf_apply.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_LF_APPLY_H__ +-#define __DAV1D_SRC_LF_APPLY_H__ ++#ifndef DAV1D_SRC_LF_APPLY_H ++#define DAV1D_SRC_LF_APPLY_H + + #include + +@@ -39,4 +39,4 @@ void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *f, + pixel *const p[3], Av1Filter *lflvl, + int sby, int start_of_tile_row); + +-#endif /* __DAV1D_SRC_LF_APPLY_H__ */ ++#endif /* DAV1D_SRC_LF_APPLY_H */ +diff --git third_party/dav1d/src/lf_mask.c third_party/dav1d/src/lf_mask.c +index b5b3741f6122..b9dd7c3dba99 100644 +--- third_party/dav1d/src/lf_mask.c ++++ third_party/dav1d/src/lf_mask.c +@@ -177,7 +177,7 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2], + // inner (tx) left|right edges + const int hstep = t_dim->w; + unsigned t = 1U << by4; +- unsigned inner = (((uint64_t) t) << h4) - t; ++ unsigned inner = (unsigned) ((((uint64_t) t) << h4) - t); + unsigned inner1 = inner & 0xffff, inner2 = inner >> 16; + for (x = hstep; x < w4; x += hstep) { + if (inner1) masks[0][bx4 + x][twl4c][0] |= inner1; +@@ -189,7 +189,7 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2], + // bottom + const int vstep = t_dim->h; + t = 1U << bx4; +- inner = (((uint64_t) t) << w4) - t; ++ inner = (unsigned) ((((uint64_t) t) << w4) - t); + inner1 = inner & 0xffff; + inner2 = inner >> 16; + for (y = vstep; y < h4; y += vstep) { +@@ -248,7 +248,7 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2], + // inner (tx) left|right edges + const int hstep = t_dim->w; + unsigned t = 1U << cby4; +- unsigned inner = (((uint64_t) t) << ch4) - t; ++ unsigned inner = (unsigned) ((((uint64_t) t) << ch4) - t); + unsigned inner1 = inner & ((1 << vmask) - 1), inner2 = inner >> vmask; + for (x = hstep; x < cw4; x += hstep) { + if (inner1) masks[0][cbx4 + x][twl4c][0] |= inner1; +@@ -260,7 +260,7 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2], + // bottom + const int vstep = t_dim->h; + t = 1U << cbx4; +- inner = (((uint64_t) t) << cw4) - t; ++ inner = (unsigned) ((((uint64_t) t) << cw4) - t); + inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask; + for (y = vstep; y < ch4; y += vstep) { + if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1; +diff --git third_party/dav1d/src/lf_mask.h third_party/dav1d/src/lf_mask.h +index 32124ecaa9ad..39574813f547 100644 +--- third_party/dav1d/src/lf_mask.h ++++ third_party/dav1d/src/lf_mask.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_LF_MASK_H__ +-#define __DAV1D_SRC_LF_MASK_H__ ++#ifndef DAV1D_SRC_LF_MASK_H ++#define DAV1D_SRC_LF_MASK_H + + #include + #include +@@ -82,4 +82,4 @@ void dav1d_calc_eih(Av1FilterLUT *lim_lut, int filter_sharpness); + void dav1d_calc_lf_values(uint8_t (*values)[4][8][2], const Dav1dFrameHeader *hdr, + const int8_t lf_delta[4]); + +-#endif /* __DAV1D_SRC_LF_MASK_H__ */ ++#endif /* DAV1D_SRC_LF_MASK_H */ +diff --git third_party/dav1d/src/lib.c third_party/dav1d/src/lib.c +index 01ff06d2638f..0088024db1fa 100644 +--- third_party/dav1d/src/lib.c ++++ third_party/dav1d/src/lib.c +@@ -26,7 +26,7 @@ + */ + + #include "config.h" +-#include "version.h" ++#include "vcs_version.h" + + #include + #include +@@ -38,6 +38,7 @@ + #include "common/validate.h" + + #include "src/internal.h" ++#include "src/log.h" + #include "src/obu.h" + #include "src/qm.h" + #include "src/ref.h" +@@ -62,10 +63,14 @@ void dav1d_default_settings(Dav1dSettings *const s) { + s->allocator.cookie = NULL; + s->allocator.alloc_picture_callback = default_picture_allocator; + s->allocator.release_picture_callback = default_picture_release; ++ s->logger.cookie = NULL; ++ s->logger.callback = dav1d_log_default_callback; + s->operating_point = 0; + s->all_layers = 1; // just until the tests are adjusted + } + ++static void close_internal(Dav1dContext **const c_out, int flush); ++ + int dav1d_open(Dav1dContext **const c_out, + const Dav1dSettings *const s) + { +@@ -90,6 +95,7 @@ int dav1d_open(Dav1dContext **const c_out, + memset(c, 0, sizeof(*c)); + + c->allocator = s->allocator; ++ c->logger = s->logger; + c->apply_grain = s->apply_grain; + c->operating_point = s->operating_point; + c->all_layers = s->all_layers; +@@ -115,9 +121,17 @@ int dav1d_open(Dav1dContext **const c_out, + if (!f->tc) goto error; + memset(f->tc, 0, sizeof(*f->tc) * s->n_tile_threads); + if (f->n_tc > 1) { +- pthread_mutex_init(&f->tile_thread.lock, NULL); +- pthread_cond_init(&f->tile_thread.cond, NULL); +- pthread_cond_init(&f->tile_thread.icond, NULL); ++ if (pthread_mutex_init(&f->tile_thread.lock, NULL)) goto error; ++ if (pthread_cond_init(&f->tile_thread.cond, NULL)) { ++ pthread_mutex_destroy(&f->tile_thread.lock); ++ goto error; ++ } ++ if (pthread_cond_init(&f->tile_thread.icond, NULL)) { ++ pthread_mutex_destroy(&f->tile_thread.lock); ++ pthread_cond_destroy(&f->tile_thread.cond); ++ goto error; ++ } ++ f->tile_thread.inited = 1; + } + for (int m = 0; m < s->n_tile_threads; m++) { + Dav1dTileContext *const t = &f->tc[m]; +@@ -131,18 +145,34 @@ int dav1d_open(Dav1dContext **const c_out, + dav1d_alloc_aligned(320 * (256 + 7) * sizeof(uint16_t), 32); + if (!t->emu_edge) goto error; + if (f->n_tc > 1) { +- pthread_mutex_init(&t->tile_thread.td.lock, NULL); +- pthread_cond_init(&t->tile_thread.td.cond, NULL); ++ if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error; ++ if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) { ++ pthread_mutex_destroy(&t->tile_thread.td.lock); ++ goto error; ++ } + t->tile_thread.fttd = &f->tile_thread; +- pthread_create(&t->tile_thread.td.thread, NULL, dav1d_tile_task, t); ++ if (pthread_create(&t->tile_thread.td.thread, NULL, dav1d_tile_task, t)) { ++ pthread_cond_destroy(&t->tile_thread.td.cond); ++ pthread_mutex_destroy(&t->tile_thread.td.lock); ++ goto error; ++ } ++ t->tile_thread.td.inited = 1; + } + } + f->libaom_cm = av1_alloc_ref_mv_common(); + if (!f->libaom_cm) goto error; + if (c->n_fc > 1) { +- pthread_mutex_init(&f->frame_thread.td.lock, NULL); +- 
pthread_cond_init(&f->frame_thread.td.cond, NULL); +- pthread_create(&f->frame_thread.td.thread, NULL, dav1d_frame_task, f); ++ if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error; ++ if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) { ++ pthread_mutex_destroy(&f->frame_thread.td.lock); ++ goto error; ++ } ++ if (pthread_create(&f->frame_thread.td.thread, NULL, dav1d_frame_task, f)) { ++ pthread_cond_destroy(&f->frame_thread.td.cond); ++ pthread_mutex_destroy(&f->frame_thread.td.lock); ++ goto error; ++ } ++ f->frame_thread.td.inited = 1; + } + } + +@@ -155,26 +185,7 @@ int dav1d_open(Dav1dContext **const c_out, + return 0; + + error: +- if (c) { +- if (c->fc) { +- for (unsigned n = 0; n < c->n_fc; n++) { +- if (c->fc[n].tc) { +- for (int m = 0; m < s->n_tile_threads; m++) { +- Dav1dTileContext *const t = &c->fc[n].tc[m]; +- dav1d_free_aligned(t->cf); +- dav1d_free_aligned(t->scratch.mem); +- dav1d_free_aligned(t->emu_edge); +- } +- dav1d_free_aligned(c->fc[n].tc); +- } +- if (c->fc[n].libaom_cm) av1_free_ref_mv_common(c->fc[n].libaom_cm); +- } +- dav1d_free_aligned(c->fc); +- } +- if (c->n_fc > 1) free(c->frame_thread.out_delayed); +- dav1d_freep_aligned(c_out); +- } +- fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno)); ++ if (c) close_internal(c_out, 0); + return -ENOMEM; + } + +@@ -192,6 +203,7 @@ int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out, + + Dav1dSettings s; + dav1d_default_settings(&s); ++ s.logger.callback = NULL; + + Dav1dContext *c; + res = dav1d_open(&c, &s); +@@ -255,7 +267,7 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out, + } + + // Apply film grain to a new copy of the image to avoid corrupting refs +- int res = dav1d_picture_alloc_copy(out, in->p.w, in); ++ int res = dav1d_picture_alloc_copy(c, out, in->p.w, in); + if (res < 0) { + dav1d_picture_unref_internal(in); + dav1d_picture_unref_internal(out); +@@ -372,6 +384,22 @@ void dav1d_flush(Dav1dContext *const c) { + dav1d_data_unref_internal(&c->in); + c->drain = 0; + ++ for (int i = 0; i < 8; i++) { ++ if (c->refs[i].p.p.data[0]) ++ dav1d_thread_picture_unref(&c->refs[i].p); ++ dav1d_ref_dec(&c->refs[i].segmap); ++ dav1d_ref_dec(&c->refs[i].refmvs); ++ dav1d_cdf_thread_unref(&c->cdf[i]); ++ } ++ c->frame_hdr = NULL; ++ c->seq_hdr = NULL; ++ dav1d_ref_dec(&c->seq_hdr_ref); ++ ++ c->mastering_display = NULL; ++ c->content_light = NULL; ++ dav1d_ref_dec(&c->mastering_display_ref); ++ dav1d_ref_dec(&c->content_light_ref); ++ + if (c->n_fc == 1) return; + + // mark each currently-running frame as flushing, so that we +@@ -394,32 +422,25 @@ void dav1d_flush(Dav1dContext *const c) { + } + atomic_store(c->frame_thread.flush, 0); + +- for (int i = 0; i < 8; i++) { +- if (c->refs[i].p.p.data[0]) +- dav1d_thread_picture_unref(&c->refs[i].p); +- dav1d_ref_dec(&c->refs[i].segmap); +- dav1d_ref_dec(&c->refs[i].refmvs); +- dav1d_cdf_thread_unref(&c->cdf[i]); +- } +- c->frame_hdr = NULL; +- c->seq_hdr = NULL; +- dav1d_ref_dec(&c->seq_hdr_ref); +- + c->frame_thread.next = 0; + } + + void dav1d_close(Dav1dContext **const c_out) { + validate_input(c_out != NULL); ++ close_internal(c_out, 1); ++} + ++static void close_internal(Dav1dContext **const c_out, int flush) { + Dav1dContext *const c = *c_out; + if (!c) return; + +- dav1d_flush(c); +- for (unsigned n = 0; n < c->n_fc; n++) { ++ if (flush) dav1d_flush(c); ++ ++ for (unsigned n = 0; c->fc && n < c->n_fc; n++) { + Dav1dFrameContext *const f = &c->fc[n]; + + // clean-up threading stuff +- if (c->n_fc > 1) { 
++ if (c->n_fc > 1 && f->frame_thread.td.inited) { + pthread_mutex_lock(&f->frame_thread.td.lock); + f->frame_thread.die = 1; + pthread_cond_signal(&f->frame_thread.td.cond); +@@ -429,16 +450,19 @@ void dav1d_close(Dav1dContext **const c_out) { + dav1d_freep_aligned(&f->frame_thread.pal_idx); + dav1d_freep_aligned(&f->frame_thread.cf); + freep(&f->frame_thread.tile_start_off); +- freep(&f->frame_thread.pal); ++ dav1d_freep_aligned(&f->frame_thread.pal); + freep(&f->frame_thread.cbi); + pthread_mutex_destroy(&f->frame_thread.td.lock); + pthread_cond_destroy(&f->frame_thread.td.cond); + } +- if (f->n_tc > 1) { ++ if (f->n_tc > 1 && f->tc && f->tile_thread.inited) { + pthread_mutex_lock(&f->tile_thread.lock); + for (int m = 0; m < f->n_tc; m++) { + Dav1dTileContext *const t = &f->tc[m]; + t->tile_thread.die = 1; ++ // mark not created tile threads as available ++ if (!t->tile_thread.td.inited) ++ f->tile_thread.available |= 1ULL<tile_thread.cond); + while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc)) +@@ -447,7 +471,7 @@ void dav1d_close(Dav1dContext **const c_out) { + pthread_mutex_unlock(&f->tile_thread.lock); + for (int m = 0; m < f->n_tc; m++) { + Dav1dTileContext *const t = &f->tc[m]; +- if (f->n_tc > 1) { ++ if (f->n_tc > 1 && t->tile_thread.td.inited) { + pthread_join(t->tile_thread.td.thread, NULL); + pthread_mutex_destroy(&t->tile_thread.td.lock); + pthread_cond_destroy(&t->tile_thread.td.cond); +@@ -458,13 +482,13 @@ void dav1d_close(Dav1dContext **const c_out) { + pthread_cond_destroy(&f->tile_thread.icond); + freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); + } +- for (int m = 0; m < f->n_tc; m++) { ++ for (int m = 0; f->tc && m < f->n_tc; m++) { + Dav1dTileContext *const t = &f->tc[m]; + dav1d_free_aligned(t->cf); + dav1d_free_aligned(t->scratch.mem); + dav1d_free_aligned(t->emu_edge); + } +- for (int m = 0; m < f->n_ts; m++) { ++ for (int m = 0; f->ts && m < f->n_ts; m++) { + Dav1dTileState *const ts = &f->ts[m]; + pthread_cond_destroy(&ts->tile_thread.cond); + pthread_mutex_destroy(&ts->tile_thread.lock); +@@ -473,17 +497,18 @@ void dav1d_close(Dav1dContext **const c_out) { + dav1d_free_aligned(f->tc); + dav1d_free_aligned(f->ipred_edge[0]); + free(f->a); ++ free(f->tile); + free(f->lf.mask); + free(f->lf.lr_mask); + free(f->lf.level); + free(f->lf.tx_lpf_right_edge[0]); +- av1_free_ref_mv_common(f->libaom_cm); ++ if (f->libaom_cm) av1_free_ref_mv_common(f->libaom_cm); + dav1d_free_aligned(f->lf.cdef_line); + dav1d_free_aligned(f->lf.lr_lpf_line); + } + dav1d_free_aligned(c->fc); + dav1d_data_unref_internal(&c->in); +- if (c->n_fc > 1) { ++ if (c->n_fc > 1 && c->frame_thread.out_delayed) { + for (unsigned n = 0; n < c->n_fc; n++) + if (c->frame_thread.out_delayed[n].p.data[0]) + dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]); +@@ -491,6 +516,7 @@ void dav1d_close(Dav1dContext **const c_out) { + } + for (int n = 0; n < c->n_tile_data; n++) + dav1d_data_unref_internal(&c->tile[n].data); ++ free(c->tile); + for (int n = 0; n < 8; n++) { + dav1d_cdf_thread_unref(&c->cdf[n]); + if (c->refs[n].p.p.data[0]) +@@ -501,6 +527,9 @@ void dav1d_close(Dav1dContext **const c_out) { + dav1d_ref_dec(&c->seq_hdr_ref); + dav1d_ref_dec(&c->frame_hdr_ref); + ++ dav1d_ref_dec(&c->mastering_display_ref); ++ dav1d_ref_dec(&c->content_light_ref); ++ + dav1d_freep_aligned(c_out); + } + +diff --git third_party/dav1d/src/log.c third_party/dav1d/src/log.c +new file mode 100644 +index 000000000000..4eb4e913f28a +--- /dev/null ++++ third_party/dav1d/src/log.c +@@ -0,0 +1,57 @@ ++/* 
++ * Copyright © 2018, VideoLAN and dav1d authors ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "config.h" ++ ++#include ++#include ++ ++#include "dav1d/dav1d.h" ++ ++#include "common/validate.h" ++ ++#include "src/internal.h" ++#include "src/log.h" ++ ++void dav1d_log_default_callback(void *const cookie, ++ const char *const format, va_list ap) ++{ ++ vfprintf(stderr, format, ap); ++} ++ ++#if CONFIG_LOG ++void dav1d_log(Dav1dContext *const c, const char *const format, ...) { ++ validate_input(c != NULL); ++ ++ if (!c->logger.callback) ++ return; ++ ++ va_list ap; ++ va_start(ap, format); ++ c->logger.callback(c->logger.cookie, format, ap); ++ va_end(ap); ++} ++#endif +diff --git third_party/dav1d/src/log.h third_party/dav1d/src/log.h +new file mode 100644 +index 000000000000..8f6357cb6607 +--- /dev/null ++++ third_party/dav1d/src/log.h +@@ -0,0 +1,47 @@ ++/* ++ * Copyright © 2018, VideoLAN and dav1d authors ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright notice, this ++ * list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above copyright notice, ++ * this list of conditions and the following disclaimer in the documentation ++ * and/or other materials provided with the distribution. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef DAV1D_SRC_LOG_H ++#define DAV1D_SRC_LOG_H ++ ++#include "config.h" ++ ++#include ++ ++#include "dav1d/dav1d.h" ++ ++#include "common/attributes.h" ++ ++void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); ++ ++#if CONFIG_LOG ++#define dav1d_log dav1d_log ++void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3); ++#else ++#define dav1d_log(...) do { } while(0) ++#endif ++ ++#endif /* DAV1D_SRC_LOG_H */ +diff --git third_party/dav1d/src/loopfilter.h third_party/dav1d/src/loopfilter.h +index ada94041e332..7f8fd134ba4d 100644 +--- third_party/dav1d/src/loopfilter.h ++++ third_party/dav1d/src/loopfilter.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_LOOPFILTER_H__ +-#define __DAV1D_SRC_LOOPFILTER_H__ ++#ifndef DAV1D_SRC_LOOPFILTER_H ++#define DAV1D_SRC_LOOPFILTER_H + + #include + #include +@@ -55,4 +55,4 @@ typedef struct Dav1dLoopFilterDSPContext { + bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c); + bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c); + +-#endif /* __DAV1D_SRC_LOOPFILTER_H__ */ ++#endif /* DAV1D_SRC_LOOPFILTER_H */ +diff --git third_party/dav1d/src/looprestoration.h third_party/dav1d/src/looprestoration.h +index 6bd8dd50780d..bf1dea075479 100644 +--- third_party/dav1d/src/looprestoration.h ++++ third_party/dav1d/src/looprestoration.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_LOOPRESTORATION_H__ +-#define __DAV1D_SRC_LOOPRESTORATION_H__ ++#ifndef DAV1D_SRC_LOOPRESTORATION_H ++#define DAV1D_SRC_LOOPRESTORATION_H + + #include + #include +@@ -76,4 +76,4 @@ bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext + bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c); + bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c); + +-#endif /* __DAV1D_SRC_LOOPRESTORATION_H__ */ ++#endif /* DAV1D_SRC_LOOPRESTORATION_H */ +diff --git third_party/dav1d/src/looprestoration_tmpl.c third_party/dav1d/src/looprestoration_tmpl.c +index c39e72e9a077..08af2b0f13e7 100644 +--- third_party/dav1d/src/looprestoration_tmpl.c ++++ third_party/dav1d/src/looprestoration_tmpl.c +@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src, + + const unsigned p = imax(a * n - b * b, 0); + const unsigned z = (p * s + (1 << 19)) >> 20; ++ const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)]; + +- const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)]; + // This is where we invert A and B, so that B is of size coef. 
+- AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; +- BB[i] = x; ++ AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; ++ BB[i] = 256 - x; + } + AA += step * REST_UNIT_STRIDE; + BB += step * REST_UNIT_STRIDE; +@@ -508,7 +508,7 @@ static void selfguided_filter(coef *dst, const pixel *src, + A += REST_UNIT_STRIDE; + } + } +-#undef NINE_NEIGHBORS ++#undef EIGHT_NEIGHBORS + } + + static void selfguided_c(pixel *p, const ptrdiff_t p_stride, +diff --git third_party/dav1d/src/lr_apply.h third_party/dav1d/src/lr_apply.h +index 67a7a210b563..638bb8b74b63 100644 +--- third_party/dav1d/src/lr_apply.h ++++ third_party/dav1d/src/lr_apply.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_LR_APPLY_H__ +-#define __DAV1D_SRC_LR_APPLY_H__ ++#ifndef DAV1D_SRC_LR_APPLY_H ++#define DAV1D_SRC_LR_APPLY_H + + #include + #include +@@ -41,4 +41,4 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, + void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], + int sby); + +-#endif /* __DAV1D_SRC_LR_APPLY_H__ */ ++#endif /* DAV1D_SRC_LR_APPLY_H */ +diff --git third_party/dav1d/src/lr_apply_tmpl.c third_party/dav1d/src/lr_apply_tmpl.c +index 18141b05c7e4..ab2f8dd6cfb2 100644 +--- third_party/dav1d/src/lr_apply_tmpl.c ++++ third_party/dav1d/src/lr_apply_tmpl.c +@@ -107,7 +107,7 @@ static void backup_lpf(const Dav1dFrameContext *const f, + void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, + /*const*/ pixel *const src[3], const int sby) + { +- const ptrdiff_t offset = 8 * !!sby; ++ const int offset = 8 * !!sby; + const ptrdiff_t *const src_stride = f->cur.stride; + const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel); + +@@ -132,7 +132,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, + const int h = (f->cur.p.h + ss_ver) >> ss_ver; + const int w = f->bw << (2 - ss_hor); + const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); +- const ptrdiff_t offset_uv = offset >> ss_ver; ++ const int offset_uv = offset >> ss_ver; + const int y_stripe = + (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + +@@ -277,7 +277,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y, + void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], + const int sby) + { +- const ptrdiff_t offset_y = 8 * !!sby; ++ const int offset_y = 8 * !!sby; + const ptrdiff_t *const dst_stride = f->sr_cur.p.stride; + + const int restore_planes = +@@ -299,7 +299,7 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], + const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver; + const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h); +- const ptrdiff_t offset_uv = offset_y >> ss_ver; ++ const int offset_uv = offset_y >> ss_ver; + const int y_stripe = + (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + if (restore_planes & LR_RESTORE_U) +diff --git third_party/dav1d/src/mc.h third_party/dav1d/src/mc.h +index 30172fa305da..33baea6b20fd 100644 +--- third_party/dav1d/src/mc.h ++++ third_party/dav1d/src/mc.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_MC_H__ +-#define __DAV1D_SRC_MC_H__ ++#ifndef DAV1D_SRC_MC_H ++#define DAV1D_SRC_MC_H + + #include + #include +@@ -135,4 +135,4 @@ bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c); + bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c); + bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c); + +-#endif /* __DAV1D_SRC_MC_H__ */ ++#endif /* DAV1D_SRC_MC_H */ +diff --git third_party/dav1d/src/mc_tmpl.c third_party/dav1d/src/mc_tmpl.c +index 93a6b3cd7c4d..b1fe67ac3ef2 100644 +--- third_party/dav1d/src/mc_tmpl.c ++++ third_party/dav1d/src/mc_tmpl.c +@@ -39,9 +39,14 @@ + + #if BITDEPTH == 8 + #define get_intermediate_bits(bitdepth_max) 4 ++// Output in interval [-5132, 9212], fits in int16_t as is ++#define PREP_BIAS 0 + #else + // 4 for 10 bits/component, 2 for 12 bits/component + #define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max)) ++// Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit) ++// Subtract a bias to ensure the output fits in int16_t ++#define PREP_BIAS 8192 + #endif + + static NOINLINE void +@@ -63,7 +68,7 @@ prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride, + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + do { + for (int x = 0; x < w; x++) +- tmp[x] = src[x] << intermediate_bits; ++ tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS; + + tmp += w; + src += src_stride; +@@ -237,8 +242,12 @@ prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + + mid_ptr = mid + 128 * 3; + do { +- for (int x = 0; x < w; x++) +- tmp[x] = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6); ++ for (int x = 0; x < w; x++) { ++ int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) - ++ PREP_BIAS; ++ assert(t >= INT16_MIN && t <= INT16_MAX); ++ tmp[x] = t; ++ } + + mid_ptr += 128; + tmp += w; +@@ -247,7 +256,8 @@ prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + do { + for (int x = 0; x < w; x++) + tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1, +- 6 - intermediate_bits); ++ 6 - intermediate_bits) - ++ PREP_BIAS; + + tmp += w; + src += src_stride; +@@ -257,7 +267,8 @@ prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + do { + for (int x = 0; x < w; x++) + tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride, +- 6 - intermediate_bits); ++ 6 - intermediate_bits) - ++ PREP_BIAS; + + tmp += w; + src += src_stride; +@@ -302,7 +313,8 @@ prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + GET_V_FILTER(my >> 6); + + for (x = 0; x < w; x++) +- tmp[x] = fv ? DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) : mid_ptr[x]; ++ tmp[x] = (fv ? 
DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) ++ : mid_ptr[x]) - PREP_BIAS; + + my += dy; + mid_ptr += (my >> 10) * 128; +@@ -499,7 +511,8 @@ static void prep_bilin_c(int16_t *tmp, + mid_ptr = mid; + do { + for (int x = 0; x < w; x++) +- tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4); ++ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) - ++ PREP_BIAS; + + mid_ptr += 128; + tmp += w; +@@ -508,7 +521,8 @@ static void prep_bilin_c(int16_t *tmp, + do { + for (int x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(src, x, mx, 1, +- 4 - intermediate_bits); ++ 4 - intermediate_bits) - ++ PREP_BIAS; + + tmp += w; + src += src_stride; +@@ -518,7 +532,7 @@ static void prep_bilin_c(int16_t *tmp, + do { + for (int x = 0; x < w; x++) + tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride, +- 4 - intermediate_bits); ++ 4 - intermediate_bits) - PREP_BIAS; + + tmp += w; + src += src_stride; +@@ -557,7 +571,7 @@ static void prep_bilin_scaled_c(int16_t *tmp, + int x; + + for (x = 0; x < w; x++) +- tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4); ++ tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4) - PREP_BIAS; + + my += dy; + mid_ptr += (my >> 10) * 128; +@@ -571,7 +585,8 @@ static void avg_c(pixel *dst, const ptrdiff_t dst_stride, + HIGHBD_DECL_SUFFIX) + { + const int intermediate_bits = get_intermediate_bits(bitdepth_max); +- const int sh = intermediate_bits + 1, rnd = 1 << intermediate_bits; ++ const int sh = intermediate_bits + 1; ++ const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh); +@@ -587,7 +602,8 @@ static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride, + const int weight HIGHBD_DECL_SUFFIX) + { + const int intermediate_bits = get_intermediate_bits(bitdepth_max); +- const int sh = intermediate_bits + 4, rnd = 8 << intermediate_bits; ++ const int sh = intermediate_bits + 4; ++ const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] * weight + +@@ -604,7 +620,8 @@ static void mask_c(pixel *dst, const ptrdiff_t dst_stride, + const uint8_t *mask HIGHBD_DECL_SUFFIX) + { + const int intermediate_bits = get_intermediate_bits(bitdepth_max); +- const int sh = intermediate_bits + 6, rnd = 32 << intermediate_bits; ++ const int sh = intermediate_bits + 6; ++ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; + do { + for (int x = 0; x < w; x++) + dst[x] = iclip_pixel((tmp1[x] * mask[x] + +@@ -668,7 +685,8 @@ static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride, + // and then load this intermediate to calculate final value for odd rows + const int intermediate_bits = get_intermediate_bits(bitdepth_max); + const int bitdepth = bitdepth_from_max(bitdepth_max); +- const int sh = intermediate_bits + 6, rnd = 32 << intermediate_bits; ++ const int sh = intermediate_bits + 6; ++ const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64; + const int mask_sh = bitdepth + intermediate_bits - 4; + const int mask_rnd = 1 << (mask_sh - 5); + do { +@@ -719,6 +737,7 @@ w_mask_fns(420, 1, 1); + + #undef w_mask_fns + ++#if ARCH_X86 + #define FILTER_WARP(src, x, F, stride) \ + (F[0] * src[x + -3 * stride] + \ + F[4] * src[x + -2 * stride] + \ +@@ -728,6 +747,17 @@ w_mask_fns(420, 1, 1); + F[6] * src[x + +2 * stride] + \ + F[3] * src[x + +3 * stride] + \ + F[7] * src[x + +4 * stride]) ++#else ++#define FILTER_WARP(src, x, F, stride) \ ++ (F[0] * src[x + -3 * stride] + \ ++ F[1] * src[x + -2 * stride] + \ ++ F[2] * src[x + -1 * 
stride] + \ ++ F[3] * src[x + +0 * stride] + \ ++ F[4] * src[x + +1 * stride] + \ ++ F[5] * src[x + +2 * stride] + \ ++ F[6] * src[x + +3 * stride] + \ ++ F[7] * src[x + +4 * stride]) ++#endif + + #define FILTER_WARP_RND(src, x, F, stride, sh) \ + ((FILTER_WARP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh)) +@@ -797,7 +827,7 @@ static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride, + const int8_t *const filter = + dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)]; + +- tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7); ++ tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS; + } + mid_ptr += 8; + tmp += tmp_stride; +@@ -811,20 +841,21 @@ static void emu_edge_c(const intptr_t bw, const intptr_t bh, + const pixel *ref, const ptrdiff_t ref_stride) + { + // find offset in reference of visible block to copy +- ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1); ++ ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) + ++ iclip((int) x, 0, (int) iw - 1); + + // number of pixels to extend (left, right, top, bottom) +- const int left_ext = iclip(-x, 0, bw - 1); +- const int right_ext = iclip(x + bw - iw, 0, bw - 1); ++ const int left_ext = iclip((int) -x, 0, (int) bw - 1); ++ const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1); + assert(left_ext + right_ext < bw); +- const int top_ext = iclip(-y, 0, bh - 1); +- const int bottom_ext = iclip(y + bh - ih, 0, bh - 1); ++ const int top_ext = iclip((int) -y, 0, (int) bh - 1); ++ const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1); + assert(top_ext + bottom_ext < bh); + + // copy visible portion first + pixel *blk = dst + top_ext * PXSTRIDE(dst_stride); +- const int center_w = bw - left_ext - right_ext; +- const int center_h = bh - top_ext - bottom_ext; ++ const int center_w = (int) (bw - left_ext - right_ext); ++ const int center_h = (int) (bh - top_ext - bottom_ext); + for (int y = 0; y < center_h; y++) { + pixel_copy(blk + left_ext, ref, center_w); + // extend left edge for this line +diff --git third_party/dav1d/src/meson.build third_party/dav1d/src/meson.build +index 63c0362e1cdf..1feeb13ecf60 100644 +--- third_party/dav1d/src/meson.build ++++ third_party/dav1d/src/meson.build +@@ -1,4 +1,4 @@ +-# Copyright © 2018, VideoLAN and dav1d authors ++# Copyright © 2018-2019, VideoLAN and dav1d authors + # All rights reserved. 
+ # + # Redistribution and use in source and binary forms, with or without +@@ -32,6 +32,7 @@ libdav1d_sources = files( + 'cpu.c', + 'data.c', + 'ref.c', ++ 'log.c', + 'getbits.c', + 'obu.c', + 'decode.c', +@@ -84,16 +85,19 @@ if is_asm_enabled + 'arm/cpu.c', + ) + libdav1d_tmpl_sources += files( ++ 'arm/cdef_init_tmpl.c', + 'arm/looprestoration_init_tmpl.c', + 'arm/mc_init_tmpl.c', + ) + if host_machine.cpu_family() == 'aarch64' + libdav1d_sources += files( ++ 'arm/64/cdef.S', + 'arm/64/looprestoration.S', + 'arm/64/mc.S', + ) + elif host_machine.cpu_family().startswith('arm') + libdav1d_sources += files( ++ 'arm/32/looprestoration.S', + 'arm/32/mc.S', + ) + endif +@@ -115,11 +119,13 @@ if is_asm_enabled + # NASM source files + libdav1d_sources_asm = files( + 'x86/cdef.asm', ++ 'x86/cdef_ssse3.asm', + 'x86/cpuid.asm', + 'x86/ipred.asm', + 'x86/itx.asm', + 'x86/loopfilter.asm', + 'x86/looprestoration.asm', ++ 'x86/looprestoration_ssse3.asm', + 'x86/mc.asm', + 'x86/mc_ssse3.asm', + 'x86/itx_ssse3.asm', +@@ -138,13 +144,16 @@ endif + # + + if host_machine.system() == 'windows' and get_option('default_library') != 'static' ++ rc_version_array = meson.project_version().split('.') + winmod = import('windows') + rc_data = configuration_data() +- rc_data.set('VERSION_MAJOR', dav1d_version_major) +- rc_data.set('VERSION_MINOR', dav1d_version_minor) +- rc_data.set('VERSION_REVISION', dav1d_version_revision) +- rc_data.set('VERSION_EXTRA', '0') +- rc_data.set('COPYRIGHT_YEARS', '2018') ++ rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0]) ++ rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1]) ++ rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2]) ++ rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major) ++ rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor) ++ rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision) ++ rc_data.set('COPYRIGHT_YEARS', '2019') + + rc_file = configure_file( + input : 'dav1d.rc.in', +@@ -167,7 +176,7 @@ endif + # Helper library for dav1d entrypoints + libdav1d_entrypoints_objs = static_library('dav1d_entrypoint', + libdav1d_entrypoints_sources, +- rev_target, ++ rev_target, config_h_target, + + include_directories : dav1d_inc_dirs, + dependencies: [stdatomic_dependency], +@@ -194,7 +203,7 @@ endforeach + if host_machine.system() == 'windows' + dav1d_soversion = '' + else +- dav1d_soversion = dav1d_version_major ++ dav1d_soversion = dav1d_api_version_major + endif + + libdav1d = library('dav1d', +@@ -214,7 +223,7 @@ libdav1d = library('dav1d', + thread_compat_dep, + ], + c_args : [stackalign_flag], +- version : meson.project_version(), ++ version : dav1d_soname_version, + soversion : dav1d_soversion, + install : true, + ) +diff --git third_party/dav1d/src/msac.c third_party/dav1d/src/msac.c +index 9376d057c07c..9e6d32b1b6d6 100644 +--- third_party/dav1d/src/msac.c ++++ third_party/dav1d/src/msac.c +@@ -34,6 +34,7 @@ + + #include "src/msac.h" + ++#define EC_PROB_SHIFT 6 + #define EC_MIN_PROB 4 // must be <= (1<rng, r = s->rng >> 8; +- const ec_win c = s->dif >> (EC_WIN_SIZE - 16); +- unsigned ret = 0; +- +- assert(!cdf[n_symbols - 1]); +- +- do { +- u = v; +- v = r * (cdf[ret++] >> EC_PROB_SHIFT); +- v >>= 7 - EC_PROB_SHIFT; +- v += EC_MIN_PROB * (n_symbols - ret); +- } while (c < v); +- +- assert(u <= s->rng); +- +- ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), u - v); +- return ret - 1; ++unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) { ++ ec_win v, vw, dif = s->dif; ++ uint16_t r = s->rng; ++ unsigned ret; ++ 
assert((dif >> (EC_WIN_SIZE - 16)) < r); ++ // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can ++ // replace the multiply with a simple shift. ++ v = ((r >> 8) << 7) + EC_MIN_PROB; ++ vw = v << (EC_WIN_SIZE - 16); ++ ret = dif >= vw; ++ dif -= ret*vw; ++ v += ret*(r - 2*v); ++ ctx_norm(s, dif, (unsigned) v); ++ return !ret; + } + + /* Decode a single binary value. + * f: The probability that the bit is one + * Return: The value decoded (0 or 1). */ +-unsigned msac_decode_bool(MsacContext *const s, const unsigned f) { ++unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) { + ec_win v, vw, dif = s->dif; + uint16_t r = s->rng; + unsigned ret; + assert((dif >> (EC_WIN_SIZE - 16)) < r); +- v = ((r >> 8) * f >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; ++ v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; + vw = v << (EC_WIN_SIZE - 16); + ret = dif >= vw; + dif -= ret*vw; + v += ret*(r - 2*v); +- ctx_norm(s, dif, v); ++ ctx_norm(s, dif, (unsigned) v); + return !ret; + } + +-unsigned msac_decode_bools(MsacContext *const c, const unsigned l) { ++unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) { + int v = 0; + for (int n = (int) l - 1; n >= 0; n--) +- v = (v << 1) | msac_decode_bool(c, EC_BOOL_EPROB); ++ v = (v << 1) | dav1d_msac_decode_bool_equi(c); + return v; + } + +-int msac_decode_subexp(MsacContext *const c, const int ref, +- const int n, const unsigned k) ++int dav1d_msac_decode_subexp(MsacContext *const c, const int ref, ++ const int n, const unsigned k) + { + int i = 0; + int a = 0; + int b = k; + while ((2 << b) < n) { +- if (!msac_decode_bool(c, EC_BOOL_EPROB)) break; ++ if (!dav1d_msac_decode_bool_equi(c)) break; + b = k + i++; + a = (1 << b); + } +- const unsigned v = msac_decode_bools(c, b) + a; ++ const unsigned v = dav1d_msac_decode_bools(c, b) + a; + return ref * 2 <= n ? inv_recenter(ref, v) : + n - 1 - inv_recenter(n - 1 - ref, v); + } + +-int msac_decode_uniform(MsacContext *const c, const unsigned n) { ++int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) { + assert(n > 0); + const int l = ulog2(n) + 1; + assert(l > 1); + const unsigned m = (1 << l) - n; +- const unsigned v = msac_decode_bools(c, l - 1); +- return v < m ? v : (v << 1) - m + msac_decode_bool(c, EC_BOOL_EPROB); ++ const unsigned v = dav1d_msac_decode_bools(c, l - 1); ++ return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c); ++} ++ ++/* Decodes a symbol given an inverse cumulative distribution function (CDF) ++ * table in Q15. 
*/ ++static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf, ++ const unsigned n_symbols) ++{ ++ ec_win u, v = s->rng, r = s->rng >> 8; ++ const ec_win c = s->dif >> (EC_WIN_SIZE - 16); ++ unsigned ret = 0; ++ ++ assert(!cdf[n_symbols - 1]); ++ ++ do { ++ u = v; ++ v = r * (cdf[ret++] >> EC_PROB_SHIFT); ++ v >>= 7 - EC_PROB_SHIFT; ++ v += EC_MIN_PROB * (n_symbols - ret); ++ } while (c < v); ++ ++ assert(u <= s->rng); ++ ++ ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v)); ++ return ret - 1; + } + + static void update_cdf(uint16_t *const cdf, const unsigned val, +@@ -153,17 +170,20 @@ static void update_cdf(uint16_t *const cdf, const unsigned val, + cdf[n_symbols] = count + (count < 32); + } + +-unsigned msac_decode_symbol_adapt(MsacContext *const c, +- uint16_t *const cdf, const unsigned n_symbols) ++unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c, ++ uint16_t *const cdf, ++ const unsigned n_symbols) + { +- const unsigned val = msac_decode_symbol(c, cdf, n_symbols); ++ const unsigned val = decode_symbol(c, cdf, n_symbols); + if(c->allow_update_cdf) + update_cdf(cdf, val, n_symbols); + return val; + } + +-unsigned msac_decode_bool_adapt(MsacContext *const c, uint16_t *const cdf) { +- const unsigned bit = msac_decode_bool(c, *cdf >> EC_PROB_SHIFT); ++unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c, ++ uint16_t *const cdf) ++{ ++ const unsigned bit = dav1d_msac_decode_bool(c, *cdf); + + if(c->allow_update_cdf){ + // update_cdf() specialized for boolean CDFs +@@ -180,8 +200,8 @@ unsigned msac_decode_bool_adapt(MsacContext *const c, uint16_t *const cdf) { + return bit; + } + +-void msac_init(MsacContext *const s, const uint8_t *const data, +- const size_t sz, const int disable_cdf_update_flag) ++void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, ++ const size_t sz, const int disable_cdf_update_flag) + { + s->buf_pos = data; + s->buf_end = data + sz; +diff --git third_party/dav1d/src/msac.h third_party/dav1d/src/msac.h +index 97bddbb9fa2d..91556fc0639e 100644 +--- third_party/dav1d/src/msac.h ++++ third_party/dav1d/src/msac.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_MSAC_H__ +-#define __DAV1D_SRC_MSAC_H__ ++#ifndef DAV1D_SRC_MSAC_H ++#define DAV1D_SRC_MSAC_H + + #include <stdint.h> + #include <stdlib.h> +@@ -43,18 +43,15 @@ typedef struct MsacContext { + int allow_update_cdf; + } MsacContext; + +-#define EC_PROB_SHIFT 6 +-#define EC_BOOL_EPROB 256 ++void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz, ++ int disable_cdf_update_flag); ++unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf, ++ const unsigned n_symbols); ++unsigned dav1d_msac_decode_bool_equi(MsacContext *const s); ++unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f); ++unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf); ++unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l); ++int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k); ++int dav1d_msac_decode_uniform(MsacContext *c, unsigned n); + +-void msac_init(MsacContext *c, const uint8_t *data, size_t sz, int disable_cdf_update_flag); +-unsigned msac_decode_symbol(MsacContext *s, const uint16_t *cdf, +- const unsigned n_symbols); +-unsigned msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf, +- const unsigned n_symbols); +-unsigned msac_decode_bool(MsacContext *s, unsigned f); +-unsigned msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf); +-unsigned msac_decode_bools(MsacContext *c, unsigned l); +-int msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k); +-int msac_decode_uniform(MsacContext *c, unsigned n); +- +-#endif /* __DAV1D_SRC_MSAC_H__ */ ++#endif /* DAV1D_SRC_MSAC_H */ +diff --git third_party/dav1d/src/obu.c third_party/dav1d/src/obu.c +index 66d2e4598c66..74cf61904542 100644 +--- third_party/dav1d/src/obu.c ++++ third_party/dav1d/src/obu.c +@@ -39,6 +39,7 @@ + #include "src/decode.h" + #include "src/getbits.h" + #include "src/levels.h" ++#include "src/log.h" + #include "src/obu.h" + #include "src/ref.h" + #include "src/thread_task.h" +@@ -111,6 +112,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, + for (int i = 0; i < hdr->num_operating_points; i++) { + struct Dav1dSequenceHeaderOperatingPoint *const op = + &hdr->operating_points[i]; ++ struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = ++ &hdr->operating_parameter_info[i]; + op->idc = dav1d_get_bits(gb, 12); + op->major_level = 2 + dav1d_get_bits(gb, 3); + op->minor_level = dav1d_get_bits(gb, 2); +@@ -118,11 +121,11 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, + op->decoder_model_param_present = + hdr->decoder_model_info_present && dav1d_get_bits(gb, 1); + if (op->decoder_model_param_present) { +- op->decoder_buffer_delay = ++ opi->decoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); +- op->encoder_buffer_delay = ++ opi->encoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); +- op->low_delay_mode = dav1d_get_bits(gb, 1); ++ opi->low_delay_mode = dav1d_get_bits(gb, 1); + } + op->display_model_param_present = + hdr->display_model_info_present && dav1d_get_bits(gb, 1); +@@ -280,7 +283,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, + return 0; + + error: +- fprintf(stderr, "Error parsing sequence header\n"); ++ dav1d_log(c, "Error parsing sequence header\n"); + return -EINVAL; + } + +@@ -917,6 +920,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { + unsigned off_before[2] = { 0xFFFFFFFF, 0xFFFFFFFF }; + int off_after = -1; + int off_before_idx[2], off_after_idx; ++ off_before_idx[0] = 0; + for 
(int i = 0; i < 7; i++) { + if (!c->refs[hdr->refidx[i]].p.p.data[0]) return -EINVAL; + const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; +@@ -1117,7 +1121,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { + return 0; + + error: +- fprintf(stderr, "Error parsing frame header\n"); ++ dav1d_log(c, "Error parsing frame header\n"); + return -EINVAL; + } + +@@ -1141,11 +1145,12 @@ static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { + // Check that we haven't read more than obu_len bytes from the buffer + // since init_bit_pos. + static int +-check_for_overrun(GetBits *const gb, unsigned init_bit_pos, unsigned obu_len) ++check_for_overrun(Dav1dContext *const c, GetBits *const gb, ++ unsigned init_bit_pos, unsigned obu_len) + { + // Make sure we haven't actually read past the end of the gb buffer + if (gb->error) { +- fprintf(stderr, "Overrun in OBU bit buffer\n"); ++ dav1d_log(c, "Overrun in OBU bit buffer\n"); + return 1; + } + +@@ -1156,7 +1161,7 @@ check_for_overrun(GetBits *const gb, unsigned init_bit_pos, unsigned obu_len) + assert (init_bit_pos <= pos); + + if (pos - init_bit_pos > 8 * obu_len) { +- fprintf(stderr, "Overrun in OBU bit buffer into next OBU\n"); ++ dav1d_log(c, "Overrun in OBU bit buffer into next OBU\n"); + return 1; + } + +@@ -1184,19 +1189,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + } + + // obu length field +- unsigned len = 0, more, i = 0; ++ unsigned len = 0; + if (has_length_field) +- do { +- more = dav1d_get_bits(&gb, 1); +- unsigned bits = dav1d_get_bits(&gb, 7); +- if (i <= 3 || (i == 4 && bits < (1 << 4))) +- len |= bits << (i * 7); +- else if (bits) +- goto error; +- if (more && ++i == 8) goto error; +- } while (more); ++ len = dav1d_get_uleb128(&gb); + else +- len = in->sz - 1 - has_extension; ++ len = (int) in->sz - 1 - has_extension; + if (gb.error) goto error; + + const unsigned init_bit_pos = dav1d_get_bits_pos(&gb); +@@ -1237,7 +1234,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + dav1d_ref_dec(&ref); + return res; + } +- if (check_for_overrun(&gb, init_bit_pos, len)) { ++ if (check_for_overrun(c, &gb, init_bit_pos, len)) { + dav1d_ref_dec(&ref); + return -EINVAL; + } +@@ -1246,8 +1243,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + // previous state. Free that state. + if (!c->seq_hdr) + c->frame_hdr = NULL; +- else if (memcmp(seq_hdr, c->seq_hdr, sizeof(*seq_hdr))) { ++ // see 7.5, operating_parameter_info is allowed to change in ++ // sequence headers of a single sequence ++ else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) { + c->frame_hdr = NULL; ++ c->mastering_display = NULL; ++ c->content_light = NULL; ++ dav1d_ref_dec(&c->mastering_display_ref); ++ dav1d_ref_dec(&c->content_light_ref); + for (int i = 0; i < 8; i++) { + if (c->refs[i].p.p.data[0]) + dav1d_thread_picture_unref(&c->refs[i].p); +@@ -1290,7 +1293,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + // This is actually a frame header OBU so read the + // trailing bit and check for overrun. 
+ dav1d_get_bits(&gb, 1); +- if (check_for_overrun(&gb, init_bit_pos, len)) { ++ if (check_for_overrun(c, &gb, init_bit_pos, len)) { + c->frame_hdr = NULL; + return -EINVAL; + } +@@ -1311,11 +1314,18 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + case OBU_TILE_GRP: { + if (global) break; + if (!c->frame_hdr) goto error; +- if (c->n_tile_data >= 256) goto error; ++ if (c->n_tile_data_alloc < c->n_tile_data + 1) { ++ if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error; ++ struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile)); ++ if (!tile) goto error; ++ c->tile = tile; ++ memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile)); ++ c->n_tile_data_alloc = c->n_tile_data + 1; ++ } + parse_tile_hdr(c, &gb); + // Align to the next byte boundary and check for overrun. + dav1d_bytealign_get_bits(&gb); +- if (check_for_overrun(&gb, init_bit_pos, len)) ++ if (check_for_overrun(c, &gb, init_bit_pos, len)) + return -EINVAL; + // The current bit position is a multiple of 8 (because we + // just aligned it) and less than 8*pkt_bytelen because +@@ -1341,13 +1351,83 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + c->n_tile_data++; + break; + } ++ case OBU_METADATA: { ++ // obu metadta type field ++ const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb); ++ if (gb.error) goto error; ++ Dav1dRef *ref; ++ Dav1dContentLightLevel *content_light; ++ Dav1dMasteringDisplay *mastering_display; ++ ++ switch (meta_type) { ++ case OBU_META_HDR_CLL: ++ ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel)); ++ if (!ref) return -ENOMEM; ++ content_light = ref->data; ++ memset(content_light, 0, sizeof(*content_light)); ++ ++ content_light->max_content_light_level = dav1d_get_bits(&gb, 16); ++ content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16); ++ ++ // Skip the trailing bit, align to the next byte boundary and check for overrun. ++ dav1d_get_bits(&gb, 1); ++ dav1d_bytealign_get_bits(&gb); ++ if (check_for_overrun(c, &gb, init_bit_pos, len)) { ++ dav1d_ref_dec(&ref); ++ goto error; ++ } ++ ++ dav1d_ref_dec(&c->content_light_ref); ++ c->content_light = content_light; ++ c->content_light_ref = ref; ++ break; ++ case OBU_META_HDR_MDCV: { ++ ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay)); ++ if (!ref) return -ENOMEM; ++ mastering_display = ref->data; ++ memset(mastering_display, 0, sizeof(*mastering_display)); ++ ++ for (int i = 0; i < 3; i++) { ++ mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16); ++ mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16); ++ } ++ mastering_display->white_point[0] = dav1d_get_bits(&gb, 16); ++ mastering_display->white_point[1] = dav1d_get_bits(&gb, 16); ++ ++ mastering_display->max_luminance = dav1d_get_bits(&gb, 32); ++ mastering_display->min_luminance = dav1d_get_bits(&gb, 32); ++ ++ // Skip the trailing bit, align to the next byte boundary and check for overrun. 
++ dav1d_get_bits(&gb, 1); ++ dav1d_bytealign_get_bits(&gb); ++ if (check_for_overrun(c, &gb, init_bit_pos, len)) { ++ dav1d_ref_dec(&ref); ++ goto error; ++ } ++ ++ dav1d_ref_dec(&c->mastering_display_ref); ++ c->mastering_display = mastering_display; ++ c->mastering_display_ref = ref; ++ break; ++ } ++ case OBU_META_ITUT_T35: ++ case OBU_META_SCALABILITY: ++ case OBU_META_TIMECODE: ++ // ignore metadata OBUs we don't care about ++ break; ++ default: ++ // print a warning but don't fail for unknown types ++ dav1d_log(c, "Unknown Metadata OBU type %d\n", meta_type); ++ } ++ ++ break; ++ } + case OBU_PADDING: + case OBU_TD: +- case OBU_METADATA: + // ignore OBUs we don't care about + break; + default: +- fprintf(stderr, "Unknown OBU type %d of size %u\n", type, len); ++ dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len); + return -EINVAL; + } + +@@ -1418,6 +1498,6 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { + return len + init_byte_pos; + + error: +- fprintf(stderr, "Error parsing OBU data\n"); ++ dav1d_log(c, "Error parsing OBU data\n"); + return -EINVAL; + } +diff --git third_party/dav1d/src/obu.h third_party/dav1d/src/obu.h +index 32892133b792..aa79b5277a0c 100644 +--- third_party/dav1d/src/obu.h ++++ third_party/dav1d/src/obu.h +@@ -25,12 +25,12 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_OBU_H__ +-#define __DAV1D_SRC_OBU_H__ ++#ifndef DAV1D_SRC_OBU_H ++#define DAV1D_SRC_OBU_H + + #include "dav1d/data.h" + #include "src/internal.h" + + int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global); + +-#endif /* __DAV1D_SRC_OBU_H__ */ ++#endif /* DAV1D_SRC_OBU_H */ +diff --git third_party/dav1d/src/picture.c third_party/dav1d/src/picture.c +index b6eb199b8d04..86fdd9897466 100644 +--- third_party/dav1d/src/picture.c ++++ third_party/dav1d/src/picture.c +@@ -38,6 +38,8 @@ + #include "common/mem.h" + #include "common/validate.h" + ++#include "src/internal.h" ++#include "src/log.h" + #include "src/picture.h" + #include "src/ref.h" + #include "src/thread.h" +@@ -57,11 +59,10 @@ int default_picture_allocator(Dav1dPicture *const p, void *cookie) { + const size_t uv_sz = p->stride[1] * (aligned_h >> ss_ver); + const size_t pic_size = y_sz + 2 * uv_sz; + +- uint8_t *data = dav1d_alloc_aligned(pic_size, 32); ++ uint8_t *data = dav1d_alloc_aligned(pic_size + DAV1D_PICTURE_ALIGNMENT, ++ DAV1D_PICTURE_ALIGNMENT); + if (data == NULL) { +- fprintf(stderr, "Failed to allocate memory of size %zu: %s\n", +- pic_size, strerror(errno)); +- return -1; ++ return -ENOMEM; + } + + p->data[0] = data; +@@ -97,16 +98,18 @@ static void free_buffer(const uint8_t *const data, void *const user_data) { + free(pic_ctx); + } + +-static int picture_alloc_with_edges(Dav1dPicture *const p, ++static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p, + const int w, const int h, + Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref, + Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref, ++ Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref, ++ Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref, + const int bpc, const Dav1dDataProps *props, + Dav1dPicAllocator *const p_allocator, + const size_t extra, void **const extra_ptr) + { + if (p->data[0]) { +- fprintf(stderr, "Picture already allocated!\n"); ++ dav1d_log(c, "Picture already allocated!\n"); + return -1; + } + assert(bpc > 0 && bpc <= 16); +@@ -118,19 +121,17 @@ static int 
picture_alloc_with_edges(Dav1dPicture *const p, + + p->p.w = w; + p->p.h = h; +- p->m.timestamp = INT64_MIN; +- p->m.duration = 0; +- p->m.offset = -1; +- p->m.user_data.data = NULL; +- p->m.user_data.ref = NULL; + p->seq_hdr = seq_hdr; + p->frame_hdr = frame_hdr; ++ p->content_light = content_light; ++ p->mastering_display = mastering_display; + p->p.layout = seq_hdr->layout; + p->p.bpc = bpc; ++ dav1d_data_props_set_defaults(&p->m); + int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); + if (res < 0) { + free(pic_ctx); +- return -ENOMEM; ++ return res; + } + + pic_ctx->allocator = *p_allocator; +@@ -139,7 +140,7 @@ static int picture_alloc_with_edges(Dav1dPicture *const p, + if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) { + p_allocator->release_picture_callback(p, p_allocator->cookie); + free(pic_ctx); +- fprintf(stderr, "Failed to wrap picture: %s\n", strerror(errno)); ++ dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno)); + return -ENOMEM; + } + +@@ -154,43 +155,49 @@ static int picture_alloc_with_edges(Dav1dPicture *const p, + if (extra && extra_ptr) + *extra_ptr = &pic_ctx->extra_ptr; + ++ p->content_light_ref = content_light_ref; ++ if (content_light_ref) dav1d_ref_inc(content_light_ref); ++ ++ p->mastering_display_ref = mastering_display_ref; ++ if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref); ++ + return 0; + } + +-int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p, +- const int w, const int h, +- Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref, +- Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref, +- const int bpc, const Dav1dDataProps *props, +- struct thread_data *const t, const int visible, +- Dav1dPicAllocator *const p_allocator) ++int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f, ++ const int bpc) + { +- p->t = t; ++ Dav1dThreadPicture *const p = &f->sr_cur; ++ p->t = c->n_fc > 1 ? &f->frame_thread.td : NULL; + + const int res = +- picture_alloc_with_edges(&p->p, w, h, +- seq_hdr, seq_hdr_ref, +- frame_hdr, frame_hdr_ref, +- bpc, props, p_allocator, +- t != NULL ? sizeof(atomic_int) * 2 : 0, ++ picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, ++ f->seq_hdr, f->seq_hdr_ref, ++ f->frame_hdr, f->frame_hdr_ref, ++ c->content_light, c->content_light_ref, ++ c->mastering_display, c->mastering_display_ref, ++ bpc, &f->tile[0].data.m, &c->allocator, ++ p->t != NULL ? 
sizeof(atomic_int) * 2 : 0, + (void **) &p->progress); + if (res) return res; + +- p->visible = visible; +- if (t) { ++ p->visible = f->frame_hdr->show_frame; ++ if (p->t) { + atomic_init(&p->progress[0], 0); + atomic_init(&p->progress[1], 0); + } + return res; + } + +-int dav1d_picture_alloc_copy(Dav1dPicture *const dst, const int w, ++int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w, + const Dav1dPicture *const src) + { + struct pic_ctx_context *const pic_ctx = src->ref->user_data; +- const int res = picture_alloc_with_edges(dst, w, src->p.h, ++ const int res = picture_alloc_with_edges(c, dst, w, src->p.h, + src->seq_hdr, src->seq_hdr_ref, + src->frame_hdr, src->frame_hdr_ref, ++ src->content_light, src->content_light_ref, ++ src->mastering_display, src->mastering_display_ref, + src->p.bpc, &src->m, &pic_ctx->allocator, + 0, NULL); + return res; +@@ -207,6 +214,8 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { + if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); + if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); + if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); ++ if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); ++ if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); + } + *dst = *src; + } +@@ -241,6 +250,8 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) { + dav1d_ref_dec(&p->seq_hdr_ref); + dav1d_ref_dec(&p->frame_hdr_ref); + dav1d_ref_dec(&p->m.user_data.ref); ++ dav1d_ref_dec(&p->content_light_ref); ++ dav1d_ref_dec(&p->mastering_display_ref); + } + memset(p, 0, sizeof(*p)); + } +diff --git third_party/dav1d/src/picture.h third_party/dav1d/src/picture.h +index 447a81a5027a..22b6f2be5d52 100644 +--- third_party/dav1d/src/picture.h ++++ third_party/dav1d/src/picture.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_PICTURE_H__ +-#define __DAV1D_SRC_PICTURE_H__ ++#ifndef DAV1D_SRC_PICTURE_H ++#define DAV1D_SRC_PICTURE_H + + #include <stdatomic.h> + +@@ -55,12 +55,7 @@ typedef struct Dav1dThreadPicture { + /* + * Allocate a picture with custom border size. + */ +-int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h, +- Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref, +- Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref, +- int bpc, const Dav1dDataProps *props, +- struct thread_data *t, int visible, +- Dav1dPicAllocator *); ++int dav1d_thread_picture_alloc(Dav1dContext *c, Dav1dFrameContext *f, const int bpc); + + /** + * Allocate a picture with identical metadata to an existing picture. +@@ -69,7 +64,7 @@ int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h, + * For the more typical use case of allocating a new image of the same + * dimensions, use src->p.w as width. 
+ */ +-int dav1d_picture_alloc_copy(Dav1dPicture *dst, const int w, ++int dav1d_picture_alloc_copy(Dav1dContext *c, Dav1dPicture *dst, const int w, + const Dav1dPicture *src); + + /** +@@ -114,4 +109,4 @@ int default_picture_allocator(Dav1dPicture *, void *cookie); + void default_picture_release(Dav1dPicture *, void *cookie); + void dav1d_picture_unref_internal(Dav1dPicture *p); + +-#endif /* __DAV1D_SRC_PICTURE_H__ */ ++#endif /* DAV1D_SRC_PICTURE_H */ +diff --git third_party/dav1d/src/qm.h third_party/dav1d/src/qm.h +index f708e320acf1..23b2348a70cb 100644 +--- third_party/dav1d/src/qm.h ++++ third_party/dav1d/src/qm.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_QM_H__ +-#define __DAV1D_SRC_QM_H__ ++#ifndef DAV1D_SRC_QM_H ++#define DAV1D_SRC_QM_H + + #include "src/levels.h" + +@@ -34,4 +34,4 @@ extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES]; + + void dav1d_init_qm_tables(void); + +-#endif /* __DAV1D_SRC_QM_H__ */ ++#endif /* DAV1D_SRC_QM_H */ +diff --git third_party/dav1d/src/recon.h third_party/dav1d/src/recon.h +index 60d777986bbd..f84c8ab31e60 100644 +--- third_party/dav1d/src/recon.h ++++ third_party/dav1d/src/recon.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_RECON_H__ +-#define __DAV1D_SRC_RECON_H__ ++#ifndef DAV1D_SRC_RECON_H ++#define DAV1D_SRC_RECON_H + + #include "src/internal.h" + #include "src/levels.h" +@@ -72,4 +72,4 @@ decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc); + decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc); + decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc); + +-#endif /* __DAV1D_SRC_RECON_H__ */ ++#endif /* DAV1D_SRC_RECON_H */ +diff --git third_party/dav1d/src/recon_tmpl.c third_party/dav1d/src/recon_tmpl.c +index 5fb627dddb17..de2e0d3639d7 100644 +--- third_party/dav1d/src/recon_tmpl.c ++++ third_party/dav1d/src/recon_tmpl.c +@@ -50,8 +50,8 @@ static unsigned read_golomb(MsacContext *const msac) { + int len = 0; + unsigned val = 1; + +- while (!msac_decode_bool(msac, EC_BOOL_EPROB) && len < 32) len++; +- while (len--) val = (val << 1) | msac_decode_bool(msac, EC_BOOL_EPROB); ++ while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++; ++ while (len--) val = (val << 1) | dav1d_msac_decode_bool_equi(msac); + + return val - 1; + } +@@ -73,8 +73,8 @@ static int decode_coefs(Dav1dTileContext *const t, + + // does this block have any non-zero coefficients + const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout); +- const int all_skip = +- msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]); ++ const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac, ++ ts->cdf.coef.skip[t_dim->ctx][sctx]); + if (dbg) + printf("Post-non-zero[%d][%d][%d]: r=%d\n", + t_dim->ctx, sctx, all_skip, ts->msac.rng); +@@ -107,7 +107,7 @@ static int decode_coefs(Dav1dTileContext *const t, + uint16_t *const txtp_cdf = intra ? + ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] : + ts->cdf.m.txtp_inter[set_idx][t_dim->min]; +- idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt); ++ idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt); + if (dbg) + printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n", + set, set_idx, tx, t_dim->min, intra ? 
(int)y_mode_nofilt : -1, +@@ -125,7 +125,7 @@ static int decode_coefs(Dav1dTileContext *const t, + #define case_sz(sz, bin) \ + case sz: { \ + uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \ +- eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \ ++ eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \ + break; \ + } + case_sz(0, 16); +@@ -145,14 +145,15 @@ static int decode_coefs(Dav1dTileContext *const t, + eob = 1 << (eob_bin - 1); + uint16_t *const eob_hi_bit_cdf = + ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin]; +- const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf); ++ const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, ++ eob_hi_bit_cdf); + if (dbg) + printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n", + t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng); + unsigned mask = eob >> 1; + if (eob_hi_bit) eob |= mask; + for (mask >>= 1; mask; mask >>= 1) { +- const int eob_bit = msac_decode_bool(&ts->msac, EC_BOOL_EPROB); ++ const int eob_bit = dav1d_msac_decode_bool_equi(&ts->msac); + if (eob_bit) eob |= mask; + } + if (dbg) +@@ -178,8 +179,8 @@ static int decode_coefs(Dav1dTileContext *const t, + uint16_t *const lo_cdf = is_last ? + ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] : + ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; +- int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf, +- 4 - is_last) + is_last; ++ int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf, ++ 4 - is_last) + is_last; + if (dbg) + printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", + t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); +@@ -189,8 +190,8 @@ static int decode_coefs(Dav1dTileContext *const t, + if (tok == 3) { + const int br_ctx = get_br_ctx(levels, rc, tx, tx_class); + do { +- const int tok_br = +- msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4); ++ const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac, ++ br_cdf[br_ctx], 4); + if (dbg) + printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n", + imin(t_dim->ctx, 3), chroma, br_ctx, +@@ -200,7 +201,8 @@ static int decode_coefs(Dav1dTileContext *const t, + } while (tok < 15); + } + +- levels[x * stride + y] = cf[rc] = tok; ++ cf[rc] = tok; ++ levels[x * stride + y] = (uint8_t) cf[rc]; + } + + // residual and sign +@@ -224,14 +226,14 @@ static int decode_coefs(Dav1dTileContext *const t, + const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l); + uint16_t *const dc_sign_cdf = + ts->cdf.coef.dc_sign[chroma][dc_sign_ctx]; +- sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); ++ sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf); + if (dbg) + printf("Post-dc_sign[%d][%d][%d]: r=%d\n", + chroma, dc_sign_ctx, sign, ts->msac.rng); + dc_sign = sign ? 
0 : 2; + dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; + } else { +- sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB); ++ sign = dav1d_msac_decode_bool_equi(&ts->msac); + if (dbg) + printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng); + dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; +@@ -559,7 +561,7 @@ static int mc(Dav1dTileContext *const t, + int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); + #define scale_mv(res, val, scale) do { \ + const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \ +- res = (int)apply_sign64((llabs(tmp) + 128) >> 8, tmp) + 32; \ ++ res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \ + } while (0) + int pos_y, pos_x; + scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale); +@@ -833,8 +835,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize + edge_flags, dst, + f->cur.stride[0], top_sb_edge, + b->y_mode, &angle, +- t_dim->w, t_dim->h, edge +- HIGHBD_CALL_SUFFIX); ++ t_dim->w, t_dim->h, ++ f->seq_hdr->intra_edge_filter, ++ edge HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge, + t_dim->w * 4, t_dim->h * 4, + angle | intra_flags, +@@ -951,9 +954,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize + ts->tiling.row_end >> ss_ver, + 0, uv_dst[pl], stride, + top_sb_edge, DC_PRED, &angle, +- uv_t_dim->w, +- uv_t_dim->h, edge +- HIGHBD_CALL_SUFFIX); ++ uv_t_dim->w, uv_t_dim->h, 0, ++ edge HIGHBD_CALL_SUFFIX); + dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge, + uv_t_dim->w * 4, + uv_t_dim->h * 4, +@@ -1053,8 +1055,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize + edge_flags, dst, stride, + top_sb_edge, uv_mode, + &angle, uv_t_dim->w, +- uv_t_dim->h, edge +- HIGHBD_CALL_SUFFIX); ++ uv_t_dim->h, ++ f->seq_hdr->intra_edge_filter, ++ edge HIGHBD_CALL_SUFFIX); + angle |= intra_edge_filter_flag; + dsp->ipred.intra_pred[m](dst, stride, edge, + uv_t_dim->w * 4, +@@ -1216,7 +1219,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize + t->by, t->by > ts->tiling.row_start, + ts->tiling.col_end, ts->tiling.row_end, + 0, dst, f->cur.stride[0], top_sb_edge, +- m, &angle, bw4, bh4, tl_edge ++ m, &angle, bw4, bh4, 0, tl_edge + HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), + tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 +@@ -1246,7 +1249,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize + // chroma prediction + if (is_sub8x8) { + assert(ss_hor == 1); +- int h_off = 0, v_off = 0; ++ ptrdiff_t h_off = 0, v_off = 0; + if (bw4 == 1 && bh4 == ss_ver) { + for (int pl = 0; pl < 2; pl++) { + res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, +@@ -1358,7 +1361,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize + ts->tiling.row_end >> ss_ver, + 0, uvdst, f->cur.stride[1], + top_sb_edge, m, +- &angle, cbw4, cbh4, tl_edge ++ &angle, cbw4, cbh4, 0, tl_edge + HIGHBD_CALL_SUFFIX); + dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel), + tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0 +@@ -1426,7 +1429,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize + if (b->inter_mode == GLOBALMV_GLOBALMV && + imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) + { +- res = warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl, ++ res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor, ++ b_dim, 1 + pl, + refp, &f->frame_hdr->gmv[b->ref[i]]); + if (res) return res; + } else { +diff --git 
third_party/dav1d/src/ref.h third_party/dav1d/src/ref.h +index ebe3ada7bb59..b26c01a96acd 100644 +--- third_party/dav1d/src/ref.h ++++ third_party/dav1d/src/ref.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_REF_H__ +-#define __DAV1D_SRC_REF_H__ ++#ifndef DAV1D_SRC_REF_H ++#define DAV1D_SRC_REF_H + + #include "dav1d/dav1d.h" + +@@ -50,4 +50,4 @@ void dav1d_ref_dec(Dav1dRef **ref); + + int dav1d_ref_is_writable(Dav1dRef *ref); + +-#endif /* __DAV1D_SRC_REF_H__ */ ++#endif /* DAV1D_SRC_REF_H */ +diff --git third_party/dav1d/src/ref_mvs.c third_party/dav1d/src/ref_mvs.c +index 461876709a50..905d843aff44 100644 +--- third_party/dav1d/src/ref_mvs.c ++++ third_party/dav1d/src/ref_mvs.c +@@ -1916,7 +1916,7 @@ void av1_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2], + { + const int bw4 = dav1d_block_dimensions[bs][0]; + const int bh4 = dav1d_block_dimensions[bs][1]; +- int stride = cm->cur_frame.mv_stride; ++ int stride = (int) cm->cur_frame.mv_stride; + MACROBLOCKD xd = (MACROBLOCKD) { + .n8_w = bw4, + .n8_h = bh4, +@@ -2018,7 +2018,7 @@ int av1_init_ref_mv_common(AV1_COMMON *cm, + cm->frame_refs[i].idx = i; + cm->mi_cols = w8 << 1; + cm->mi_rows = h8 << 1; +- cm->mi_stride = stride; ++ cm->mi_stride = (int) stride; + for (int i = 0; i < 7; i++) { + cm->buffer_pool.frame_bufs[i].mi_rows = cm->mi_rows; + cm->buffer_pool.frame_bufs[i].mi_cols = cm->mi_cols; +diff --git third_party/dav1d/src/ref_mvs.h third_party/dav1d/src/ref_mvs.h +index 3ce06b40c2e8..d3d6b33e60af 100644 +--- third_party/dav1d/src/ref_mvs.h ++++ third_party/dav1d/src/ref_mvs.h +@@ -9,8 +9,8 @@ + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +-#ifndef __DAV1D_SRC_REF_MVS_H__ +-#define __DAV1D_SRC_REF_MVS_H__ ++#ifndef DAV1D_SRC_REF_MVS_H ++#define DAV1D_SRC_REF_MVS_H + + #include + +@@ -178,4 +178,4 @@ static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr, + } + } + +-#endif /* __DAV1D_SRC_REF_MVS_H__ */ ++#endif /* DAV1D_SRC_REF_MVS_H */ +diff --git third_party/dav1d/src/scan.h third_party/dav1d/src/scan.h +index 6322f4c71b2f..4bb4118b1318 100644 +--- third_party/dav1d/src/scan.h ++++ third_party/dav1d/src/scan.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SCAN_H__ +-#define __DAV1D_SCAN_H__ ++#ifndef DAV1D_SRC_SCAN_H ++#define DAV1D_SRC_SCAN_H + + #include + +@@ -34,4 +34,4 @@ + + extern const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3]; + +-#endif /* __DAV1D_SCAN_H__ */ ++#endif /* DAV1D_SRC_SCAN_H */ +diff --git third_party/dav1d/src/tables.c third_party/dav1d/src/tables.c +index 3fe46d4f330c..4117a2400602 100644 +--- third_party/dav1d/src/tables.c ++++ third_party/dav1d/src/tables.c +@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1 + { 2, 0, 22, -1 }, + }; + +-const int dav1d_sgr_x_by_xplus1[256] = { +- 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, +- 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, +- 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, +- 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, +- 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, +- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, +- 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, +- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, +- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, +- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, +- 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, +- 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +- 256, ++const uint8_t dav1d_sgr_x_by_x[256] = { ++ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, ++ 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, ++ 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, ++ 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ++ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ++ 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ++ 0 + }; + + const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { +@@ -608,108 +608,113 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { + } + }; + ++#if ARCH_X86 ++#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v2, v4, v6, v1, v3, v5, v7 } ++#else ++#define W(v0, v1, v2, v3, v4, v5, v6, v7) { v0, v1, v2, v3, v4, v5, v6, v7 } ++#endif + const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = { + // [-1, 0) +- { 0, 127, 0, 0, 0, 1, 0, 0 }, { 0, 127, 0, 0, -1, 2, 0, 0 }, +- { 1, 127, -1, 0, -3, 4, 0, 0 }, { 1, 126, -2, 0, -4, 6, 1, 0 }, +- { 1, 126, -3, 0, -5, 8, 1, 0 }, { 1, 125, -4, 0, -6, 11, 1, 0 }, +- { 1, 124, -4, 0, -7, 13, 1, 0 }, { 2, 123, -5, 0, -8, 15, 1, 0 }, +- { 2, 122, 
-6, 0, -9, 18, 1, 0 }, { 2, 121, -6, 0, -10, 20, 1, 0 }, +- { 2, 120, -7, 0, -11, 22, 2, 0 }, { 2, 119, -8, 0, -12, 25, 2, 0 }, +- { 3, 117, -8, 0, -13, 27, 2, 0 }, { 3, 116, -9, 0, -13, 29, 2, 0 }, +- { 3, 114, -10, 0, -14, 32, 3, 0 }, { 3, 113, -10, 0, -15, 35, 2, 0 }, +- { 3, 111, -11, 0, -15, 37, 3, 0 }, { 3, 109, -11, 0, -16, 40, 3, 0 }, +- { 3, 108, -12, 0, -16, 42, 3, 0 }, { 4, 106, -13, 0, -17, 45, 3, 0 }, +- { 4, 104, -13, 0, -17, 47, 3, 0 }, { 4, 102, -14, 0, -17, 50, 3, 0 }, +- { 4, 100, -14, 0, -17, 52, 3, 0 }, { 4, 98, -15, 0, -18, 55, 4, 0 }, +- { 4, 96, -15, 0, -18, 58, 3, 0 }, { 4, 94, -16, 0, -18, 60, 4, 0 }, +- { 4, 91, -16, 0, -18, 63, 4, 0 }, { 4, 89, -16, 0, -18, 65, 4, 0 }, +- { 4, 87, -17, 0, -18, 68, 4, 0 }, { 4, 85, -17, 0, -18, 70, 4, 0 }, +- { 4, 82, -17, 0, -18, 73, 4, 0 }, { 4, 80, -17, 0, -18, 75, 4, 0 }, +- { 4, 78, -18, 0, -18, 78, 4, 0 }, { 4, 75, -18, 0, -17, 80, 4, 0 }, +- { 4, 73, -18, 0, -17, 82, 4, 0 }, { 4, 70, -18, 0, -17, 85, 4, 0 }, +- { 4, 68, -18, 0, -17, 87, 4, 0 }, { 4, 65, -18, 0, -16, 89, 4, 0 }, +- { 4, 63, -18, 0, -16, 91, 4, 0 }, { 4, 60, -18, 0, -16, 94, 4, 0 }, +- { 3, 58, -18, 0, -15, 96, 4, 0 }, { 4, 55, -18, 0, -15, 98, 4, 0 }, +- { 3, 52, -17, 0, -14, 100, 4, 0 }, { 3, 50, -17, 0, -14, 102, 4, 0 }, +- { 3, 47, -17, 0, -13, 104, 4, 0 }, { 3, 45, -17, 0, -13, 106, 4, 0 }, +- { 3, 42, -16, 0, -12, 108, 3, 0 }, { 3, 40, -16, 0, -11, 109, 3, 0 }, +- { 3, 37, -15, 0, -11, 111, 3, 0 }, { 2, 35, -15, 0, -10, 113, 3, 0 }, +- { 3, 32, -14, 0, -10, 114, 3, 0 }, { 2, 29, -13, 0, -9, 116, 3, 0 }, +- { 2, 27, -13, 0, -8, 117, 3, 0 }, { 2, 25, -12, 0, -8, 119, 2, 0 }, +- { 2, 22, -11, 0, -7, 120, 2, 0 }, { 1, 20, -10, 0, -6, 121, 2, 0 }, +- { 1, 18, -9, 0, -6, 122, 2, 0 }, { 1, 15, -8, 0, -5, 123, 2, 0 }, +- { 1, 13, -7, 0, -4, 124, 1, 0 }, { 1, 11, -6, 0, -4, 125, 1, 0 }, +- { 1, 8, -5, 0, -3, 126, 1, 0 }, { 1, 6, -4, 0, -2, 126, 1, 0 }, +- { 0, 4, -3, 0, -1, 127, 1, 0 }, { 0, 2, -1, 0, 0, 127, 0, 0 }, ++ W( 0, 0, 127, 1, 0, 0, 0, 0 ), W( 0, - 1, 127, 2, 0, 0, 0, 0 ), ++ W( 1, - 3, 127, 4, - 1, 0, 0, 0 ), W( 1, - 4, 126, 6, - 2, 1, 0, 0 ), ++ W( 1, - 5, 126, 8, - 3, 1, 0, 0 ), W( 1, - 6, 125, 11, - 4, 1, 0, 0 ), ++ W( 1, - 7, 124, 13, - 4, 1, 0, 0 ), W( 2, - 8, 123, 15, - 5, 1, 0, 0 ), ++ W( 2, - 9, 122, 18, - 6, 1, 0, 0 ), W( 2, -10, 121, 20, - 6, 1, 0, 0 ), ++ W( 2, -11, 120, 22, - 7, 2, 0, 0 ), W( 2, -12, 119, 25, - 8, 2, 0, 0 ), ++ W( 3, -13, 117, 27, - 8, 2, 0, 0 ), W( 3, -13, 116, 29, - 9, 2, 0, 0 ), ++ W( 3, -14, 114, 32, -10, 3, 0, 0 ), W( 3, -15, 113, 35, -10, 2, 0, 0 ), ++ W( 3, -15, 111, 37, -11, 3, 0, 0 ), W( 3, -16, 109, 40, -11, 3, 0, 0 ), ++ W( 3, -16, 108, 42, -12, 3, 0, 0 ), W( 4, -17, 106, 45, -13, 3, 0, 0 ), ++ W( 4, -17, 104, 47, -13, 3, 0, 0 ), W( 4, -17, 102, 50, -14, 3, 0, 0 ), ++ W( 4, -17, 100, 52, -14, 3, 0, 0 ), W( 4, -18, 98, 55, -15, 4, 0, 0 ), ++ W( 4, -18, 96, 58, -15, 3, 0, 0 ), W( 4, -18, 94, 60, -16, 4, 0, 0 ), ++ W( 4, -18, 91, 63, -16, 4, 0, 0 ), W( 4, -18, 89, 65, -16, 4, 0, 0 ), ++ W( 4, -18, 87, 68, -17, 4, 0, 0 ), W( 4, -18, 85, 70, -17, 4, 0, 0 ), ++ W( 4, -18, 82, 73, -17, 4, 0, 0 ), W( 4, -18, 80, 75, -17, 4, 0, 0 ), ++ W( 4, -18, 78, 78, -18, 4, 0, 0 ), W( 4, -17, 75, 80, -18, 4, 0, 0 ), ++ W( 4, -17, 73, 82, -18, 4, 0, 0 ), W( 4, -17, 70, 85, -18, 4, 0, 0 ), ++ W( 4, -17, 68, 87, -18, 4, 0, 0 ), W( 4, -16, 65, 89, -18, 4, 0, 0 ), ++ W( 4, -16, 63, 91, -18, 4, 0, 0 ), W( 4, -16, 60, 94, -18, 4, 0, 0 ), ++ W( 3, -15, 58, 96, -18, 4, 0, 0 ), W( 4, -15, 55, 98, -18, 4, 0, 0 ), ++ W( 3, -14, 52, 100, -17, 4, 
0, 0 ), W( 3, -14, 50, 102, -17, 4, 0, 0 ), ++ W( 3, -13, 47, 104, -17, 4, 0, 0 ), W( 3, -13, 45, 106, -17, 4, 0, 0 ), ++ W( 3, -12, 42, 108, -16, 3, 0, 0 ), W( 3, -11, 40, 109, -16, 3, 0, 0 ), ++ W( 3, -11, 37, 111, -15, 3, 0, 0 ), W( 2, -10, 35, 113, -15, 3, 0, 0 ), ++ W( 3, -10, 32, 114, -14, 3, 0, 0 ), W( 2, - 9, 29, 116, -13, 3, 0, 0 ), ++ W( 2, - 8, 27, 117, -13, 3, 0, 0 ), W( 2, - 8, 25, 119, -12, 2, 0, 0 ), ++ W( 2, - 7, 22, 120, -11, 2, 0, 0 ), W( 1, - 6, 20, 121, -10, 2, 0, 0 ), ++ W( 1, - 6, 18, 122, - 9, 2, 0, 0 ), W( 1, - 5, 15, 123, - 8, 2, 0, 0 ), ++ W( 1, - 4, 13, 124, - 7, 1, 0, 0 ), W( 1, - 4, 11, 125, - 6, 1, 0, 0 ), ++ W( 1, - 3, 8, 126, - 5, 1, 0, 0 ), W( 1, - 2, 6, 126, - 4, 1, 0, 0 ), ++ W( 0, - 1, 4, 127, - 3, 1, 0, 0 ), W( 0, 0, 2, 127, - 1, 0, 0, 0 ), + // [0, 1) +- { 0, 0, 1, 0, 0, 127, 0, 0 }, { 0, -1, 2, 0, 0, 127, 0, 0 }, +- { 0, -3, 4, 1, 1, 127, -2, 0 }, { 0, -5, 6, 1, 1, 127, -2, 0 }, +- { 0, -6, 8, 1, 2, 126, -3, 0 }, { -1, -7, 11, 2, 2, 126, -4, -1 }, +- { -1, -8, 13, 2, 3, 125, -5, -1 }, { -1, -10, 16, 3, 3, 124, -6, -1 }, +- { -1, -11, 18, 3, 4, 123, -7, -1 }, { -1, -12, 20, 3, 4, 122, -7, -1 }, +- { -1, -13, 23, 3, 4, 121, -8, -1 }, { -2, -14, 25, 4, 5, 120, -9, -1 }, +- { -1, -15, 27, 4, 5, 119, -10, -1 }, { -1, -16, 30, 4, 5, 118, -11, -1 }, +- { -2, -17, 33, 5, 6, 116, -12, -1 }, { -2, -17, 35, 5, 6, 114, -12, -1 }, +- { -2, -18, 38, 5, 6, 113, -13, -1 }, { -2, -19, 41, 6, 7, 111, -14, -2 }, +- { -2, -19, 43, 6, 7, 110, -15, -2 }, { -2, -20, 46, 6, 7, 108, -15, -2 }, +- { -2, -20, 49, 6, 7, 106, -16, -2 }, { -2, -21, 51, 7, 7, 104, -16, -2 }, +- { -2, -21, 54, 7, 7, 102, -17, -2 }, { -2, -21, 56, 7, 8, 100, -18, -2 }, +- { -2, -22, 59, 7, 8, 98, -18, -2 }, { -2, -22, 62, 7, 8, 96, -19, -2 }, +- { -2, -22, 64, 7, 8, 94, -19, -2 }, { -2, -22, 67, 8, 8, 91, -20, -2 }, +- { -2, -22, 69, 8, 8, 89, -20, -2 }, { -2, -22, 72, 8, 8, 87, -21, -2 }, +- { -2, -21, 74, 8, 8, 84, -21, -2 }, { -2, -22, 77, 8, 8, 82, -21, -2 }, +- { -2, -21, 79, 8, 8, 79, -21, -2 }, { -2, -21, 82, 8, 8, 77, -22, -2 }, +- { -2, -21, 84, 8, 8, 74, -21, -2 }, { -2, -21, 87, 8, 8, 72, -22, -2 }, +- { -2, -20, 89, 8, 8, 69, -22, -2 }, { -2, -20, 91, 8, 8, 67, -22, -2 }, +- { -2, -19, 94, 8, 7, 64, -22, -2 }, { -2, -19, 96, 8, 7, 62, -22, -2 }, +- { -2, -18, 98, 8, 7, 59, -22, -2 }, { -2, -18, 100, 8, 7, 56, -21, -2 }, +- { -2, -17, 102, 7, 7, 54, -21, -2 }, { -2, -16, 104, 7, 7, 51, -21, -2 }, +- { -2, -16, 106, 7, 6, 49, -20, -2 }, { -2, -15, 108, 7, 6, 46, -20, -2 }, +- { -2, -15, 110, 7, 6, 43, -19, -2 }, { -2, -14, 111, 7, 6, 41, -19, -2 }, +- { -1, -13, 113, 6, 5, 38, -18, -2 }, { -1, -12, 114, 6, 5, 35, -17, -2 }, +- { -1, -12, 116, 6, 5, 33, -17, -2 }, { -1, -11, 118, 5, 4, 30, -16, -1 }, +- { -1, -10, 119, 5, 4, 27, -15, -1 }, { -1, -9, 120, 5, 4, 25, -14, -2 }, +- { -1, -8, 121, 4, 3, 23, -13, -1 }, { -1, -7, 122, 4, 3, 20, -12, -1 }, +- { -1, -7, 123, 4, 3, 18, -11, -1 }, { -1, -6, 124, 3, 3, 16, -10, -1 }, +- { -1, -5, 125, 3, 2, 13, -8, -1 }, { -1, -4, 126, 2, 2, 11, -7, -1 }, +- { 0, -3, 126, 2, 1, 8, -6, 0 }, { 0, -2, 127, 1, 1, 6, -5, 0 }, +- { 0, -2, 127, 1, 1, 4, -3, 0 }, { 0, 0, 127, 0, 0, 2, -1, 0 }, ++ W( 0, 0, 0, 127, 1, 0, 0, 0),W( 0, 0, -1, 127, 2, 0, 0, 0), ++ W( 0, 1, -3, 127, 4, -2, 1, 0),W( 0, 1, -5, 127, 6, -2, 1, 0), ++ W( 0, 2, -6, 126, 8, -3, 1, 0),W(-1, 2, -7, 126, 11, -4, 2, -1), ++ W(-1, 3, -8, 125, 13, -5, 2, -1),W(-1, 3, -10, 124, 16, -6, 3, -1), ++ W(-1, 4, -11, 123, 18, -7, 3, -1),W(-1, 4, -12, 122, 20, -7, 3, -1), ++ W(-1, 4, -13, 121, 23, -8, 3, 
-1),W(-2, 5, -14, 120, 25, -9, 4, -1), ++ W(-1, 5, -15, 119, 27, -10, 4, -1),W(-1, 5, -16, 118, 30, -11, 4, -1), ++ W(-2, 6, -17, 116, 33, -12, 5, -1),W(-2, 6, -17, 114, 35, -12, 5, -1), ++ W(-2, 6, -18, 113, 38, -13, 5, -1),W(-2, 7, -19, 111, 41, -14, 6, -2), ++ W(-2, 7, -19, 110, 43, -15, 6, -2),W(-2, 7, -20, 108, 46, -15, 6, -2), ++ W(-2, 7, -20, 106, 49, -16, 6, -2),W(-2, 7, -21, 104, 51, -16, 7, -2), ++ W(-2, 7, -21, 102, 54, -17, 7, -2),W(-2, 8, -21, 100, 56, -18, 7, -2), ++ W(-2, 8, -22, 98, 59, -18, 7, -2),W(-2, 8, -22, 96, 62, -19, 7, -2), ++ W(-2, 8, -22, 94, 64, -19, 7, -2),W(-2, 8, -22, 91, 67, -20, 8, -2), ++ W(-2, 8, -22, 89, 69, -20, 8, -2),W(-2, 8, -22, 87, 72, -21, 8, -2), ++ W(-2, 8, -21, 84, 74, -21, 8, -2),W(-2, 8, -22, 82, 77, -21, 8, -2), ++ W(-2, 8, -21, 79, 79, -21, 8, -2),W(-2, 8, -21, 77, 82, -22, 8, -2), ++ W(-2, 8, -21, 74, 84, -21, 8, -2),W(-2, 8, -21, 72, 87, -22, 8, -2), ++ W(-2, 8, -20, 69, 89, -22, 8, -2),W(-2, 8, -20, 67, 91, -22, 8, -2), ++ W(-2, 7, -19, 64, 94, -22, 8, -2),W(-2, 7, -19, 62, 96, -22, 8, -2), ++ W(-2, 7, -18, 59, 98, -22, 8, -2),W(-2, 7, -18, 56, 100, -21, 8, -2), ++ W(-2, 7, -17, 54, 102, -21, 7, -2),W(-2, 7, -16, 51, 104, -21, 7, -2), ++ W(-2, 6, -16, 49, 106, -20, 7, -2),W(-2, 6, -15, 46, 108, -20, 7, -2), ++ W(-2, 6, -15, 43, 110, -19, 7, -2),W(-2, 6, -14, 41, 111, -19, 7, -2), ++ W(-1, 5, -13, 38, 113, -18, 6, -2),W(-1, 5, -12, 35, 114, -17, 6, -2), ++ W(-1, 5, -12, 33, 116, -17, 6, -2),W(-1, 4, -11, 30, 118, -16, 5, -1), ++ W(-1, 4, -10, 27, 119, -15, 5, -1),W(-1, 4, -9, 25, 120, -14, 5, -2), ++ W(-1, 3, -8, 23, 121, -13, 4, -1),W(-1, 3, -7, 20, 122, -12, 4, -1), ++ W(-1, 3, -7, 18, 123, -11, 4, -1),W(-1, 3, -6, 16, 124, -10, 3, -1), ++ W(-1, 2, -5, 13, 125, -8, 3, -1),W(-1, 2, -4, 11, 126, -7, 2, -1), ++ W( 0, 1, -3, 8, 126, -6, 2, 0),W( 0, 1, -2, 6, 127, -5, 1, 0), ++ W( 0, 1, -2, 4, 127, -3, 1, 0),W( 0, 0, 0, 2, 127, -1, 0, 0), + // [1, 2) +- { 0, 0, 127, 0, 0, 1, 0, 0 }, { 0, 0, 127, 0, 0, -1, 2, 0 }, +- { 0, 1, 127, -1, 0, -3, 4, 0 }, { 0, 1, 126, -2, 0, -4, 6, 1 }, +- { 0, 1, 126, -3, 0, -5, 8, 1 }, { 0, 1, 125, -4, 0, -6, 11, 1 }, +- { 0, 1, 124, -4, 0, -7, 13, 1 }, { 0, 2, 123, -5, 0, -8, 15, 1 }, +- { 0, 2, 122, -6, 0, -9, 18, 1 }, { 0, 2, 121, -6, 0, -10, 20, 1 }, +- { 0, 2, 120, -7, 0, -11, 22, 2 }, { 0, 2, 119, -8, 0, -12, 25, 2 }, +- { 0, 3, 117, -8, 0, -13, 27, 2 }, { 0, 3, 116, -9, 0, -13, 29, 2 }, +- { 0, 3, 114, -10, 0, -14, 32, 3 }, { 0, 3, 113, -10, 0, -15, 35, 2 }, +- { 0, 3, 111, -11, 0, -15, 37, 3 }, { 0, 3, 109, -11, 0, -16, 40, 3 }, +- { 0, 3, 108, -12, 0, -16, 42, 3 }, { 0, 4, 106, -13, 0, -17, 45, 3 }, +- { 0, 4, 104, -13, 0, -17, 47, 3 }, { 0, 4, 102, -14, 0, -17, 50, 3 }, +- { 0, 4, 100, -14, 0, -17, 52, 3 }, { 0, 4, 98, -15, 0, -18, 55, 4 }, +- { 0, 4, 96, -15, 0, -18, 58, 3 }, { 0, 4, 94, -16, 0, -18, 60, 4 }, +- { 0, 4, 91, -16, 0, -18, 63, 4 }, { 0, 4, 89, -16, 0, -18, 65, 4 }, +- { 0, 4, 87, -17, 0, -18, 68, 4 }, { 0, 4, 85, -17, 0, -18, 70, 4 }, +- { 0, 4, 82, -17, 0, -18, 73, 4 }, { 0, 4, 80, -17, 0, -18, 75, 4 }, +- { 0, 4, 78, -18, 0, -18, 78, 4 }, { 0, 4, 75, -18, 0, -17, 80, 4 }, +- { 0, 4, 73, -18, 0, -17, 82, 4 }, { 0, 4, 70, -18, 0, -17, 85, 4 }, +- { 0, 4, 68, -18, 0, -17, 87, 4 }, { 0, 4, 65, -18, 0, -16, 89, 4 }, +- { 0, 4, 63, -18, 0, -16, 91, 4 }, { 0, 4, 60, -18, 0, -16, 94, 4 }, +- { 0, 3, 58, -18, 0, -15, 96, 4 }, { 0, 4, 55, -18, 0, -15, 98, 4 }, +- { 0, 3, 52, -17, 0, -14, 100, 4 }, { 0, 3, 50, -17, 0, -14, 102, 4 }, +- { 0, 3, 47, -17, 0, -13, 104, 4 }, { 0, 3, 45, -17, 0, -13, 
106, 4 }, +- { 0, 3, 42, -16, 0, -12, 108, 3 }, { 0, 3, 40, -16, 0, -11, 109, 3 }, +- { 0, 3, 37, -15, 0, -11, 111, 3 }, { 0, 2, 35, -15, 0, -10, 113, 3 }, +- { 0, 3, 32, -14, 0, -10, 114, 3 }, { 0, 2, 29, -13, 0, -9, 116, 3 }, +- { 0, 2, 27, -13, 0, -8, 117, 3 }, { 0, 2, 25, -12, 0, -8, 119, 2 }, +- { 0, 2, 22, -11, 0, -7, 120, 2 }, { 0, 1, 20, -10, 0, -6, 121, 2 }, +- { 0, 1, 18, -9, 0, -6, 122, 2 }, { 0, 1, 15, -8, 0, -5, 123, 2 }, +- { 0, 1, 13, -7, 0, -4, 124, 1 }, { 0, 1, 11, -6, 0, -4, 125, 1 }, +- { 0, 1, 8, -5, 0, -3, 126, 1 }, { 0, 1, 6, -4, 0, -2, 126, 1 }, +- { 0, 0, 4, -3, 0, -1, 127, 1 }, { 0, 0, 2, -1, 0, 0, 127, 0 }, ++ W( 0, 0, 0, 1, 127, 0, 0, 0 ),W( 0, 0, 0, - 1, 127, 2, 0, 0 ), ++ W( 0, 0, 1, - 3, 127, 4, - 1, 0 ), W( 0, 0, 1, - 4, 126, 6, - 2, 1 ), ++ W( 0, 0, 1, - 5, 126, 8, - 3, 1 ), W( 0, 0, 1, - 6, 125, 11, - 4, 1 ), ++ W( 0, 0, 1, - 7, 124, 13, - 4, 1 ), W( 0, 0, 2, - 8, 123, 15, - 5, 1 ), ++ W( 0, 0, 2, - 9, 122, 18, - 6, 1 ), W( 0, 0, 2, -10, 121, 20, - 6, 1 ), ++ W( 0, 0, 2, -11, 120, 22, - 7, 2 ), W( 0, 0, 2, -12, 119, 25, - 8, 2 ), ++ W( 0, 0, 3, -13, 117, 27, - 8, 2 ), W( 0, 0, 3, -13, 116, 29, - 9, 2 ), ++ W( 0, 0, 3, -14, 114, 32, -10, 3 ), W( 0, 0, 3, -15, 113, 35, -10, 2 ), ++ W( 0, 0, 3, -15, 111, 37, -11, 3 ), W( 0, 0, 3, -16, 109, 40, -11, 3 ), ++ W( 0, 0, 3, -16, 108, 42, -12, 3 ), W( 0, 0, 4, -17, 106, 45, -13, 3 ), ++ W( 0, 0, 4, -17, 104, 47, -13, 3 ), W( 0, 0, 4, -17, 102, 50, -14, 3 ), ++ W( 0, 0, 4, -17, 100, 52, -14, 3 ), W( 0, 0, 4, -18, 98, 55, -15, 4 ), ++ W( 0, 0, 4, -18, 96, 58, -15, 3 ), W( 0, 0, 4, -18, 94, 60, -16, 4 ), ++ W( 0, 0, 4, -18, 91, 63, -16, 4 ), W( 0, 0, 4, -18, 89, 65, -16, 4 ), ++ W( 0, 0, 4, -18, 87, 68, -17, 4 ), W( 0, 0, 4, -18, 85, 70, -17, 4 ), ++ W( 0, 0, 4, -18, 82, 73, -17, 4 ), W( 0, 0, 4, -18, 80, 75, -17, 4 ), ++ W( 0, 0, 4, -18, 78, 78, -18, 4 ), W( 0, 0, 4, -17, 75, 80, -18, 4 ), ++ W( 0, 0, 4, -17, 73, 82, -18, 4 ), W( 0, 0, 4, -17, 70, 85, -18, 4 ), ++ W( 0, 0, 4, -17, 68, 87, -18, 4 ), W( 0, 0, 4, -16, 65, 89, -18, 4 ), ++ W( 0, 0, 4, -16, 63, 91, -18, 4 ), W( 0, 0, 4, -16, 60, 94, -18, 4 ), ++ W( 0, 0, 3, -15, 58, 96, -18, 4 ), W( 0, 0, 4, -15, 55, 98, -18, 4 ), ++ W( 0, 0, 3, -14, 52, 100, -17, 4 ), W( 0, 0, 3, -14, 50, 102, -17, 4 ), ++ W( 0, 0, 3, -13, 47, 104, -17, 4 ), W( 0, 0, 3, -13, 45, 106, -17, 4 ), ++ W( 0, 0, 3, -12, 42, 108, -16, 3 ), W( 0, 0, 3, -11, 40, 109, -16, 3 ), ++ W( 0, 0, 3, -11, 37, 111, -15, 3 ), W( 0, 0, 2, -10, 35, 113, -15, 3 ), ++ W( 0, 0, 3, -10, 32, 114, -14, 3 ), W( 0, 0, 2, - 9, 29, 116, -13, 3 ), ++ W( 0, 0, 2, - 8, 27, 117, -13, 3 ), W( 0, 0, 2, - 8, 25, 119, -12, 2 ), ++ W( 0, 0, 2, - 7, 22, 120, -11, 2 ), W( 0, 0, 1, - 6, 20, 121, -10, 2 ), ++ W( 0, 0, 1, - 6, 18, 122, - 9, 2 ), W( 0, 0, 1, - 5, 15, 123, - 8, 2 ), ++ W( 0, 0, 1, - 4, 13, 124, - 7, 1 ), W( 0, 0, 1, - 4, 11, 125, - 6, 1 ), ++ W( 0, 0, 1, - 3, 8, 126, - 5, 1 ), W( 0, 0, 1, - 2, 6, 126, - 4, 1 ), ++ W( 0, 0, 0, - 1, 4, 127, - 3, 1 ), W( 0, 0, 0, 0, 2, 127, - 1, 0 ), + // dummy (replicate row index 191) +- { 0, 0, 2, -1, 0, 0, 127, 0 } ++ W( 0, 0, 0, 0, 2, 127, - 1, 0 ), + }; + + const int16_t dav1d_resize_filter[64][8] = { +diff --git third_party/dav1d/src/tables.h third_party/dav1d/src/tables.h +index c2e6e3609084..d2a046c856ed 100644 +--- third_party/dav1d/src/tables.h ++++ third_party/dav1d/src/tables.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_TABLES_H__ +-#define __DAV1D_SRC_TABLES_H__ ++#ifndef DAV1D_SRC_TABLES_H ++#define DAV1D_SRC_TABLES_H + + #include + +@@ -107,7 +107,7 @@ static const unsigned interintra_allowed_mask = + extern const Dav1dWarpedMotionParams dav1d_default_wm_params; + + extern const int16_t dav1d_sgr_params[16][4]; +-extern const int dav1d_sgr_x_by_xplus1[256]; ++extern const uint8_t dav1d_sgr_x_by_x[256]; + + extern const int8_t dav1d_mc_subpel_filters[5][15][8]; + extern const int8_t dav1d_mc_warp_filter[193][8]; +@@ -121,4 +121,4 @@ extern const uint8_t dav1d_obmc_masks[64]; + + extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs + +-#endif /* __DAV1D_SRC_TABLES_H__ */ ++#endif /* DAV1D_SRC_TABLES_H */ +diff --git third_party/dav1d/src/thread.h third_party/dav1d/src/thread.h +index 01004ea8f7a7..f8da6289d959 100644 +--- third_party/dav1d/src/thread.h ++++ third_party/dav1d/src/thread.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_THREAD_H__ +-# define __DAV1D_THREAD_H__ ++#ifndef DAV1D_SRC_THREAD_H ++#define DAV1D_SRC_THREAD_H + + #if defined(_WIN32) + +@@ -34,62 +34,72 @@ + + #define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT + ++typedef struct { ++ HANDLE h; ++ void *(*func)(void*); ++ void *arg; ++} pthread_t; ++ + typedef SRWLOCK pthread_mutex_t; + typedef CONDITION_VARIABLE pthread_cond_t; + typedef INIT_ONCE pthread_once_t; +-typedef void *pthread_t; +-typedef void *pthread_mutexattr_t; +-typedef void *pthread_condattr_t; +-typedef void *pthread_attr_t; +- +-int dav1d_pthread_create(pthread_t* thread, const pthread_attr_t* attr, +- void*(*proc)(void*), void* param); +-void dav1d_pthread_join(pthread_t thread, void** res); ++ ++int dav1d_pthread_create(pthread_t *thread, const void *attr, ++ void *(*func)(void*), void *arg); ++int dav1d_pthread_join(pthread_t *thread, void **res); + int dav1d_pthread_once(pthread_once_t *once_control, + void (*init_routine)(void)); + + #define pthread_create dav1d_pthread_create +-#define pthread_join dav1d_pthread_join ++#define pthread_join(thread, res) dav1d_pthread_join(&(thread), res) + #define pthread_once dav1d_pthread_once + +-static inline void pthread_mutex_init(pthread_mutex_t* mutex, +- const pthread_mutexattr_t* attr) ++static inline int pthread_mutex_init(pthread_mutex_t *const mutex, ++ const void *const attr) + { +- (void)attr; + InitializeSRWLock(mutex); ++ return 0; + } + +-static inline void pthread_mutex_destroy(pthread_mutex_t* mutex) { +- (void)mutex; ++static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) { ++ return 0; + } + +-static inline void pthread_mutex_lock(pthread_mutex_t* mutex) { ++static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) { + AcquireSRWLockExclusive(mutex); ++ return 0; + } + +-static inline void pthread_mutex_unlock(pthread_mutex_t* mutex) { ++static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + ReleaseSRWLockExclusive(mutex); ++ return 0; + } + +-static inline void pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t* attr) { +- (void)attr; ++static inline int pthread_cond_init(pthread_cond_t *const cond, ++ const void *const attr) ++{ + InitializeConditionVariable(cond); ++ return 0; + } + +-static inline void pthread_cond_destroy(pthread_cond_t* cond) { +- (void)cond; ++static inline int pthread_cond_destroy(pthread_cond_t *const cond) { ++ return 0; + } + +-static inline void pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) { +- 
SleepConditionVariableSRW(cond, mutex, INFINITE, 0); ++static inline int pthread_cond_wait(pthread_cond_t *const cond, ++ pthread_mutex_t *const mutex) ++{ ++ return !SleepConditionVariableSRW(cond, mutex, INFINITE, 0); + } + +-static inline void pthread_cond_signal(pthread_cond_t* cond) { ++static inline int pthread_cond_signal(pthread_cond_t *const cond) { + WakeConditionVariable(cond); ++ return 0; + } + +-static inline void pthread_cond_broadcast(pthread_cond_t* cond) { ++static inline int pthread_cond_broadcast(pthread_cond_t *const cond) { + WakeAllConditionVariable(cond); ++ return 0; + } + + #else +@@ -98,4 +108,4 @@ static inline void pthread_cond_broadcast(pthread_cond_t* cond) { + + #endif + +-#endif // __DAV1D_THREAD_H__ ++#endif /* DAV1D_SRC_THREAD_H */ +diff --git third_party/dav1d/src/thread_data.h third_party/dav1d/src/thread_data.h +index 7dae029ed20d..62814e63480d 100644 +--- third_party/dav1d/src/thread_data.h ++++ third_party/dav1d/src/thread_data.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_THREAD_DATA_H__ +-#define __DAV1D_SRC_THREAD_DATA_H__ ++#ifndef DAV1D_SRC_THREAD_DATA_H ++#define DAV1D_SRC_THREAD_DATA_H + + #include "src/thread.h" + +@@ -34,6 +34,7 @@ struct thread_data { + pthread_t thread; + pthread_cond_t cond; + pthread_mutex_t lock; ++ int inited; + }; + +-#endif /* __DAV1D_SRC_THREAD_DATA_H__ */ ++#endif /* DAV1D_SRC_THREAD_DATA_H */ +diff --git third_party/dav1d/src/thread_task.c third_party/dav1d/src/thread_task.c +index 7f9ef24e9761..4dc6044c5ed4 100644 +--- third_party/dav1d/src/thread_task.c ++++ third_party/dav1d/src/thread_task.c +@@ -59,7 +59,7 @@ void *dav1d_tile_task(void *const data) { + Dav1dTileContext *const t = data; + struct FrameTileThreadData *const fttd = t->tile_thread.fttd; + const Dav1dFrameContext *const f = t->f; +- const int tile_thread_idx = t - f->tc; ++ const int tile_thread_idx = (int) (t - f->tc); + const uint64_t mask = 1ULL << tile_thread_idx; + + for (;;) { +diff --git third_party/dav1d/src/thread_task.h third_party/dav1d/src/thread_task.h +index 2d27f25a5651..309a714255a2 100644 +--- third_party/dav1d/src/thread_task.h ++++ third_party/dav1d/src/thread_task.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_THREAD_TASK_H__ +-#define __DAV1D_SRC_THREAD_TASK_H__ ++#ifndef DAV1D_SRC_THREAD_TASK_H ++#define DAV1D_SRC_THREAD_TASK_H + + #include + +@@ -41,4 +41,4 @@ void *dav1d_frame_task(void *data); + int dav1d_decode_tile_sbrow(Dav1dTileContext *t); + void *dav1d_tile_task(void *data); + +-#endif /* __DAV1D_SRC_THREAD_TASK_H__ */ ++#endif /* DAV1D_SRC_THREAD_TASK_H */ +diff --git third_party/dav1d/src/warpmv.c third_party/dav1d/src/warpmv.c +index 0666d58bbad8..ae31315aec91 100644 +--- third_party/dav1d/src/warpmv.c ++++ third_party/dav1d/src/warpmv.c +@@ -90,10 +90,10 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) { + const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]); + const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y; + const int rnd = (1 << shift) >> 1; +- wm->gamma = iclip_wmp(apply_sign64((llabs(v1) + rnd) >> shift, v1)); ++ wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1)); + const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y; + wm->delta = iclip_wmp(mat[5] - +- (int) apply_sign64((llabs(v2) + rnd) >> shift, v2) - ++ apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) - + 0x10000); + + return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) || +@@ -115,7 +115,9 @@ static int get_mult_shift_ndiag(const int64_t px, + const int idet, const int shift) + { + const int64_t v1 = px * idet; +- const int v2 = apply_sign64((llabs(v1) + ((1LL << shift) >> 1)) >> shift, v1); ++ const int v2 = apply_sign64((int) ((llabs(v1) + ++ ((1LL << shift) >> 1)) >> shift), ++ v1); + return iclip(v2, -0x1fff, 0x1fff); + } + +@@ -123,7 +125,9 @@ static int get_mult_shift_diag(const int64_t px, + const int idet, const int shift) + { + const int64_t v1 = px * idet; +- const int v2 = apply_sign64((llabs(v1) + ((1LL << shift) >> 1)) >> shift, v1); ++ const int v2 = apply_sign64((int) ((llabs(v1) + ++ ((1LL << shift) >> 1)) >> shift), ++ v1); + return iclip(v2, 0xe001, 0x11fff); + } + +diff --git third_party/dav1d/src/warpmv.h third_party/dav1d/src/warpmv.h +index bffb4f24427d..f1da61412395 100644 +--- third_party/dav1d/src/warpmv.h ++++ third_party/dav1d/src/warpmv.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_WARPMV_H__ +-#define __DAV1D_SRC_WARPMV_H__ ++#ifndef DAV1D_SRC_WARPMV_H ++#define DAV1D_SRC_WARPMV_H + + #include "src/levels.h" + +@@ -34,4 +34,4 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *wm); + int dav1d_find_affine_int(const int (*pts)[2][2], int np, int bw4, int bh4, + mv mv, Dav1dWarpedMotionParams *wm, int by, int bx); + +-#endif /* __DAV1D_SRC_WARPMV_H__ */ ++#endif /* DAV1D_SRC_WARPMV_H */ +diff --git third_party/dav1d/src/wedge.h third_party/dav1d/src/wedge.h +index 787ff33475e8..45f0570a2700 100644 +--- third_party/dav1d/src/wedge.h ++++ third_party/dav1d/src/wedge.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_SRC_WEDGE_H__ +-#define __DAV1D_SRC_WEDGE_H__ ++#ifndef DAV1D_SRC_WEDGE_H ++#define DAV1D_SRC_WEDGE_H + + #include "src/levels.h" + +@@ -38,4 +38,4 @@ void dav1d_init_interintra_masks(void); + extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] + [N_INTER_INTRA_PRED_MODES]; + +-#endif /* __DAV1D_SRC_WEDGE_H__ */ ++#endif /* DAV1D_SRC_WEDGE_H */ +diff --git third_party/dav1d/src/win32/thread.c third_party/dav1d/src/win32/thread.c +index 14140a7ea3ad..c3fb557958a8 100644 +--- third_party/dav1d/src/win32/thread.c ++++ third_party/dav1d/src/win32/thread.c +@@ -29,75 +29,50 @@ + + #if defined(_WIN32) + +-#include + #include + #include + #include + +-#include "config.h" + #include "src/thread.h" + +-typedef struct dav1d_win32_thread_t { +- HANDLE h; +- void* param; +- void*(*proc)(void*); +- void* res; +-} dav1d_win32_thread_t; +- +-static unsigned __stdcall dav1d_thread_entrypoint(void* data) { +- dav1d_win32_thread_t* t = data; +- t->res = t->proc(t->param); ++static unsigned __stdcall thread_entrypoint(void *const data) { ++ pthread_t *const t = data; ++ t->arg = t->func(t->arg); + return 0; + } + +-int dav1d_pthread_create(pthread_t* thread, const pthread_attr_t* attr, +- void*(*proc)(void*), void* param) ++int dav1d_pthread_create(pthread_t *const thread, const void *const attr, ++ void *(*const func)(void*), void *const arg) + { +- dav1d_win32_thread_t* th = *thread = malloc(sizeof(*th)); +- (void)attr; +- if (th == NULL) +- return ENOMEM; +- th->proc = proc; +- th->param = param; +- uintptr_t h = _beginthreadex(NULL, 0, dav1d_thread_entrypoint, th, 0, NULL); +- if ( h == 0 ) { +- int err = errno; +- free(th); +- *thread = NULL; +- return err; +- } +- th->h = (HANDLE)h; +- return 0; ++ thread->func = func; ++ thread->arg = arg; ++ thread->h = (HANDLE)_beginthreadex(NULL, 0, thread_entrypoint, ++ thread, 0, NULL); ++ return !thread->h; + } + +-void dav1d_pthread_join(pthread_t thread, void** res) { +- dav1d_win32_thread_t* th = thread; +- WaitForSingleObject(th->h, INFINITE); ++int dav1d_pthread_join(pthread_t *const thread, void **const res) { ++ if (WaitForSingleObject(thread->h, INFINITE)) ++ return 1; + +- if (res != NULL) +- *res = th->res; +- CloseHandle(th->h); +- free(th); ++ if (res) ++ *res = thread->arg; ++ ++ return !CloseHandle(thread->h); + } + +-int dav1d_pthread_once(pthread_once_t *once_control, +- void (*init_routine)(void)) ++int dav1d_pthread_once(pthread_once_t *const once_control, ++ void (*const init_routine)(void)) + { +- BOOL fPending = FALSE; +- BOOL fStatus; ++ BOOL pending = FALSE; + +- fStatus = InitOnceBeginInitialize(once_control, 0, &fPending, NULL); +- if (fStatus != TRUE) +- return EINVAL; ++ if (InitOnceBeginInitialize(once_control, 0, &pending, NULL) != TRUE) ++ return 1; + +- if (fPending == TRUE) ++ if (pending == TRUE) + init_routine(); + +- fStatus = InitOnceComplete(once_control, 0, NULL); +- if (!fStatus) +- return EINVAL; +- +- return 0; ++ return !InitOnceComplete(once_control, 0, NULL); + } + + #endif +diff --git third_party/dav1d/src/x86/cdef.asm third_party/dav1d/src/x86/cdef.asm +index 6e60cadc31d2..43f6196c631f 100644 +--- third_party/dav1d/src/x86/cdef.asm ++++ third_party/dav1d/src/x86/cdef.asm +@@ -29,15 +29,17 @@ + %if ARCH_X86_64 + + SECTION_RODATA 32 +-pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3 ++pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 + div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 + dd 420, 210, 140, 105 +-pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7 + shufw_6543210x: db 12, 13, 10, 11, 8, 
9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +-shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + pw_128: times 2 dw 128 + pw_2048: times 2 dw 2048 +-tap_table: dw 4, 2, 3, 3, 2, 1 ++tap_table: ; masks for 8 bit shifts ++ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 ++ ; weights ++ db 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 +@@ -56,67 +58,73 @@ tap_table: dw 4, 2, 3, 3, 2, 1 + + SECTION .text + +-%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride ++%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 +-%if %5 == 4 +- movq xm5, [stkq+offq*2+%6*0] ; p0 +- movq xm6, [stkq+offq*2+%6*2] +- movhps xm5, [stkq+offq*2+%6*1] +- movhps xm6, [stkq+offq*2+%6*3] ++%if %6 == 4 ++ movq xm5, [stkq+offq*2+%7*0] ; p0 ++ movq xm6, [stkq+offq*2+%7*2] ++ movhps xm5, [stkq+offq*2+%7*1] ++ movhps xm6, [stkq+offq*2+%7*3] + vinserti128 m5, xm6, 1 + %else +- movu xm5, [stkq+offq*2+%6*0] ; p0 +- vinserti128 m5, [stkq+offq*2+%6*1], 1 ++ movu xm5, [stkq+offq*2+%7*0] ; p0 ++ vinserti128 m5, [stkq+offq*2+%7*1], 1 + %endif + neg offq ; -off1 +-%if %5 == 4 +- movq xm6, [stkq+offq*2+%6*0] ; p1 +- movq xm9, [stkq+offq*2+%6*2] +- movhps xm6, [stkq+offq*2+%6*1] +- movhps xm9, [stkq+offq*2+%6*3] ++%if %6 == 4 ++ movq xm6, [stkq+offq*2+%7*0] ; p1 ++ movq xm9, [stkq+offq*2+%7*2] ++ movhps xm6, [stkq+offq*2+%7*1] ++ movhps xm9, [stkq+offq*2+%7*3] + vinserti128 m6, xm9, 1 + %else +- movu xm6, [stkq+offq*2+%6*0] ; p1 +- vinserti128 m6, [stkq+offq*2+%6*1], 1 ++ movu xm6, [stkq+offq*2+%7*0] ; p1 ++ vinserti128 m6, [stkq+offq*2+%7*1], 1 + %endif +- pcmpeqw m9, m14, m5 +- pcmpeqw m10, m14, m6 +- pandn m9, m5 +- pandn m10, m6 +- pmaxsw m7, m9 ; max after p0 +- pminsw m8, m5 ; min after p0 +- pmaxsw m7, m10 ; max after p1 +- pminsw m8, m6 ; min after p1 ++ ; out of bounds values are set to a value that is a both a large unsigned ++ ; value and a negative signed value. 
++ ; use signed max and unsigned min to remove them ++ pmaxsw m7, m5 ; max after p0 ++ pminuw m8, m5 ; min after p0 ++ pmaxsw m7, m6 ; max after p1 ++ pminuw m8, m6 ; min after p1 + + ; accumulate sum[m15] over p0/p1 ++ ; calculate difference before converting + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) +- pabsw m9, m5 +- pabsw m10, m6 +- psraw m11, m9, %2 +- psraw m12, m10, %2 +- psubw m11, %3, m11 +- psubw m12, %3, m12 +- pmaxsw m11, m13 +- pmaxsw m12, m13 +- pminsw m11, m9 +- pminsw m12, m10 +- psignw m11, m5 ; constrain(diff_p0) +- psignw m12, m6 ; constrain(diff_p1) +- pmullw m11, %4 ; constrain(diff_p0) * pri_taps +- pmullw m12, %4 ; constrain(diff_p1) * pri_taps +- paddw m15, m11 +- paddw m15, m12 ++ ++ ; convert to 8-bits with signed saturation ++ ; saturating to large diffs has no impact on the results ++ packsswb m5, m6 ++ ++ ; group into pairs so we can accumulate using maddubsw ++ pshufb m5, m12 ++ pabsb m9, m5 ++ psignb m10, %5, m5 ++ psrlw m5, m9, %2 ; emulate 8-bit shift ++ pand m5, %3 ++ psubusb m5, %4, m5 ++ ++ ; use unsigned min since abs diff can equal 0x80 ++ pminub m5, m9 ++ pmaddubsw m5, m10 ++ paddw m15, m5 + %endmacro + + %macro cdef_filter_fn 3 ; w, h, stride + INIT_YMM avx2 ++%if %1 != 4 || %2 != 8 + cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, stride3, dst4, edge ++%else ++cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ++ dst, stride, left, top, pri, sec, stride3, dst4, edge ++%endif + %define px rsp+2*16+2*%3 + pcmpeqw m14, m14 +- psrlw m14, 1 ; 0x7fff ++ psllw m14, 15 ; 0x8000 + mov edged, r8m + + ; prepare pixel buffers - body/right +@@ -151,15 +159,17 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + .no_right: + %if %1 == 4 + movd xm1, [dstq+strideq*0] +- movd xm2, [dstq+strideq*2] +- pinsrd xm1, [dstq+strideq*1], 1 +- pinsrd xm2, [dstq+stride3q], 1 ++ movd xm2, [dstq+strideq*1] ++ movd xm3, [dstq+strideq*2] ++ movd xm4, [dstq+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 ++ pmovzxbw xm3, xm3 ++ pmovzxbw xm4, xm4 + movq [px+0*%3], xm1 +- movhps [px+1*%3], xm1 +- movq [px+2*%3], xm2 +- movhps [px+3*%3], xm2 ++ movq [px+1*%3], xm2 ++ movq [px+2*%3], xm3 ++ movq [px+3*%3], xm4 + %else + pmovzxbw xm1, [dstq+strideq*0] + pmovzxbw xm2, [dstq+strideq*1] +@@ -175,11 +185,20 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + movd [px+2*%3+%1*2], xm14 + movd [px+3*%3+%1*2], xm14 + %if %2 == 8 +- ; FIXME w == 4 +- movd [px+0*%3+%1*2], xm14 +- movd [px+1*%3+%1*2], xm14 +- movd [px+2*%3+%1*2], xm14 +- movd [px+3*%3+%1*2], xm14 ++ %if %1 == 4 ++ movd xm1, [dst4q+strideq*0] ++ movd xm2, [dst4q+strideq*1] ++ movd xm3, [dst4q+strideq*2] ++ movd xm4, [dst4q+stride3q] ++ pmovzxbw xm1, xm1 ++ pmovzxbw xm2, xm2 ++ pmovzxbw xm3, xm3 ++ pmovzxbw xm4, xm4 ++ movq [px+4*%3], xm1 ++ movq [px+5*%3], xm2 ++ movq [px+6*%3], xm3 ++ movq [px+7*%3], xm4 ++ %else + pmovzxbw xm1, [dst4q+strideq*0] + pmovzxbw xm2, [dst4q+strideq*1] + pmovzxbw xm3, [dst4q+strideq*2] +@@ -188,6 +207,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + mova [px+5*%3], xm2 + mova [px+6*%3], xm3 + mova [px+7*%3], xm4 ++ %endif + movd [px+4*%3+%1*2], xm14 + movd [px+5*%3+%1*2], xm14 + movd [px+6*%3+%1*2], xm14 +@@ -258,15 +278,15 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + %if %2 == 8 + pmovzxbw xm2, [leftq+ 8] + %endif +- movd [px+0*32-4], xm1 +- pextrd [px+1*32-4], xm1, 1 +- pextrd [px+2*32-4], xm1, 2 +- pextrd [px+3*32-4], xm1, 3 ++ movd [px+0*%3-4], xm1 ++ pextrd 
[px+1*%3-4], xm1, 1 ++ pextrd [px+2*%3-4], xm1, 2 ++ pextrd [px+3*%3-4], xm1, 3 + %if %2 == 8 +- movd [px+4*32-4], xm2 +- pextrd [px+5*32-4], xm2, 1 +- pextrd [px+6*32-4], xm2, 2 +- pextrd [px+7*32-4], xm2, 3 ++ movd [px+4*%3-4], xm2 ++ pextrd [px+5*%3-4], xm2, 1 ++ pextrd [px+6*%3-4], xm2, 2 ++ pextrd [px+7*%3-4], xm2, 3 + %endif + jmp .left_done + .no_left: +@@ -344,6 +364,9 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + INIT_YMM avx2 + DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp + %undef edged ++ ; register to shuffle values into after packing ++ vbroadcasti128 m12, [shufb_lohi] ++ + movifnidn prid, prim + movifnidn secd, secm + mov dampingd, r7m +@@ -364,29 +387,37 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + mov [rsp+0], pridmpq ; pri_shift + mov [rsp+8], secdmpq ; sec_shift + ++ DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp ++ lea tableq, [tap_table] ++ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask ++ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ++ + ; pri/sec_taps[k] [4 total] +- DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3 ++ DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3 + movd xm0, prid + movd xm1, secd +- vpbroadcastw m0, xm0 ; pri_strength +- vpbroadcastw m1, xm1 ; sec_strength ++ vpbroadcastb m0, xm0 ; pri_strength ++ vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 +- lea tapq, [tap_table] +- lea priq, [tapq+priq*4] ; pri_taps +- lea secq, [tapq+8] ; sec_taps ++ lea priq, [tableq+priq*2+8] ; pri_taps ++ lea secq, [tableq+12] ; sec_taps + + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] +- DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 ++ DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3 + mov dird, r6m +- lea tapq, [tapq+dirq*2+12] ++ lea dirq, [tapq+dirq*2+14] + %if %1*%2*2/mmsize > 1 ++ %if %1 == 4 ++ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k ++ %else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k ++ %endif + mov hd, %1*%2*2/mmsize + %else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k + %endif + lea stkq, [px] +- pxor m13, m13 ++ pxor m11, m11 + %if %1*%2*2/mmsize > 1 + .v_loop: + %endif +@@ -405,20 +436,20 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + mova m7, m4 ; max + mova m8, m4 ; min + .k_loop: +- vpbroadcastw m2, [priq+kq*2] ; pri_taps +- vpbroadcastw m3, [secq+kq*2] ; sec_taps ++ vpbroadcastb m2, [priq+kq] ; pri_taps ++ vpbroadcastb m3, [secq+kq] ; sec_taps + +- ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 +- ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 +- ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 ++ ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3 ++ ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3 ++ ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3 + + dec kq + jge .k_loop + +- vpbroadcastd m12, [pw_2048] +- pcmpgtw m11, m13, m15 +- paddw m15, m11 +- pmulhrsw m15, m12 ++ vpbroadcastd m10, [pw_2048] ++ pcmpgtw m9, m11, m15 ++ paddw m15, m9 ++ pmulhrsw m15, m10 + paddw m4, m15 + pminsw m4, m7 + pmaxsw m4, m8 +@@ -435,8 +466,9 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + %endif + + %if %1*%2*2/mmsize > 1 +- lea dstq, [dstq+strideq*2] +- add stkq, %3*2 ++ %define vloop_lines (mmsize/(%1*2)) ++ lea dstq, [dstq+strideq*vloop_lines] ++ add stkq, %3*vloop_lines + dec hd + jg .v_loop + %endif +@@ -445,6 +477,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + %endmacro + + cdef_filter_fn 8, 8, 32 ++cdef_filter_fn 4, 8, 32 + cdef_filter_fn 4, 4, 
32 + + INIT_YMM avx2 +@@ -566,9 +599,8 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + ; and [upper half]: + ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 + ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx +- ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd ++ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd + +- vbroadcasti128 m14, [shufw_210xxxxx] + pslldq m4, m11, 2 + psrldq m11, 14 + pslldq m5, m12, 4 +@@ -582,7 +614,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + paddw m11, m13 ; partial_sum_alt[3/2] right + vbroadcasti128 m13, [div_table+32] + paddw m4, m5 ; partial_sum_alt[3/2] left +- pshufb m11, m14 ++ pshuflw m11, m11, q3012 + punpckhwd m6, m4, m11 + punpcklwd m4, m11 + pmaddwd m6, m6 +@@ -597,7 +629,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + ; and [upper half]: + ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 + ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx +- ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd ++ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd + + pslldq m5, m1, 2 + psrldq m1, 14 +@@ -610,7 +642,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + paddw m6, m7 + paddw m1, m3 ; partial_sum_alt[0/1] right + paddw m5, m6 ; partial_sum_alt[0/1] left +- pshufb m1, m14 ++ pshuflw m1, m1, q3012 + punpckhwd m6, m5, m1 + punpcklwd m5, m1 + pmaddwd m6, m6 +@@ -619,54 +651,38 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + pmulld m5, m13 + paddd m5, m6 ; cost1[a-d] | cost3[a-d] + +- mova xm0, [pd_04512763+ 0] +- mova xm1, [pd_04512763+ 16] ++ mova xm0, [pd_47130256+ 16] ++ mova m1, [pd_47130256] + phaddd m9, m8 + phaddd m5, m4 + phaddd m9, m5 +- vpermd m0, m9 ; cost[0/4/2/6] +- vpermd m1, m9 ; cost[1/5/3/7] +- +- ; now find the best cost, its idx^4 complement, and its idx +- pcmpgtd xm2, xm1, xm0 ; [1/5/3/7] > [0/4/2/6] +- pand xm3, xm2, xm1 +- pandn xm4, xm2, xm0 +- por xm3, xm4 ; higher 4 values +- pshufd xm1, xm1, q2301 +- pshufd xm0, xm0, q2301 +- pand xm1, xm2, xm1 +- pandn xm0, xm2, xm0 +- por xm0, xm1 ; complementary 4 values at idx^4 offset +- pand xm13, xm2, [pd_04261537+16] +- pandn xm14, xm2, [pd_04261537+ 0] +- por xm14, xm13 ; indices +- +- punpckhqdq xm4, xm3, xm0 +- punpcklqdq xm3, xm0 +- pcmpgtd xm5, xm4, xm3 ; [2or3-6or7] > [0or1/4or5] +- punpcklqdq xm5, xm5 +- pand xm6, xm5, xm4 +- pandn xm7, xm5, xm3 +- por xm6, xm7 ; { highest 2 values, complements at idx^4 } +- movhlps xm13, xm14 +- pand xm13, xm5, xm13 +- pandn xm14, xm5, xm14 +- por xm14, xm13 +- +- pshufd xm7, xm6, q3311 +- pcmpgtd xm8, xm7, xm6 ; [4or5or6or7] > [0or1or2or3] +- punpcklqdq xm8, xm8 +- pand xm9, xm8, xm7 +- pandn xm10, xm8, xm6 +- por xm9, xm10 ; max +- movhlps xm10, xm9 ; complement at idx^4 +- psubd xm9, xm10 +- psrld xm9, 10 +- movd [varq], xm9 +- pshufd xm13, xm14, q1111 +- pand xm13, xm8, xm13 +- pandn xm14, xm8, xm14 +- por xm14, xm13 +- movd eax, xm14 ++ vpermd m0, m9 ; cost[0-3] ++ vpermd m1, m9 ; cost[4-7] | cost[0-3] ++ ++ ; now find the best cost ++ pmaxsd xm2, xm0, xm1 ++ pshufd xm3, xm2, q3232 ++ pmaxsd xm2, xm3 ++ pshufd xm3, xm2, q1111 ++ pmaxsd xm2, xm3 ++ pshufd xm2, xm2, q0000 ; best cost ++ ++ ; find the idx using minpos ++ ; make everything other than the best cost negative via subtraction ++ ; find the min of unsigned 16-bit ints to sort out the negative values ++ psubd xm4, xm1, xm2 ++ psubd xm3, xm0, xm2 ++ packssdw xm3, xm4 ++ phminposuw xm3, xm3 ++ ++ ; convert idx to 32-bits ++ psrldq xm3, 2 ++ movd eax, xm3 ++ 
++ ; get idx^4 complement ++ vpermd m3, m1 ++ psubd xm2, xm3 ++ psrld xm2, 10 ++ movd [varq], xm2 + RET + %endif ; ARCH_X86_64 +diff --git third_party/dav1d/src/x86/cdef_init_tmpl.c third_party/dav1d/src/x86/cdef_init_tmpl.c +index 93c30a1bcbc1..b96c29988982 100644 +--- third_party/dav1d/src/x86/cdef_init_tmpl.c ++++ third_party/dav1d/src/x86/cdef_init_tmpl.c +@@ -29,18 +29,35 @@ + #include "src/cdef.h" + + decl_cdef_fn(dav1d_cdef_filter_8x8_avx2); ++decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3); ++ ++decl_cdef_fn(dav1d_cdef_filter_4x8_avx2); ++decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3); ++ + decl_cdef_fn(dav1d_cdef_filter_4x4_avx2); ++decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3); + + decl_cdef_dir_fn(dav1d_cdef_dir_avx2); ++decl_cdef_dir_fn(dav1d_cdef_dir_ssse3); + + void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + ++ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; ++ ++#if BITDEPTH ==8 ++ c->dir = dav1d_cdef_dir_ssse3; ++ c->fb[0] = dav1d_cdef_filter_8x8_ssse3; ++ c->fb[1] = dav1d_cdef_filter_4x8_ssse3; ++ c->fb[2] = dav1d_cdef_filter_4x4_ssse3; ++#endif ++ + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + #if BITDEPTH == 8 && ARCH_X86_64 + c->dir = dav1d_cdef_dir_avx2; + c->fb[0] = dav1d_cdef_filter_8x8_avx2; ++ c->fb[1] = dav1d_cdef_filter_4x8_avx2; + c->fb[2] = dav1d_cdef_filter_4x4_avx2; + #endif + } +diff --git third_party/dav1d/src/x86/cdef_ssse3.asm third_party/dav1d/src/x86/cdef_ssse3.asm +new file mode 100644 +index 000000000000..5247c8ebefb6 +--- /dev/null ++++ third_party/dav1d/src/x86/cdef_ssse3.asm +@@ -0,0 +1,1306 @@ ++; Copyright © 2018, VideoLAN and dav1d authors ++; Copyright © 2018, Two Orioles, LLC ++; Copyright © 2019, VideoLabs ++; All rights reserved. ++; ++; Redistribution and use in source and binary forms, with or without ++; modification, are permitted provided that the following conditions are met: ++; ++; 1. Redistributions of source code must retain the above copyright notice, this ++; list of conditions and the following disclaimer. ++; ++; 2. Redistributions in binary form must reproduce the above copyright notice, ++; this list of conditions and the following disclaimer in the documentation ++; and/or other materials provided with the distribution. ++; ++; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++%include "config.asm" ++%include "ext/x86/x86inc.asm" ++ ++SECTION_RODATA 16 ++ ++%if ARCH_X86_32 ++pb_0: times 16 db 0 ++%endif ++pw_128: times 8 dw 128 ++pw_256: times 8 dw 256 ++pw_2048: times 8 dw 2048 ++pw_0x7FFF: times 8 dw 0x7FFF ++pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7 ++div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105 ++ dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105 ++shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 ++tap_table: dw 4, 2, 3, 3, 2, 1 ++ db -1 * 16 + 1, -2 * 16 + 2 ++ db 0 * 16 + 1, -1 * 16 + 2 ++ db 0 * 16 + 1, 0 * 16 + 2 ++ db 0 * 16 + 1, 1 * 16 + 2 ++ db 1 * 16 + 1, 2 * 16 + 2 ++ db 1 * 16 + 0, 2 * 16 + 1 ++ db 1 * 16 + 0, 2 * 16 + 0 ++ db 1 * 16 + 0, 2 * 16 - 1 ++ ; the last 6 are repeats of the first 6 so we don't need to & 7 ++ db -1 * 16 + 1, -2 * 16 + 2 ++ db 0 * 16 + 1, -1 * 16 + 2 ++ db 0 * 16 + 1, 0 * 16 + 2 ++ db 0 * 16 + 1, 1 * 16 + 2 ++ db 1 * 16 + 1, 2 * 16 + 2 ++ db 1 * 16 + 0, 2 * 16 + 1 ++ ++SECTION .text ++ ++INIT_XMM ssse3 ++ ++%macro movif32 2 ++ %if ARCH_X86_32 ++ mov %1, %2 ++ %endif ++%endmacro ++ ++%macro SAVE_ARG 2 ; varname, argnum ++ %define %1_stkloc [rsp+%2*gprsize] ++ %define %1_argnum %2 ++ mov r2, r%2m ++ mov %1_stkloc, r2 ++%endmacro ++ ++%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register ++ %if %2 == 0 ++ mov r %+ %{1}_argnum, %1_stkloc ++ %else ++ mov %1q, %1_stkloc ++ %endif ++%endmacro ++ ++%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register ++ %if ARCH_X86_32 ++ %if %0 == 1 ++ LOAD_ARG %1 ++ %else ++ LOAD_ARG %1, %2 ++ %endif ++ %endif ++%endmacro ++ ++%if ARCH_X86_32 ++ %define PIC_base_offset $$ ++ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) ++%else ++ %define PIC_sym(sym) sym ++%endif ++ ++%macro SAVE_PIC_REG 1 ++ %if ARCH_X86_32 ++ mov [esp+%1], PIC_reg ++ %endif ++%endmacro ++ ++%macro LOAD_PIC_REG 1 ++ %if ARCH_X86_32 ++ mov PIC_reg, [esp+%1] ++ %endif ++%endmacro ++ ++%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride ++ %if ARCH_X86_64 ++ ; load p0/p1 ++ movsx offq, byte [dirq+kq+%1] ; off1 ++ %if %5 == 4 ++ movq m5, [stkq+offq*2+%6*0] ; p0 ++ movhps m5, [stkq+offq*2+%6*1] ++ %else ++ movu m5, [stkq+offq*2+%6*0] ; p0 ++ %endif ++ neg offq ; -off1 ++ %if %5 == 4 ++ movq m6, [stkq+offq*2+%6*0] ; p1 ++ movhps m6, [stkq+offq*2+%6*1] ++ %else ++ movu m6, [stkq+offq*2+%6*0] ; p1 ++ %endif ++ pcmpeqw m9, m14, m5 ++ pcmpeqw m10, m14, m6 ++ pandn m9, m5 ++ pandn m10, m6 ++ pmaxsw m7, m9 ; max after p0 ++ pminsw m8, m5 ; min after p0 ++ pmaxsw m7, m10 ; max after p1 ++ pminsw m8, m6 ; min after p1 ++ ++ ; accumulate sum[m13] over p0/p1 ++ psubw m5, m4 ; diff_p0(p0 - px) ++ psubw m6, m4 ; diff_p1(p1 - px) ++ pabsw m9, m5 ++ pabsw m10, m6 ++ mova m12, m9 ++ psrlw m9, %2 ++ psignw m11, %4, m5 ++ psubusw m5, %3, m9 ++ mova m9, m10 ++ pminsw m5, m12 ; constrain(diff_p0) ++ psrlw m10, %2 ++ psignw m12, %4, m6 ++ psubusw m6, %3, m10 ++ pmullw m5, m11 ; constrain(diff_p0) * taps ++ pminsw m6, m9 ; constrain(diff_p1) ++ pmullw m6, m12 ; constrain(diff_p1) * taps ++ paddw m13, m5 ++ paddw m13, m6 ++ %else ++ ; load p0 ++ movsx offq, byte [dirq+kq+%1] ; off1 ++ %if %5 == 4 ++ movq m5, [stkq+offq*2+%6*0] ; p0 ++ movhps m5, [stkq+offq*2+%6*1] ++ %else ++ movu m5, [stkq+offq*2+%6*0] ; p0 ++ %endif ++ pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)] ++ pandn m3, m5 ++ pmaxsw m7, m3 ; max after p0 ++ pminsw m8, m5 ; min after p0 ++ ++ ; accumulate sum[m7] over p0 ++ psubw m5, m4 ; diff_p0(p0 - px) ++ psignw m6, 
%4, m5 ; constrain(diff_p0) ++ pabsw m5, m5 ++ mova m3, m5 ++ psrlw m5, %2 ++ paddsw m5, %3 ++ pandn m5, [PIC_sym(pw_0x7FFF)] ++ pminsw m5, m3 ++ pmullw m5, m6 ; constrain(diff_p0) * taps ++ paddw m13, m5 ++ ++ ; load p1 ++ neg offq ; -off1 ++ %if %5 == 4 ++ movq m5, [stkq+offq*2+%6*0] ; p1 ++ movhps m5, [stkq+offq*2+%6*1] ++ %else ++ movu m5, [stkq+offq*2+%6*0] ; p1 ++ %endif ++ pcmpeqw m3, m5, [PIC_sym(pw_0x7FFF)] ++ pandn m3, m5 ++ pmaxsw m7, m3 ; max after p1 ++ pminsw m8, m5 ; min after p1 ++ ++ ; accumulate sum[m7] over p1 ++ psubw m5, m4 ; diff_p1(p1 - px) ++ psignw m6, %4, m5 ; constrain(diff_p1) ++ pabsw m5, m5 ++ mova m3, m5 ++ psrlw m5, %2 ++ paddsw m5, %3 ++ pandn m5, [PIC_sym(pw_0x7FFF)] ++ pminsw m5, m3 ++ pmullw m5, m6 ; constrain(diff_p1) * taps ++ paddw m13, m5 ++ %endif ++%endmacro ++ ++%macro PMOVZXBW 2-3 0 ; %3 = half ++ %if %3 == 1 ++ movd %1, %2 ++ %else ++ movq %1, %2 ++ %endif ++ punpcklbw %1, m15 ++%endmacro ++ ++%macro LOAD_BODY 4 ; dst, src, block_width, tmp_stride ++ %if %3 == 4 ++ PMOVZXBW m0, [%2+strideq*0] ++ PMOVZXBW m1, [%2+strideq*1] ++ PMOVZXBW m2, [%2+strideq*2] ++ PMOVZXBW m3, [%2+stride3q] ++ %else ++ movu m0, [%2+strideq*0] ++ movu m1, [%2+strideq*1] ++ movu m2, [%2+strideq*2] ++ movu m3, [%2+stride3q] ++ punpckhbw m4, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m5, m1, m15 ++ punpcklbw m1, m15 ++ punpckhbw m6, m2, m15 ++ punpcklbw m2, m15 ++ punpckhbw m7, m3, m15 ++ punpcklbw m3, m15 ++ %endif ++ mova [%1+0*%4], m0 ++ mova [%1+1*%4], m1 ++ mova [%1+2*%4], m2 ++ mova [%1+3*%4], m3 ++ %if %3 == 8 ++ mova [%1+0*%4+2*8], m4 ++ mova [%1+1*%4+2*8], m5 ++ mova [%1+2*%4+2*8], m6 ++ mova [%1+3*%4+2*8], m7 ++ %endif ++%endmacro ++ ++%macro cdef_filter_fn 3 ; w, h, stride ++ %if ARCH_X86_64 ++cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \ ++ dst, stride, left, top, pri, sec, stride3, dst4, edge ++ pcmpeqw m14, m14 ++ psrlw m14, 1 ; 0x7FFF ++ pxor m15, m15 ++ ++ %define px rsp+3*16+2*%3 ++ %else ++cglobal cdef_filter_%1x%2, 2, 7, 8, - 5 * 16 - (%2+4)*%3, \ ++ dst, stride, left, top, stride3, dst4, edge ++ SAVE_ARG left, 2 ++ SAVE_ARG top, 3 ++ SAVE_ARG pri, 4 ++ SAVE_ARG sec, 5 ++ SAVE_ARG dir, 6 ++ SAVE_ARG damping, 7 ++ ++ %define PIC_reg r2 ++ LEA PIC_reg, PIC_base_offset ++ ++ %define m15 [PIC_sym(pb_0)] ++ ++ %define px esp+5*16+2*%3 ++ %endif ++ ++ mov edged, r8m ++ ++ ; prepare pixel buffers - body/right ++ %if %2 == 8 ++ lea dst4q, [dstq+strideq*4] ++ %endif ++ lea stride3q, [strideq*3] ++ test edged, 2 ; have_right ++ jz .no_right ++ LOAD_BODY px, dstq, %1, %3 ++ %if %2 == 8 ++ LOAD_BODY px+4*%3, dst4q, %1, %3 ++ %endif ++ jmp .body_done ++.no_right: ++ PMOVZXBW m0, [dstq+strideq*0], %1 == 4 ++ PMOVZXBW m1, [dstq+strideq*1], %1 == 4 ++ PMOVZXBW m2, [dstq+strideq*2], %1 == 4 ++ PMOVZXBW m3, [dstq+stride3q ], %1 == 4 ++ %if %2 == 8 ++ PMOVZXBW m4, [dst4q+strideq*0], %1 == 4 ++ PMOVZXBW m5, [dst4q+strideq*1], %1 == 4 ++ PMOVZXBW m6, [dst4q+strideq*2], %1 == 4 ++ PMOVZXBW m7, [dst4q+stride3q ], %1 == 4 ++ %endif ++ mova [px+0*%3], m0 ++ mova [px+1*%3], m1 ++ mova [px+2*%3], m2 ++ mova [px+3*%3], m3 ++ %if %2 == 8 ++ mova [px+4*%3], m4 ++ mova [px+5*%3], m5 ++ mova [px+6*%3], m6 ++ mova [px+7*%3], m7 ++ mov dword [px+4*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+5*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+6*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+7*%3+%1*2], 0x7FFF7FFF ++ %endif ++ mov dword [px+0*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+1*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+2*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+3*%3+%1*2], 0x7FFF7FFF ++.body_done: ++ ++ ; 
top ++ %if ARCH_X86_64 ++ DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge ++ %else ++ DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge ++ %endif ++ LOAD_ARG32 top ++ test edged, 4 ; have_top ++ jz .no_top ++ mov top1q, [top2q+0*gprsize] ++ mov top2q, [top2q+1*gprsize] ++ test edged, 1 ; have_left ++ jz .top_no_left ++ test edged, 2 ; have_right ++ jz .top_no_right ++ %if %1 == 4 ++ PMOVZXBW m0, [top1q-2] ++ PMOVZXBW m1, [top2q-2] ++ %else ++ movu m0, [top1q-4] ++ movu m1, [top2q-4] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ movu [px-2*%3+8], m2 ++ movu [px-1*%3+8], m3 ++ %endif ++ movu [px-2*%3-%1], m0 ++ movu [px-1*%3-%1], m1 ++ jmp .top_done ++.top_no_right: ++ %if %1 == 4 ++ PMOVZXBW m0, [top1q-%1] ++ PMOVZXBW m1, [top2q-%1] ++ movu [px-2*%3-4*2], m0 ++ movu [px-1*%3-4*2], m1 ++ %else ++ movu m0, [top1q-%1] ++ movu m1, [top2q-%2] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ mova [px-2*%3-8*2], m0 ++ mova [px-2*%3-0*2], m2 ++ mova [px-1*%3-8*2], m1 ++ mova [px-1*%3-0*2], m3 ++ %endif ++ mov dword [px-2*%3+%1*2], 0x7FFF7FFF ++ mov dword [px-1*%3+%1*2], 0x7FFF7FFF ++ jmp .top_done ++.top_no_left: ++ test edged, 2 ; have_right ++ jz .top_no_left_right ++ %if %1 == 4 ++ PMOVZXBW m0, [top1q] ++ PMOVZXBW m1, [top2q] ++ %else ++ movu m0, [top1q] ++ movu m1, [top2q] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ movd [px-2*%3+8*2], m2 ++ movd [px-1*%3+8*2], m3 ++ %endif ++ mova [px-2*%3], m0 ++ mova [px-1*%3], m1 ++ mov dword [px-2*%3-4], 0x7FFF7FFF ++ mov dword [px-1*%3-4], 0x7FFF7FFF ++ jmp .top_done ++.top_no_left_right: ++ PMOVZXBW m0, [top1q], %1 == 4 ++ PMOVZXBW m1, [top2q], %1 == 4 ++ mova [px-2*%3], m0 ++ mova [px-1*%3], m1 ++ mov dword [px-2*%3+%1*2], 0x7FFF7FFF ++ mov dword [px-1*%3+%1*2], 0x7FFF7FFF ++ mov dword [px-2*%3-4], 0X7FFF7FFF ++ mov dword [px-1*%3-4], 0X7FFF7FFF ++ jmp .top_done ++.no_top: ++ %if ARCH_X86_64 ++ SWAP m0, m14 ++ %else ++ mova m0, [PIC_sym(pw_0x7FFF)] ++ %endif ++ movu [px-2*%3-4], m0 ++ movu [px-1*%3-4], m0 ++ %if %1 == 8 ++ movq [px-2*%3+12], m0 ++ movq [px-1*%3+12], m0 ++ %endif ++ %if ARCH_X86_64 ++ SWAP m0, m14 ++ %endif ++.top_done: ++ ++ ; left ++ test edged, 1 ; have_left ++ jz .no_left ++ SAVE_PIC_REG 0 ++ LOAD_ARG32 left ++ %if %2 == 4 ++ movq m0, [leftq] ++ %else ++ movu m0, [leftq] ++ %endif ++ LOAD_PIC_REG 0 ++ %if %2 == 4 ++ punpcklbw m0, m15 ++ %else ++ punpckhbw m1, m0, m15 ++ punpcklbw m0, m15 ++ movhlps m3, m1 ++ movd [px+4*%3-4], m1 ++ movd [px+6*%3-4], m3 ++ psrlq m1, 32 ++ psrlq m3, 32 ++ movd [px+5*%3-4], m1 ++ movd [px+7*%3-4], m3 ++ %endif ++ movhlps m2, m0 ++ movd [px+0*%3-4], m0 ++ movd [px+2*%3-4], m2 ++ psrlq m0, 32 ++ psrlq m2, 32 ++ movd [px+1*%3-4], m0 ++ movd [px+3*%3-4], m2 ++ jmp .left_done ++.no_left: ++ mov dword [px+0*%3-4], 0x7FFF7FFF ++ mov dword [px+1*%3-4], 0x7FFF7FFF ++ mov dword [px+2*%3-4], 0x7FFF7FFF ++ mov dword [px+3*%3-4], 0x7FFF7FFF ++ %if %2 == 8 ++ mov dword [px+4*%3-4], 0x7FFF7FFF ++ mov dword [px+5*%3-4], 0x7FFF7FFF ++ mov dword [px+6*%3-4], 0x7FFF7FFF ++ mov dword [px+7*%3-4], 0x7FFF7FFF ++ %endif ++.left_done: ++ ++ ; bottom ++ %if ARCH_X86_64 ++ DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge ++ %else ++ DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge ++ %endif ++ test edged, 8 ; have_bottom ++ jz .no_bottom ++ lea dst8q, [dstq+%2*strideq] ++ test edged, 1 ; have_left ++ jz .bottom_no_left ++ test 
edged, 2 ; have_right ++ jz .bottom_no_right ++ %if %1 == 4 ++ PMOVZXBW m0, [dst8q-(%1/2)] ++ PMOVZXBW m1, [dst8q+strideq-(%1/2)] ++ %else ++ movu m0, [dst8q-4] ++ movu m1, [dst8q+strideq-4] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ movu [px+(%2+0)*%3+8], m2 ++ movu [px+(%2+1)*%3+8], m3 ++ %endif ++ movu [px+(%2+0)*%3-%1], m0 ++ movu [px+(%2+1)*%3-%1], m1 ++ jmp .bottom_done ++.bottom_no_right: ++ %if %1 == 4 ++ PMOVZXBW m0, [dst8q-4] ++ PMOVZXBW m1, [dst8q+strideq-4] ++ movu [px+(%2+0)*%3-4*2], m0 ++ movu [px+(%2+1)*%3-4*2], m1 ++ %else ++ movu m0, [dst8q-8] ++ movu m1, [dst8q+strideq-8] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ mova [px+(%2+0)*%3-8*2], m0 ++ mova [px+(%2+0)*%3-0*2], m2 ++ mova [px+(%2+1)*%3-8*2], m1 ++ mova [px+(%2+1)*%3-0*2], m3 ++ mov dword [px+(%2-1)*%3+8*2], 0x7FFF7FFF ; overwritten by first mova ++ %endif ++ mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF ++ jmp .bottom_done ++.bottom_no_left: ++ test edged, 2 ; have_right ++ jz .bottom_no_left_right ++ %if %1 == 4 ++ PMOVZXBW m0, [dst8q] ++ PMOVZXBW m1, [dst8q+strideq] ++ %else ++ movu m0, [dst8q] ++ movu m1, [dst8q+strideq] ++ punpckhbw m2, m0, m15 ++ punpcklbw m0, m15 ++ punpckhbw m3, m1, m15 ++ punpcklbw m1, m15 ++ mova [px+(%2+0)*%3+8*2], m2 ++ mova [px+(%2+1)*%3+8*2], m3 ++ %endif ++ mova [px+(%2+0)*%3], m0 ++ mova [px+(%2+1)*%3], m1 ++ mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF ++ mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF ++ jmp .bottom_done ++.bottom_no_left_right: ++ PMOVZXBW m0, [dst8q+strideq*0], %1 == 4 ++ PMOVZXBW m1, [dst8q+strideq*1], %1 == 4 ++ mova [px+(%2+0)*%3], m0 ++ mova [px+(%2+1)*%3], m1 ++ mov dword [px+(%2+0)*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+(%2+1)*%3+%1*2], 0x7FFF7FFF ++ mov dword [px+(%2+0)*%3-4], 0x7FFF7FFF ++ mov dword [px+(%2+1)*%3-4], 0x7FFF7FFF ++ jmp .bottom_done ++.no_bottom: ++ %if ARCH_X86_64 ++ SWAP m0, m14 ++ %else ++ mova m0, [PIC_sym(pw_0x7FFF)] ++ %endif ++ movu [px+(%2+0)*%3-4], m0 ++ movu [px+(%2+1)*%3-4], m0 ++ %if %1 == 8 ++ movq [px+(%2+0)*%3+12], m0 ++ movq [px+(%2+1)*%3+12], m0 ++ %endif ++ %if ARCH_X86_64 ++ SWAP m0, m14 ++ %endif ++.bottom_done: ++ ++ ; actual filter ++ DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp ++ %if ARCH_X86_64 ++ movifnidn prid, prim ++ movifnidn secd, secm ++ mov dampingd, r7m ++ %else ++ LOAD_ARG pri ++ LOAD_ARG sec ++ LOAD_ARG damping, 1 ++ %endif ++ ++ SAVE_PIC_REG 8 ++ mov pridmpd, prid ++ mov secdmpd, secd ++ or pridmpd, 1 ++ or secdmpd, 1 ++ bsr pridmpd, pridmpd ++ bsr secdmpd, secdmpd ++ sub pridmpd, dampingd ++ sub secdmpd, dampingd ++ xor dampingd, dampingd ++ neg pridmpd ++ cmovl pridmpd, dampingd ++ neg secdmpd ++ cmovl secdmpd, dampingd ++ mov [rsp+ 0], pridmpq ; pri_shift ++ mov [rsp+16], secdmpq ; sec_shift ++ %if ARCH_X86_32 ++ mov dword [esp+ 4], 0 ; zero upper 32 bits of psraw ++ mov dword [esp+20], 0 ; source operand in ACCUMULATE_TAP ++ %define PIC_reg r6 ++ LOAD_PIC_REG 8 ++ %endif ++ ++ ; pri/sec_taps[k] [4 total] ++ DEFINE_ARGS dst, stride, tap, dummy, pri, sec ++ %if ARCH_X86_64 ++ mova m14, [pw_256] ++ %else ++ %define m14 [PIC_sym(pw_256)] ++ %endif ++ movd m0, prid ++ movd m1, secd ++ pshufb m0, m14 ++ pshufb m1, m14 ++ %if ARCH_X86_32 ++ mova m2, [PIC_sym(pw_0x7FFF)] ++ pandn m0, m2 ++ pandn m1, m2 ++ mova [esp+0x20], m0 ++ mova [esp+0x30], m1 ++ %endif ++ and prid, 1 ++ lea tapq, [PIC_sym(tap_table)] ++ lea priq, [tapq+priq*4] ; pri_taps ++ lea secq, [tapq+8] ; 
sec_taps ++ ++ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] ++ DEFINE_ARGS dst, stride, tap, dir, pri, sec ++ %if ARCH_X86_64 ++ mov dird, r6m ++ lea tapq, [tapq+dirq*2+12] ++ DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k ++ %else ++ LOAD_ARG dir, 1 ++ lea tapd, [tapd+dird*2+12] ++ DEFINE_ARGS dst, stride, dir, stk, pri, sec ++ %define hd dword [esp+8] ++ %define offq dstq ++ %define kq strideq ++ %endif ++ mov hd, %1*%2*2/mmsize ++ lea stkq, [px] ++ movif32 [esp+0x1C], strided ++.v_loop: ++ movif32 [esp+0x18], dstd ++ mov kq, 1 ++ %if %1 == 4 ++ movq m4, [stkq+%3*0] ++ movhps m4, [stkq+%3*1] ++ %else ++ mova m4, [stkq+%3*0] ; px ++ %endif ++ ++ %if ARCH_X86_32 ++ %xdefine m11 m6 ++ %xdefine m13 m7 ++ %xdefine m7 m0 ++ %xdefine m8 m1 ++ %endif ++ ++ pxor m13, m13 ; sum ++ mova m7, m4 ; max ++ mova m8, m4 ; min ++.k_loop: ++ %if ARCH_X86_64 ++ movd m2, [priq+kq*2] ; pri_taps ++ movd m3, [secq+kq*2] ; sec_taps ++ pshufb m2, m14 ++ pshufb m3, m14 ++ ACCUMULATE_TAP 0*2, [rsp+ 0], m0, m2, %1, %3 ++ ACCUMULATE_TAP 2*2, [rsp+16], m1, m3, %1, %3 ++ ACCUMULATE_TAP 6*2, [rsp+16], m1, m3, %1, %3 ++ %else ++ movd m2, [priq+kq*2] ; pri_taps ++ pshufb m2, m14 ++ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x20], m2, %1, %3 ++ ++ movd m2, [secq+kq*2] ; sec_taps ++ pshufb m2, m14 ++ ACCUMULATE_TAP 2*2, [esp+0x10], [esp+0x30], m2, %1, %3 ++ ACCUMULATE_TAP 6*2, [esp+0x10], [esp+0x30], m2, %1, %3 ++ %endif ++ ++ dec kq ++ jge .k_loop ++ ++ pcmpgtw m11, m15, m13 ++ paddw m13, m11 ++ pmulhrsw m13, [PIC_sym(pw_2048)] ++ paddw m4, m13 ++ pminsw m4, m7 ++ pmaxsw m4, m8 ++ packuswb m4, m4 ++ movif32 dstd, [esp+0x18] ++ movif32 strided, [esp+0x1C] ++ %if %1 == 4 ++ movd [dstq+strideq*0], m4 ++ psrlq m4, 32 ++ movd [dstq+strideq*1], m4 ++ %else ++ movq [dstq], m4 ++ %endif ++ ++ %if %1 == 4 ++ %define vloop_lines (mmsize/(%1*2)) ++ lea dstq, [dstq+strideq*vloop_lines] ++ add stkq, %3*vloop_lines ++ %else ++ lea dstq, [dstq+strideq] ++ add stkq, %3 ++ %endif ++ dec hd ++ jg .v_loop ++ ++ RET ++%endmacro ++ ++cdef_filter_fn 8, 8, 32 ++cdef_filter_fn 4, 8, 32 ++cdef_filter_fn 4, 4, 32 ++ ++%macro MULLD 2 ++ %if ARCH_X86_32 ++ %define m15 m1 ++ %endif ++ pmulhuw m15, %1, %2 ++ pmullw %1, %2 ++ pslld m15, 16 ++ paddd %1, m15 ++%endmacro ++ ++%if ARCH_X86_64 ++cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3 ++ lea stride3q, [strideq*3] ++ movq m1, [srcq+strideq*0] ++ movhps m1, [srcq+strideq*1] ++ movq m3, [srcq+strideq*2] ++ movhps m3, [srcq+stride3q] ++ lea srcq, [srcq+strideq*4] ++ movq m5, [srcq+strideq*0] ++ movhps m5, [srcq+strideq*1] ++ movq m7, [srcq+strideq*2] ++ movhps m7, [srcq+stride3q] ++ ++ pxor m8, m8 ++ psadbw m0, m1, m8 ++ psadbw m2, m3, m8 ++ psadbw m4, m5, m8 ++ psadbw m6, m7, m8 ++ packssdw m0, m2 ++ packssdw m4, m6 ++ packssdw m0, m4 ++ SWAP m0, m9 ++ ++ punpcklbw m0, m1, m8 ++ punpckhbw m1, m8 ++ punpcklbw m2, m3, m8 ++ punpckhbw m3, m8 ++ punpcklbw m4, m5, m8 ++ punpckhbw m5, m8 ++ punpcklbw m6, m7, m8 ++ punpckhbw m7, m8 ++ ++ mova m8, [pw_128] ++ psubw m0, m8 ++ psubw m1, m8 ++ psubw m2, m8 ++ psubw m3, m8 ++ psubw m4, m8 ++ psubw m5, m8 ++ psubw m6, m8 ++ psubw m7, m8 ++ psllw m8, 3 ++ psubw m9, m8 ; partial_sum_hv[0] ++ ++ paddw m8, m0, m1 ++ paddw m10, m2, m3 ++ paddw m8, m4 ++ paddw m10, m5 ++ paddw m8, m6 ++ paddw m10, m7 ++ paddw m8, m10 ; partial_sum_hv[1] ++ ++ pmaddwd m8, m8 ++ pmaddwd m9, m9 ++ phaddd m9, m8 ++ SWAP m8, m9 ++ MULLD m8, [div_table+48] ++ ++ pslldq m9, m1, 2 ++ psrldq m10, m1, 14 ++ pslldq m11, m2, 4 ++ psrldq m12, m2, 12 ++ pslldq m13, m3, 6 ++ psrldq 
m14, m3, 10 ++ paddw m9, m0 ++ paddw m10, m12 ++ paddw m11, m13 ++ paddw m10, m14 ; partial_sum_diag[0] top/right half ++ paddw m9, m11 ; partial_sum_diag[0] top/left half ++ pslldq m11, m4, 8 ++ psrldq m12, m4, 8 ++ pslldq m13, m5, 10 ++ psrldq m14, m5, 6 ++ paddw m9, m11 ++ paddw m10, m12 ++ paddw m9, m13 ++ paddw m10, m14 ++ pslldq m11, m6, 12 ++ psrldq m12, m6, 4 ++ pslldq m13, m7, 14 ++ psrldq m14, m7, 2 ++ paddw m9, m11 ++ paddw m10, m12 ++ paddw m9, m13 ; partial_sum_diag[0][0-7] ++ paddw m10, m14 ; partial_sum_diag[0][8-14,zero] ++ pshufb m10, [shufw_6543210x] ++ punpckhwd m11, m9, m10 ++ punpcklwd m9, m10 ++ pmaddwd m11, m11 ++ pmaddwd m9, m9 ++ MULLD m11, [div_table+16] ++ MULLD m9, [div_table+0] ++ paddd m9, m11 ; cost[0a-d] ++ ++ pslldq m10, m0, 14 ++ psrldq m11, m0, 2 ++ pslldq m12, m1, 12 ++ psrldq m13, m1, 4 ++ pslldq m14, m2, 10 ++ psrldq m15, m2, 6 ++ paddw m10, m12 ++ paddw m11, m13 ++ paddw m10, m14 ++ paddw m11, m15 ++ pslldq m12, m3, 8 ++ psrldq m13, m3, 8 ++ pslldq m14, m4, 6 ++ psrldq m15, m4, 10 ++ paddw m10, m12 ++ paddw m11, m13 ++ paddw m10, m14 ++ paddw m11, m15 ++ pslldq m12, m5, 4 ++ psrldq m13, m5, 12 ++ pslldq m14, m6, 2 ++ psrldq m15, m6, 14 ++ paddw m10, m12 ++ paddw m11, m13 ++ paddw m10, m14 ++ paddw m11, m15 ; partial_sum_diag[1][8-14,zero] ++ paddw m10, m7 ; partial_sum_diag[1][0-7] ++ pshufb m11, [shufw_6543210x] ++ punpckhwd m12, m10, m11 ++ punpcklwd m10, m11 ++ pmaddwd m12, m12 ++ pmaddwd m10, m10 ++ MULLD m12, [div_table+16] ++ MULLD m10, [div_table+0] ++ paddd m10, m12 ; cost[4a-d] ++ phaddd m9, m10 ; cost[0a/b,4a/b] ++ ++ paddw m10, m0, m1 ++ paddw m11, m2, m3 ++ paddw m12, m4, m5 ++ paddw m13, m6, m7 ++ phaddw m0, m4 ++ phaddw m1, m5 ++ phaddw m2, m6 ++ phaddw m3, m7 ++ ++ ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) ++ pslldq m4, m11, 2 ++ psrldq m5, m11, 14 ++ pslldq m6, m12, 4 ++ psrldq m7, m12, 12 ++ pslldq m14, m13, 6 ++ psrldq m15, m13, 10 ++ paddw m4, m10 ++ paddw m5, m7 ++ paddw m4, m6 ++ paddw m5, m15 ; partial_sum_alt[3] right ++ paddw m4, m14 ; partial_sum_alt[3] left ++ pshuflw m5, m5, q3012 ++ punpckhwd m6, m4, m5 ++ punpcklwd m4, m5 ++ pmaddwd m6, m6 ++ pmaddwd m4, m4 ++ MULLD m6, [div_table+48] ++ MULLD m4, [div_table+32] ++ paddd m4, m6 ; cost[7a-d] ++ ++ pslldq m5, m10, 6 ++ psrldq m6, m10, 10 ++ pslldq m7, m11, 4 ++ psrldq m10, m11, 12 ++ pslldq m11, m12, 2 ++ psrldq m12, 14 ++ paddw m5, m7 ++ paddw m6, m10 ++ paddw m5, m11 ++ paddw m6, m12 ++ paddw m5, m13 ++ pshuflw m6, m6, q3012 ++ punpckhwd m7, m5, m6 ++ punpcklwd m5, m6 ++ pmaddwd m7, m7 ++ pmaddwd m5, m5 ++ MULLD m7, [div_table+48] ++ MULLD m5, [div_table+32] ++ paddd m5, m7 ; cost[5a-d] ++ ++ pslldq m6, m1, 2 ++ psrldq m7, m1, 14 ++ pslldq m10, m2, 4 ++ psrldq m11, m2, 12 ++ pslldq m12, m3, 6 ++ psrldq m13, m3, 10 ++ paddw m6, m0 ++ paddw m7, m11 ++ paddw m6, m10 ++ paddw m7, m13 ; partial_sum_alt[3] right ++ paddw m6, m12 ; partial_sum_alt[3] left ++ pshuflw m7, m7, q3012 ++ punpckhwd m10, m6, m7 ++ punpcklwd m6, m7 ++ pmaddwd m10, m10 ++ pmaddwd m6, m6 ++ MULLD m10, [div_table+48] ++ MULLD m6, [div_table+32] ++ paddd m6, m10 ; cost[1a-d] ++ ++ pshufd m0, m0, q1032 ++ pshufd m1, m1, q1032 ++ pshufd m2, m2, q1032 ++ pshufd m3, m3, q1032 ++ ++ pslldq m10, m0, 6 ++ psrldq m11, m0, 10 ++ pslldq m12, m1, 4 ++ psrldq m13, m1, 12 ++ pslldq m14, m2, 2 ++ psrldq m2, 14 ++ paddw m10, m12 ++ paddw m11, m13 ++ paddw m10, m14 ++ paddw m11, m2 ++ paddw m10, m3 ++ pshuflw m11, m11, q3012 ++ punpckhwd m12, m10, m11 ++ punpcklwd m10, m11 ++ pmaddwd m12, m12 ++ 
pmaddwd m10, m10 ++ MULLD m12, [div_table+48] ++ MULLD m10, [div_table+32] ++ paddd m10, m12 ; cost[3a-d] ++ ++ phaddd m0, m9, m8 ; cost[0,4,2,6] ++ phaddd m6, m5 ++ phaddd m10, m4 ++ phaddd m1, m6, m10 ; cost[1,5,3,7] ++ ++ pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6] ++ pand m3, m2, m1 ++ pandn m4, m2, m0 ++ por m3, m4 ; higher 4 values ++ pshufd m1, m1, q2301 ++ pshufd m0, m0, q2301 ++ pand m1, m2, m1 ++ pandn m4, m2, m0 ++ por m0, m4, m1 ; 4 values at idx^4 offset ++ pand m14, m2, [pd_0to7+16] ++ pandn m15, m2, [pd_0to7] ++ por m15, m14 ++ ++ punpckhqdq m4, m3, m0 ++ punpcklqdq m3, m0 ++ pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5] ++ punpcklqdq m5, m5 ++ pand m6, m5, m4 ++ pandn m7, m5, m3 ++ por m6, m7 ; { highest 2 values, complements at idx^4 } ++ movhlps m14, m15 ++ pand m14, m5, m14 ++ pandn m13, m5, m15 ++ por m15, m13, m14 ++ ++ pshufd m7, m6, q3311 ++ pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3] ++ punpcklqdq m8, m8 ++ pand m9, m8, m7 ++ pandn m10, m8, m6 ++ por m9, m10 ; max ++ movhlps m10, m9 ; complement at idx^4 ++ psubd m9, m10 ++ psrld m9, 10 ++ movd [varq], m9 ++ pshufd m14, m15, q1111 ++ pand m14, m8, m14 ++ pandn m13, m8, m15 ++ por m15, m13, m14 ++ movd eax, m15 ++%else ++cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3 ++ %define PIC_reg r4 ++ LEA PIC_reg, PIC_base_offset ++ ++ pxor m0, m0 ++ mova m1, [PIC_sym(pw_128)] ++ ++ lea stride3q, [strideq*3] ++ movq m5, [srcq+strideq*0] ++ movhps m5, [srcq+strideq*1] ++ movq m7, [srcq+strideq*2] ++ movhps m7, [srcq+stride3q] ++ psadbw m2, m5, m0 ++ psadbw m3, m7, m0 ++ packssdw m2, m3 ++ punpcklbw m4, m5, m0 ++ punpckhbw m5, m0 ++ punpcklbw m6, m7, m0 ++ punpckhbw m7, m0 ++ psubw m4, m1 ++ psubw m5, m1 ++ psubw m6, m1 ++ psubw m7, m1 ++ ++ mova [esp+0x00], m4 ++ mova [esp+0x10], m5 ++ mova [esp+0x20], m6 ++ mova [esp+0x50], m7 ++ ++ lea srcq, [srcq+strideq*4] ++ movq m5, [srcq+strideq*0] ++ movhps m5, [srcq+strideq*1] ++ movq m7, [srcq+strideq*2] ++ movhps m7, [srcq+stride3q] ++ psadbw m3, m5, m0 ++ psadbw m0, m7, m0 ++ packssdw m3, m0 ++ pxor m0, m0 ++ packssdw m2, m3 ++ punpcklbw m4, m5, m0 ++ punpckhbw m5, m0 ++ punpcklbw m6, m7, m0 ++ punpckhbw m7, m0 ++ psubw m4, m1 ++ psubw m5, m1 ++ psubw m6, m1 ++ psubw m7, m1 ++ ++ psllw m1, 3 ++ psubw m2, m1 ; partial_sum_hv[0] ++ pmaddwd m2, m2 ++ ++ mova m3, [esp+0x50] ++ mova m0, [esp+0x00] ++ paddw m0, [esp+0x10] ++ paddw m1, m3, [esp+0x20] ++ paddw m0, m4 ++ paddw m1, m5 ++ paddw m0, m6 ++ paddw m1, m7 ++ paddw m0, m1 ; partial_sum_hv[1] ++ pmaddwd m0, m0 ++ ++ phaddd m2, m0 ++ MULLD m2, [PIC_sym(div_table)+48] ++ mova [esp+0x30], m2 ++ ++ mova m1, [esp+0x10] ++ pslldq m0, m1, 2 ++ psrldq m1, 14 ++ paddw m0, [esp+0x00] ++ pslldq m2, m3, 6 ++ psrldq m3, 10 ++ paddw m0, m2 ++ paddw m1, m3 ++ mova m3, [esp+0x20] ++ pslldq m2, m3, 4 ++ psrldq m3, 12 ++ paddw m0, m2 ; partial_sum_diag[0] top/left half ++ paddw m1, m3 ; partial_sum_diag[0] top/right half ++ pslldq m2, m4, 8 ++ psrldq m3, m4, 8 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m5, 10 ++ psrldq m3, m5, 6 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m6, 12 ++ psrldq m3, m6, 4 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m7, 14 ++ psrldq m3, m7, 2 ++ paddw m0, m2 ; partial_sum_diag[0][0-7] ++ paddw m1, m3 ; partial_sum_diag[0][8-14,zero] ++ mova m3, [esp+0x50] ++ pshufb m1, [PIC_sym(shufw_6543210x)] ++ punpckhwd m2, m0, m1 ++ punpcklwd m0, m1 ++ pmaddwd m2, m2 ++ pmaddwd m0, m0 ++ MULLD m2, [PIC_sym(div_table)+16] ++ MULLD m0, [PIC_sym(div_table)+0] ++ paddd m0, m2 ; cost[0a-d] ++ mova [esp+0x40], m0 ++ ++ 
mova m1, [esp+0x00] ++ pslldq m0, m1, 14 ++ psrldq m1, 2 ++ paddw m0, m7 ++ pslldq m2, m3, 8 ++ psrldq m3, 8 ++ paddw m0, m2 ++ paddw m1, m3 ++ mova m3, [esp+0x20] ++ pslldq m2, m3, 10 ++ psrldq m3, 6 ++ paddw m0, m2 ++ paddw m1, m3 ++ mova m3, [esp+0x10] ++ pslldq m2, m3, 12 ++ psrldq m3, 4 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m4, 6 ++ psrldq m3, m4, 10 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m5, 4 ++ psrldq m3, m5, 12 ++ paddw m0, m2 ++ paddw m1, m3 ++ pslldq m2, m6, 2 ++ psrldq m3, m6, 14 ++ paddw m0, m2 ; partial_sum_diag[1][0-7] ++ paddw m1, m3 ; partial_sum_diag[1][8-14,zero] ++ mova m3, [esp+0x50] ++ pshufb m1, [PIC_sym(shufw_6543210x)] ++ punpckhwd m2, m0, m1 ++ punpcklwd m0, m1 ++ pmaddwd m2, m2 ++ pmaddwd m0, m0 ++ MULLD m2, [PIC_sym(div_table)+16] ++ MULLD m0, [PIC_sym(div_table)+0] ++ paddd m0, m2 ; cost[4a-d] ++ phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] ++ phaddd m1, [esp+0x30] ; cost[0,4,2,6] ++ mova [esp+0x30], m1 ++ ++ phaddw m0, [esp+0x00], m4 ++ phaddw m1, [esp+0x10], m5 ++ paddw m4, m5 ++ mova m2, [esp+0x20] ++ paddw m5, m2, m3 ++ phaddw m2, m6 ++ paddw m6, m7 ++ phaddw m3, m7 ++ mova m7, [esp+0x00] ++ paddw m7, [esp+0x10] ++ mova [esp+0x00], m0 ++ mova [esp+0x10], m1 ++ mova [esp+0x20], m2 ++ ++ pslldq m1, m4, 4 ++ pslldq m2, m6, 6 ++ pslldq m0, m5, 2 ++ paddw m1, m2 ++ paddw m0, m7 ++ psrldq m2, m5, 14 ++ paddw m0, m1 ; partial_sum_alt[3] left ++ psrldq m1, m4, 12 ++ paddw m1, m2 ++ psrldq m2, m6, 10 ++ paddw m1, m2 ; partial_sum_alt[3] right ++ pshuflw m1, m1, q3012 ++ punpckhwd m2, m0, m1 ++ punpcklwd m0, m1 ++ pmaddwd m2, m2 ++ pmaddwd m0, m0 ++ MULLD m2, [PIC_sym(div_table)+48] ++ MULLD m0, [PIC_sym(div_table)+32] ++ paddd m0, m2 ; cost[7a-d] ++ mova [esp+0x40], m0 ++ ++ pslldq m0, m7, 6 ++ psrldq m7, 10 ++ pslldq m1, m5, 4 ++ psrldq m5, 12 ++ pslldq m2, m4, 2 ++ psrldq m4, 14 ++ paddw m0, m6 ++ paddw m7, m5 ++ paddw m0, m1 ++ paddw m7, m4 ++ paddw m0, m2 ++ pshuflw m7, m7, q3012 ++ punpckhwd m2, m0, m7 ++ punpcklwd m0, m7 ++ pmaddwd m2, m2 ++ pmaddwd m0, m0 ++ MULLD m2, [PIC_sym(div_table)+48] ++ MULLD m0, [PIC_sym(div_table)+32] ++ paddd m0, m2 ; cost[5a-d] ++ mova [esp+0x50], m0 ++ ++ mova m1, [esp+0x10] ++ mova m2, [esp+0x20] ++ pslldq m0, m1, 2 ++ psrldq m1, 14 ++ pslldq m4, m2, 4 ++ psrldq m2, 12 ++ pslldq m5, m3, 6 ++ psrldq m6, m3, 10 ++ paddw m0, [esp+0x00] ++ paddw m1, m2 ++ paddw m4, m5 ++ paddw m1, m6 ; partial_sum_alt[3] right ++ paddw m0, m4 ; partial_sum_alt[3] left ++ pshuflw m1, m1, q3012 ++ punpckhwd m2, m0, m1 ++ punpcklwd m0, m1 ++ pmaddwd m2, m2 ++ pmaddwd m0, m0 ++ MULLD m2, [PIC_sym(div_table)+48] ++ MULLD m0, [PIC_sym(div_table)+32] ++ paddd m0, m2 ; cost[1a-d] ++ phaddd m0, [esp+0x50] ++ mova [esp+0x50], m0 ++ ++ pshufd m0, [esp+0x00], q1032 ++ pshufd m1, [esp+0x10], q1032 ++ pshufd m2, [esp+0x20], q1032 ++ pshufd m3, m3, q1032 ++ ++ pslldq m4, m0, 6 ++ psrldq m0, 10 ++ pslldq m5, m1, 4 ++ psrldq m1, 12 ++ pslldq m6, m2, 2 ++ psrldq m2, 14 ++ paddw m4, m3 ++ paddw m0, m1 ++ paddw m5, m6 ++ paddw m0, m2 ++ paddw m4, m5 ++ pshuflw m0, m0, q3012 ++ punpckhwd m2, m4, m0 ++ punpcklwd m4, m0 ++ pmaddwd m2, m2 ++ pmaddwd m4, m4 ++ MULLD m2, [PIC_sym(div_table)+48] ++ MULLD m4, [PIC_sym(div_table)+32] ++ paddd m4, m2 ; cost[3a-d] ++ phaddd m4, [esp+0x40] ++ ++ mova m1, [esp+0x50] ++ mova m0, [esp+0x30] ; cost[0,4,2,6] ++ phaddd m1, m4 ; cost[1,5,3,7] ++ ++ pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6] ++ pand m3, m2, m1 ++ pandn m4, m2, m0 ++ por m3, m4 ; higher 4 values ++ pshufd m1, m1, q2301 ++ pshufd m0, m0, q2301 ++ pand m1, m2, 
m1 ++ pandn m4, m2, m0 ++ por m0, m4, m1 ; 4 values at idx^4 offset ++ pand m5, m2, [PIC_sym(pd_0to7)+16] ++ pandn m6, m2, [PIC_sym(pd_0to7)] ++ por m6, m5 ++ ++ punpckhqdq m4, m3, m0 ++ punpcklqdq m3, m0 ++ pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5] ++ punpcklqdq m0, m0 ++ pand m1, m0, m4 ++ pandn m7, m0, m3 ++ por m1, m7 ; { highest 2 values, complements at idx^4 } ++ movhlps m5, m6 ++ pand m5, m0, m5 ++ pandn m3, m0, m6 ++ por m6, m3, m5 ++ ++ pshufd m7, m1, q3311 ++ pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3] ++ punpcklqdq m2, m2 ++ pand m0, m2, m7 ++ pandn m7, m2, m1 ++ por m0, m7 ; max ++ movhlps m7, m0 ; complement at idx^4 ++ psubd m0, m7 ++ psrld m0, 10 ++ movd [varq], m0 ++ pshufd m5, m6, q1111 ++ pand m5, m2, m5 ++ pandn m3, m2, m6 ++ por m6, m3, m5 ++ movd eax, m6 ++%endif ++ ++ RET +diff --git third_party/dav1d/src/x86/cpu.c third_party/dav1d/src/x86/cpu.c +index 95ec22ea85aa..82c403c47baf 100644 +--- third_party/dav1d/src/x86/cpu.c ++++ third_party/dav1d/src/x86/cpu.c +@@ -57,7 +57,8 @@ unsigned dav1d_get_cpu_flags_x86(void) { + if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX; + if (n_ids >= 7) { + dav1d_cpu_cpuid(info, 7); +- if (info[1] & (1 << 5)) flags |= DAV1D_X86_CPU_FLAG_AVX2; ++ if ((info[1] & 0x00000128) == 0x00000128) ++ flags |= DAV1D_X86_CPU_FLAG_AVX2; + if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ { + if ((info[1] & 0xd0030000) == 0xd0030000) + flags |= DAV1D_X86_CPU_FLAG_AVX512; +diff --git third_party/dav1d/src/x86/cpu.h third_party/dav1d/src/x86/cpu.h +index e630d97d0f4d..2beae78bae86 100644 +--- third_party/dav1d/src/x86/cpu.h ++++ third_party/dav1d/src/x86/cpu.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_SRC_X86_CPU_H__ +-#define __DAV1D_SRC_X86_CPU_H__ ++#ifndef DAV1D_SRC_X86_CPU_H ++#define DAV1D_SRC_X86_CPU_H + + enum CpuFlags { + DAV1D_X86_CPU_FLAG_SSE = 1 << 0, +@@ -42,4 +42,4 @@ enum CpuFlags { + + unsigned dav1d_get_cpu_flags_x86(void); + +-#endif /* __DAV1D_SRC_X86_CPU_H__ */ ++#endif /* DAV1D_SRC_X86_CPU_H */ +diff --git third_party/dav1d/src/x86/ipred.asm third_party/dav1d/src/x86/ipred.asm +index f4f26730ccef..2861cf8323ce 100644 +--- third_party/dav1d/src/x86/ipred.asm ++++ third_party/dav1d/src/x86/ipred.asm +@@ -3483,21 +3483,21 @@ ALIGN function_align + sub rsp, stack_size_padded + sub hd, 2 + lea r3, [dstq+16] +- mov r5d, hd ++ lea r5d, [hq-2] + call .w16_main + add tlq, r5 + mov dstq, r3 + lea r3, [strideq-4] + lea r4, [r3+strideq*2] +- movq xm0, [tlq+19] ++ movq xm0, [tlq+21] + pinsrd xm0, [dstq-4], 2 + pinsrd xm0, [dstq+r3*1], 3 + FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 + movq xm7, [dstq+r3*2] + pinsrd xm7, [dstq+r4], 2 + palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 +- vpbroadcastd m0, [tlq+26] +- vpbroadcastd m9, [tlq+27] ++ vpbroadcastd m0, [tlq+28] ++ vpbroadcastd m9, [tlq+29] + vbroadcasti128 m8, [base+filter_shuf1+16] + vpblendd m0, m9, 0x20 + vpblendd m0, m7, 0x0f +@@ -3506,16 +3506,17 @@ ALIGN function_align + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + add r3, 2 + lea r4, [r4+strideq*2] +- movlps xm9, xm7, [tlq+27] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 ++ movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 +- vpbroadcastd m9, [tlq+35] +- vpbroadcastd m10, [tlq+34] ++ vpbroadcastd m9, [tlq+37] ++ vpbroadcastd m10, [tlq+36] + vpblendd m6, m9, 0x20 ; top + .w32_loop: + movq xm9, [dstq+r3*4] + pinsrd xm9, [dstq+r4], 2 
++.w32_loop_last: + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 ; c0 d0 +@@ -3535,6 +3536,7 @@ ALIGN function_align + lea dstq, [dstq+strideq*2] + sub r5d, 2 + jg .w32_loop ++ jz .w32_loop_last + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] +diff --git third_party/dav1d/src/x86/ipred_init_tmpl.c third_party/dav1d/src/x86/ipred_init_tmpl.c +index 3662d40069ec..fd184902a416 100644 +--- third_party/dav1d/src/x86/ipred_init_tmpl.c ++++ third_party/dav1d/src/x86/ipred_init_tmpl.c +@@ -58,6 +58,16 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3); + decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3); + decl_angular_ipred_fn(dav1d_ipred_h_ssse3); + decl_angular_ipred_fn(dav1d_ipred_v_ssse3); ++decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3); ++decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3); ++decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3); ++ ++decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3); ++decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3); ++decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3); ++decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3); ++ ++decl_pal_pred_fn(dav1d_pal_pred_ssse3); + + void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); +@@ -71,6 +81,16 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { + c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3; + c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3; + c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3; ++ c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3; ++ c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3; ++ c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3; ++ ++ c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_ssse3; ++ c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_ssse3; ++ c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3; ++ c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3; ++ ++ c->pal_pred = dav1d_pal_pred_ssse3; + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; +diff --git third_party/dav1d/src/x86/ipred_ssse3.asm third_party/dav1d/src/x86/ipred_ssse3.asm +index bfa3621218ba..3c63f95750e6 100644 +--- third_party/dav1d/src/x86/ipred_ssse3.asm ++++ third_party/dav1d/src/x86/ipred_ssse3.asm +@@ -29,8 +29,46 @@ + + SECTION_RODATA 16 + +-pb_128 : times 8 db 128 +-pd_32768 : times 1 dd 32768 ++%macro SMOOTH_WEIGHT_TABLE 1-* ++ %rep %0 ++ db %1-128, 127-%1 ++ %rotate 1 ++ %endrep ++%endmacro ++ ++; sm_weights[], but modified to precalculate x and 256-x with offsets to ++; enable efficient use of pmaddubsw (which requires signed values) ++smooth_weights: SMOOTH_WEIGHT_TABLE \ ++ 0, 0, 255, 128, 255, 149, 85, 64, \ ++ 255, 197, 146, 105, 73, 50, 37, 32, \ ++ 255, 225, 196, 170, 145, 123, 102, 84, \ ++ 68, 54, 43, 33, 26, 20, 17, 16, \ ++ 255, 240, 225, 210, 196, 182, 169, 157, \ ++ 145, 133, 122, 111, 101, 92, 83, 74, \ ++ 66, 59, 52, 45, 39, 34, 29, 25, \ ++ 21, 17, 14, 12, 10, 9, 8, 8, \ ++ 255, 248, 240, 233, 225, 218, 210, 203, \ ++ 196, 189, 182, 176, 169, 163, 156, 150, \ ++ 144, 138, 133, 127, 121, 116, 111, 106, \ ++ 101, 96, 91, 86, 82, 77, 73, 69, \ ++ 65, 61, 57, 54, 50, 47, 44, 41, \ ++ 38, 35, 32, 29, 27, 25, 22, 20, \ ++ 18, 16, 15, 13, 12, 10, 9, 8, \ ++ 7, 6, 6, 5, 5, 4, 4, 4 ++ ++ ++ipred_v_shuf : db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 ++ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ++ ++pb_3 
: times 16 db 3 ++pb_128 : times 8 db 128 ++pw_128 : times 4 dw 128 ++pw_255 : times 4 dw 255 ++pb_127_m127 : times 4 db 127, -127 ++pd_32768 : times 1 dd 32768 ++ ++ ++ + + %macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) +@@ -43,11 +81,21 @@ pd_32768 : times 1 dd 32768 + %endmacro + + %define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) ++%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) + +-JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 +-JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ ++JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 ++JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +-JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 ++JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 ++JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 ++JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 ++JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 ++JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 ++JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ ++ s4-8*4, s8-8*4, s16-8*4, s32-8*4 ++JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 ++ ++ + + SECTION .text + +@@ -470,3 +518,1089 @@ cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h + add wq, r5 + jmp r6 + ++;--------------------------------------------------------------------------------------- ++;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int a); ++;--------------------------------------------------------------------------------------- ++%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ++ ; w * a = (w - 128) * a + 128 * a ++ ; (256 - w) * b = (127 - w) * b + 129 * b ++ ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] ++ pmaddubsw m6, m%3, m%1 ++ pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b ++ paddw m6, m%5 ++ paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] ++ psrlw m6, 8 ++ psrlw m0, 8 ++ packuswb m6, m0 ++%endmacro ++ ++cglobal ipred_smooth_v, 3, 7, 7, dst, stride, tl, w, h, weights ++%define base r6-ipred_smooth_v_ssse3_table ++ LEA r6, ipred_smooth_v_ssse3_table ++ tzcnt wd, wm ++ mov hd, hm ++ movsxd wq, [r6+wq*4] ++ movddup m0, [base+pb_127_m127] ++ movddup m1, [base+pw_128] ++ lea weightsq, [base+smooth_weights+hq*4] ++ neg hq ++ movd m5, [tlq+hq] ++ pxor m2, m2 ++ pshufb m5, m2 ++ add wq, r6 ++ jmp wq ++.w4: ++ movd m2, [tlq+1] ++ punpckldq m2, m2 ++ punpcklbw m2, m5 ; top, bottom ++ lea r3, [strideq*3] ++ mova m4, [base+ipred_v_shuf] ++ mova m5, m4 ++ punpckldq m4, m4 ++ punpckhdq m5, m5 ++ pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom ++ paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok ++ paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 ++.w4_loop: ++ movu m1, [weightsq+hq*2] ++ pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop ++ pshufb m1, m5 ++ SMOOTH 0, 1, 2, 2, 3, 3 ++ movd [dstq+strideq*0], m6 ++ pshuflw m1, m6, q1032 ++ movd [dstq+strideq*1], m1 ++ punpckhqdq m6, m6 ++ movd [dstq+strideq*2], m6 ++ psrlq m6, 32 ++ movd [dstq+r3 ], m6 ++ lea dstq, [dstq+strideq*4] ++ add hq, 4 ++ jl .w4_loop ++ RET ++ALIGN function_align ++.w8: ++ movq m2, [tlq+1] ++ punpcklbw m2, m5 ++ mova m5, [base+ipred_v_shuf] ++ lea r3, [strideq*3] ++ pshufd m4, m5, q0000 ++ pshufd m5, m5, q1111 ++ pmaddubsw m3, m2, m0 ++ paddw m1, m2 ++ paddw m3, m1 ; m3 is output for loop 
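++ ; Worked example of the SMOOTH identity above (values chosen arbitrarily,
++ ; not part of the upstream change): for w = 145, a = 200, b = 50,
++ ;   w*a + (256-w)*b          = 29000 + 5550        = 34550
++ ;   (w-128)*a + (127-w)*b    = 3400  - 900         = 2500
++ ;   128*a + 129*b            = 25600 + 6450        = 32050
++ ;   2500 + 32050             = 34550, matching the direct blend.
++ ; This is why pmaddubsw on the signed (w-128, 127-w) weight pairs, added to
++ ; the precomputed m3 (which also carries the +128 rounding bias before the
++ ; psrlw 8), reproduces w*top + (256-w)*bottom for the smooth_v prediction.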
++.w8_loop: ++ movq m1, [weightsq+hq*2] ++ pshufb m0, m1, m4 ++ pshufb m1, m5 ++ SMOOTH 0, 1, 2, 2, 3, 3 ++ movq [dstq+strideq*0], m6 ++ movhps [dstq+strideq*1], m6 ++ lea dstq, [dstq+strideq*2] ++ add hq, 2 ++ jl .w8_loop ++ RET ++ALIGN function_align ++.w16: ++ movu m3, [tlq+1] ++ punpcklbw m2, m3, m5 ++ punpckhbw m3, m5 ++ pmaddubsw m4, m2, m0 ++ pmaddubsw m5, m3, m0 ++ paddw m0, m1, m2 ++ paddw m1, m3 ++ paddw m4, m0 ++ paddw m5, m1 ; m4 and m5 is output for loop ++.w16_loop: ++ movd m1, [weightsq+hq*2] ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ SMOOTH 1, 1, 2, 3, 4, 5 ++ mova [dstq], m6 ++ add dstq, strideq ++ add hq, 1 ++ jl .w16_loop ++ RET ++ALIGN function_align ++.w32: ++%if WIN64 ++ movaps [rsp+24], xmm7 ++ %define xmm_regs_used 8 ++%endif ++ mova m7, m5 ++.w32_loop_init: ++ mov r3d, 2 ++.w32_loop: ++ movddup m0, [base+pb_127_m127] ++ movddup m1, [base+pw_128] ++ movu m3, [tlq+1] ++ punpcklbw m2, m3, m7 ++ punpckhbw m3, m7 ++ pmaddubsw m4, m2, m0 ++ pmaddubsw m5, m3, m0 ++ paddw m0, m1, m2 ++ paddw m1, m3 ++ paddw m4, m0 ++ paddw m5, m1 ++ movd m1, [weightsq+hq*2] ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ SMOOTH 1, 1, 2, 3, 4, 5 ++ mova [dstq], m6 ++ add tlq, 16 ++ add dstq, 16 ++ dec r3d ++ jg .w32_loop ++ lea dstq, [dstq-32+strideq] ++ sub tlq, 32 ++ add hq, 1 ++ jl .w32_loop_init ++ RET ++ALIGN function_align ++.w64: ++%if WIN64 ++ movaps [rsp+24], xmm7 ++ %define xmm_regs_used 8 ++%endif ++ mova m7, m5 ++.w64_loop_init: ++ mov r3d, 4 ++.w64_loop: ++ movddup m0, [base+pb_127_m127] ++ movddup m1, [base+pw_128] ++ movu m3, [tlq+1] ++ punpcklbw m2, m3, m7 ++ punpckhbw m3, m7 ++ pmaddubsw m4, m2, m0 ++ pmaddubsw m5, m3, m0 ++ paddw m0, m1, m2 ++ paddw m1, m3 ++ paddw m4, m0 ++ paddw m5, m1 ++ movd m1, [weightsq+hq*2] ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ SMOOTH 1, 1, 2, 3, 4, 5 ++ mova [dstq], m6 ++ add tlq, 16 ++ add dstq, 16 ++ dec r3d ++ jg .w64_loop ++ lea dstq, [dstq-64+strideq] ++ sub tlq, 64 ++ add hq, 1 ++ jl .w64_loop_init ++ RET ++ ++;--------------------------------------------------------------------------------------- ++;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int a); ++;--------------------------------------------------------------------------------------- ++cglobal ipred_smooth_h, 3, 7, 8, dst, stride, tl, w, h ++%define base r6-ipred_smooth_h_ssse3_table ++ LEA r6, ipred_smooth_h_ssse3_table ++ mov wd, wm ++ movd m3, [tlq+wq] ++ pxor m1, m1 ++ pshufb m3, m1 ; right ++ tzcnt wd, wd ++ mov hd, hm ++ movsxd wq, [r6+wq*4] ++ movddup m4, [base+pb_127_m127] ++ movddup m5, [base+pw_128] ++ add wq, r6 ++ jmp wq ++.w4: ++ movddup m6, [base+smooth_weights+4*2] ++ mova m7, [base+ipred_h_shuf] ++ sub tlq, 4 ++ sub tlq, hq ++ lea r3, [strideq*3] ++.w4_loop: ++ movd m2, [tlq+hq] ; left ++ pshufb m2, m7 ++ punpcklbw m1, m2, m3 ; left, right ++ punpckhbw m2, m3 ++ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right ++ paddw m0, m1 ; 128 * left + 129 * right ++ pmaddubsw m1, m6 ++ paddw m1, m5 ++ paddw m0, m1 ++ pmaddubsw m1, m2, m4 ++ paddw m1, m2 ++ pmaddubsw m2, m6 ++ paddw m2, m5 ++ paddw m1, m2 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++ movd [dstq+strideq*0], m0 ++ pshuflw m1, m0, q1032 ++ movd [dstq+strideq*1], m1 ++ punpckhqdq m0, m0 ++ movd [dstq+strideq*2], m0 ++ psrlq m0, 32 ++ movd [dstq+r3 ], m0 ++ lea dstq, [dstq+strideq*4] ++ sub hd, 4 ++ jg .w4_loop ++ RET ++ALIGN function_align ++.w8: ++ mova m6, [base+smooth_weights+8*2] ++ mova m7, 
[base+ipred_h_shuf] ++ sub tlq, 4 ++ sub tlq, hq ++ punpckldq m7, m7 ++.w8_loop: ++ movd m2, [tlq+hq] ; left ++ pshufb m2, m7 ++ punpcklbw m1, m2, m3 ; left, right ++ punpckhbw m2, m3 ++ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right ++ paddw m0, m1 ; 128 * left + 129 * right ++ pmaddubsw m1, m6 ++ paddw m1, m5 ++ paddw m0, m1 ++ pmaddubsw m1, m2, m4 ++ paddw m1, m2 ++ pmaddubsw m2, m6 ++ paddw m2, m5 ++ paddw m1, m2 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++ movq [dstq+strideq*0], m0 ++ movhps [dstq+strideq*1], m0 ++ lea dstq, [dstq+strideq*2] ++ sub hd, 2 ++ jg .w8_loop ++ RET ++ALIGN function_align ++.w16: ++ mova m6, [base+smooth_weights+16*2] ++ mova m7, [base+smooth_weights+16*3] ++ sub tlq, 1 ++ sub tlq, hq ++.w16_loop: ++ pxor m1, m1 ++ movd m2, [tlq+hq] ; left ++ pshufb m2, m1 ++ punpcklbw m1, m2, m3 ; left, right ++ punpckhbw m2, m3 ++ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right ++ paddw m0, m1 ; 128 * left + 129 * right ++ pmaddubsw m1, m6 ++ paddw m1, m5 ++ paddw m0, m1 ++ pmaddubsw m1, m2, m4 ++ paddw m1, m2 ++ pmaddubsw m2, m7 ++ paddw m2, m5 ++ paddw m1, m2 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++ mova [dstq], m0 ++ lea dstq, [dstq+strideq] ++ sub hd, 1 ++ jg .w16_loop ++ RET ++ALIGN function_align ++.w32: ++ sub tlq, 1 ++ sub tlq, hq ++ pxor m6, m6 ++.w32_loop_init: ++ mov r5, 2 ++ lea r3, [base+smooth_weights+16*4] ++.w32_loop: ++ mova m7, [r3] ++ add r3, 16 ++ movd m2, [tlq+hq] ; left ++ pshufb m2, m6 ++ punpcklbw m1, m2, m3 ; left, right ++ punpckhbw m2, m3 ++ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right ++ paddw m0, m1 ; 128 * left + 129 * right ++ pmaddubsw m1, m7 ++ paddw m1, m5 ++ paddw m0, m1 ++ pmaddubsw m1, m2, m4 ++ paddw m1, m2 ++ mova m7, [r3] ++ add r3, 16 ++ pmaddubsw m2, m7 ++ paddw m2, m5 ++ paddw m1, m2 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++ mova [dstq], m0 ++ add dstq, 16 ++ dec r5 ++ jg .w32_loop ++ lea dstq, [dstq-32+strideq] ++ sub hd, 1 ++ jg .w32_loop_init ++ RET ++ALIGN function_align ++.w64: ++ sub tlq, 1 ++ sub tlq, hq ++ pxor m6, m6 ++.w64_loop_init: ++ mov r5, 4 ++ lea r3, [base+smooth_weights+16*8] ++.w64_loop: ++ mova m7, [r3] ++ add r3, 16 ++ movd m2, [tlq+hq] ; left ++ pshufb m2, m6 ++ punpcklbw m1, m2, m3 ; left, right ++ punpckhbw m2, m3 ++ pmaddubsw m0, m1, m4 ; 127 * left - 127 * right ++ paddw m0, m1 ; 128 * left + 129 * right ++ pmaddubsw m1, m7 ++ paddw m1, m5 ++ paddw m0, m1 ++ pmaddubsw m1, m2, m4 ++ paddw m1, m2 ++ mova m7, [r3] ++ add r3, 16 ++ pmaddubsw m2, m7 ++ paddw m2, m5 ++ paddw m1, m2 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++ mova [dstq], m0 ++ add dstq, 16 ++ dec r5 ++ jg .w64_loop ++ lea dstq, [dstq-64+strideq] ++ sub hd, 1 ++ jg .w64_loop_init ++ RET ++ ++;--------------------------------------------------------------------------------------- ++;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int a); ++;--------------------------------------------------------------------------------------- ++%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 ++ pmaddubsw m6, m%3, m%1 ++ mova m0, m6 ++ pmaddubsw m6, m%4, m%2 ++ mova m1, m6 ++%ifnum %5 ++ paddw m0, m%5 ++%else ++ paddw m0, %5 ++%endif ++%ifnum %6 ++ paddw m1, m%6 ++%else ++ paddw m1, %6 ++%endif ++%ifnum %7 ++%else ++ mova m3, %7 ++%endif ++ pavgw m0, m2 ++ pavgw m1, m3 ++ psrlw m0, 8 ++ psrlw m1, 8 ++ packuswb m0, m1 ++%endmacro ++ ++%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, 
[m0, m4, m5] ++ mova m1, [rsp+16*%1] ; top ++ punpckhbw m6, m1, m0 ; top, bottom ++ punpcklbw m1, m0 ; top, bottom ++ pmaddubsw m2, m1, m5 ++ mova [rsp+16*%2], m1 ++ paddw m1, m3 ; 1 * top + 255 * bottom + 255 ++ paddw m2, m1 ; 128 * top + 129 * bottom + 255 ++ mova [rsp+16*%3], m2 ++ pmaddubsw m2, m6, m5 ++ mova [rsp+16*%4], m6 ++ paddw m6, m3 ; 1 * top + 255 * bottom + 255 ++ paddw m2, m6 ; 128 * top + 129 * bottom + 255 ++ mova [rsp+16*%5], m2 ++ movd m1, [tlq+hq] ; left ++ pshufb m1, [base+pb_3] ; topleft[-(1 + y)] ++ punpcklbw m1, m4 ; left, right ++ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right ++ paddw m2, m1 ; 128 * left + 129 * right ++ mova m3, m2 ++ pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; ++ pmaddubsw m1, %7 ++ paddw m2, m3, m0 ++ paddw m3, m1 ++ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; ++ mova m7, [rsp+16*%9] ++ pshufb m1, m7 ++ mova [rsp+16*%8], m3 ++ mova m4, [rsp+16*%2] ++ mova m5, [rsp+16*%3] ++ mova m3, [rsp+16*%4] ++ mova m7, [rsp+16*%5] ++ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] ++ mova [dstq], m0 ++ movddup m3, [base+pw_255] ; recovery ++ mova m0, [rsp+16*%10] ; recovery ++ mova m4, [rsp+16*%11] ; recovery ++ mova m5, [rsp+16*%12] ; recovery ++%endmacro ++ ++cglobal ipred_smooth, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights ++%define base r6-ipred_smooth_ssse3_table ++ mov wd, wm ++ mov hd, hm ++ LEA r6, ipred_smooth_ssse3_table ++ movd m4, [tlq+wq] ; right ++ pxor m2, m2 ++ pshufb m4, m2 ++ tzcnt wd, wd ++ mov r5, tlq ++ sub r5, hq ++ movsxd wq, [r6+wq*4] ++ movddup m5, [base+pb_127_m127] ++ movd m0, [r5] ++ pshufb m0, m2 ; bottom ++ movddup m3, [base+pw_255] ++ add wq, r6 ++ lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] ++ jmp wq ++.w4: ++ mova m7, [base+ipred_v_shuf] ++ movd m1, [tlq+1] ; left ++ pshufd m1, m1, q0000 ++ sub tlq, 4 ++ lea r3, [strideq*3] ++ sub tlq, hq ++ punpcklbw m1, m0 ; top, bottom ++ pshufd m6, m7, q1100 ++ pshufd m7, m7, q3322 ++ pmaddubsw m2, m1, m5 ++ paddw m3, m1 ; 1 * top + 255 * bottom + 255 ++ paddw m2, m3 ; 128 * top + 129 * bottom + 255 ++ mova [rsp+16*0], m1 ++ mova [rsp+16*1], m2 ++ movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; ++ punpcklqdq m1, m1 ++ mova [rsp+16*2], m1 ++ mova [rsp+16*3], m4 ++ mova [rsp+16*4], m6 ++ mova [rsp+16*5], m5 ++.w4_loop: ++ movd m1, [tlq+hq] ; left ++ pshufb m1, [base+ipred_h_shuf] ++ punpcklbw m0, m1, m4 ; left, right ++ punpckhbw m1, m4 ++ pmaddubsw m2, m0, m5 ; 127 * left - 127 * right ++ pmaddubsw m3, m1, m5 ++ paddw m2, m0 ; 128 * left + 129 * right ++ paddw m3, m1 ++ mova m4, [rsp+16*2] ++ pmaddubsw m0, m4 ++ pmaddubsw m1, m4 ++ paddw m2, m0 ++ paddw m3, m1 ++ movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; ++ add v_weightsq, 8 ++ pshufb m0, m1, m6 ++ pshufb m1, m7 ++ mova m4, [rsp+16*0] ++ mova m5, [rsp+16*1] ++ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 ++ mova m4, [rsp+16*3] ++ mova m6, [rsp+16*4] ++ mova m5, [rsp+16*5] ++ movd [dstq+strideq*0], m0 ++ pshuflw m1, m0, q1032 ++ movd [dstq+strideq*1], m1 ++ punpckhqdq m0, m0 ++ movd [dstq+strideq*2], m0 ++ psrlq m0, 32 ++ movd [dstq+r3 ], m0 ++ lea dstq, [dstq+strideq*4] ++ sub hd, 4 ++ jg .w4_loop ++ RET ++ALIGN function_align ++.w8: ++ mova m7, [base+ipred_v_shuf] ++ movq m1, [tlq+1] ; left ++ punpcklqdq m1, m1 ++ sub tlq, 4 ++ sub tlq, hq ++ punpcklbw m1, m0 ++ pshufd m6, m7, q0000 ++ pshufd m7, m7, q1111 ++ pmaddubsw m2, m1, m5 ++ paddw m3, m1 ++ paddw m2, m3 ++ mova [rsp+16*0], m1 ++ mova [rsp+16*1], m2 ++ mova 
m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; ++ mova [rsp+16*2], m1 ++ mova [rsp+16*3], m4 ++ mova [rsp+16*4], m6 ++ mova [rsp+16*5], m5 ++.w8_loop: ++ movd m1, [tlq+hq] ; left ++ pshufb m1, [base+ipred_h_shuf] ++ pshufd m1, m1, q1100 ++ punpcklbw m0, m1, m4 ++ punpckhbw m1, m4 ++ pmaddubsw m2, m0, m5 ++ pmaddubsw m3, m1, m5 ++ paddw m2, m0 ++ paddw m3, m1 ++ mova m4, [rsp+16*2] ++ pmaddubsw m0, m4 ++ pmaddubsw m1, m4 ++ paddw m2, m0 ++ paddw m3, m1 ++ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; ++ add v_weightsq, 4 ++ pshufb m0, m1, m6 ++ pshufb m1, m7 ++ mova m4, [rsp+16*0] ++ mova m5, [rsp+16*1] ++ SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 ++ mova m4, [rsp+16*3] ++ mova m6, [rsp+16*4] ++ mova m5, [rsp+16*5] ++ movq [dstq+strideq*0], m0 ++ movhps [dstq+strideq*1], m0 ++ lea dstq, [dstq+strideq*2] ++ sub hd, 2 ++ jg .w8_loop ++ RET ++ALIGN function_align ++.w16: ++ mova m7, [base+ipred_v_shuf] ++ movu m1, [tlq+1] ; left ++ sub tlq, 4 ++ sub tlq, hq ++ punpckhbw m6, m1, m0 ; top, bottom ++ punpcklbw m1, m0 ; top, bottom ++ pshufd m7, m7, q0000 ++ mova [rsp+16*2], m7 ++ pmaddubsw m2, m6, m5 ++ mova [rsp+16*5], m6 ++ paddw m6, m3 ; 1 * top + 255 * bottom + 255 ++ paddw m2, m6 ; 128 * top + 129 * bottom + 255 ++ mova [rsp+16*6], m2 ++ pmaddubsw m2, m1, m5 ++ paddw m3, m1 ; 1 * top + 255 * bottom + 255 ++ mova [rsp+16*0], m1 ++ paddw m2, m3 ; 128 * top + 129 * bottom + 255 ++ mova [rsp+16*1], m2 ++ mova [rsp+16*3], m4 ++ mova [rsp+16*4], m5 ++.w16_loop: ++ movd m1, [tlq+hq] ; left ++ pshufb m1, [base+pb_3] ; topleft[-(1 + y)] ++ punpcklbw m1, m4 ; left, right ++ pmaddubsw m2, m1, m5 ; 127 * left - 127 * right ++ paddw m2, m1 ; 128 * left + 129 * right ++ mova m0, m1 ++ mova m3, m2 ++ pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; ++ pmaddubsw m1, [base+smooth_weights+16*3] ++ paddw m2, m0 ++ paddw m3, m1 ++ movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; ++ add v_weightsq, 2 ++ mova m7, [rsp+16*2] ++ pshufb m1, m7 ++ mova [rsp+16*7], m3 ++ mova m4, [rsp+16*0] ++ mova m5, [rsp+16*1] ++ mova m3, [rsp+16*5] ++ mova m7, [rsp+16*6] ++ SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] ++ mova m4, [rsp+16*3] ++ mova m5, [rsp+16*4] ++ mova [dstq], m0 ++ lea dstq, [dstq+strideq] ++ sub hd, 1 ++ jg .w16_loop ++ RET ++ALIGN function_align ++.w32: ++ movu m1, [tlq+1] ; top topleft[1 + x] ++ movu m2, [tlq+17] ; top ++ mova [rsp+16*0], m1 ++ mova [rsp+16*1], m2 ++ sub tlq, 4 ++ sub tlq, hq ++ mova m7, [base+ipred_v_shuf] ++ pshufd m7, m7, q0000 ++ mova [rsp+16*2], m7 ++ mova [rsp+16*3], m0 ++ mova [rsp+16*4], m4 ++ mova [rsp+16*5], m5 ++.w32_loop: ++ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 ++ add dstq, 16 ++ SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 ++ lea dstq, [dstq-16+strideq] ++ add v_weightsq, 2 ++ sub hd, 1 ++ jg .w32_loop ++ RET ++ALIGN function_align ++.w64: ++ movu m1, [tlq+1] ; top topleft[1 + x] ++ movu m2, [tlq+17] ; top ++ mova [rsp+16*0], m1 ++ mova [rsp+16*1], m2 ++ movu m1, [tlq+33] ; top ++ movu m2, [tlq+49] ; top ++ mova [rsp+16*11], m1 ++ mova [rsp+16*12], m2 ++ sub tlq, 4 ++ sub tlq, hq ++ mova m7, [base+ipred_v_shuf] ++ pshufd m7, m7, q0000 ++ mova [rsp+16*2], m7 ++ mova [rsp+16*3], m0 ++ mova [rsp+16*4], m4 ++ mova [rsp+16*5], m5 ++.w64_loop: ++ SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 ++ add dstq, 16 ++ 
SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 ++ add dstq, 16 ++ SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 ++ add dstq, 16 ++ SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 ++ lea dstq, [dstq-48+strideq] ++ add v_weightsq, 2 ++ sub hd, 1 ++ jg .w64_loop ++ RET ++ ++;--------------------------------------------------------------------------------------- ++;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, ++; const uint8_t *idx, const int w, const int h); ++;--------------------------------------------------------------------------------------- ++cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h ++ mova m4, [palq] ++ LEA r2, pal_pred_ssse3_table ++ tzcnt wd, wm ++ movifnidn hd, hm ++ movsxd wq, [r2+wq*4] ++ packuswb m4, m4 ++ add wq, r2 ++ lea r2, [strideq*3] ++ jmp wq ++.w4: ++ pshufb m0, m4, [idxq] ++ add idxq, 16 ++ movd [dstq ], m0 ++ pshuflw m1, m0, q1032 ++ movd [dstq+strideq ], m1 ++ punpckhqdq m0, m0 ++ movd [dstq+strideq*2], m0 ++ psrlq m0, 32 ++ movd [dstq+r2 ], m0 ++ lea dstq, [dstq+strideq*4] ++ sub hd, 4 ++ jg .w4 ++ RET ++ALIGN function_align ++.w8: ++ pshufb m0, m4, [idxq] ++ pshufb m1, m4, [idxq+16] ++ add idxq, 32 ++ movq [dstq ], m0 ++ movhps [dstq+strideq ], m0 ++ movq [dstq+strideq*2], m1 ++ movhps [dstq+r2 ], m1 ++ lea dstq, [dstq+strideq*4] ++ sub hd, 4 ++ jg .w8 ++ RET ++ALIGN function_align ++.w16: ++ pshufb m0, m4, [idxq] ++ pshufb m1, m4, [idxq+16] ++ pshufb m2, m4, [idxq+32] ++ pshufb m3, m4, [idxq+48] ++ add idxq, 64 ++ mova [dstq ], m0 ++ mova [dstq+strideq ], m1 ++ mova [dstq+strideq*2], m2 ++ mova [dstq+r2 ], m3 ++ lea dstq, [dstq+strideq*4] ++ sub hd, 4 ++ jg .w16 ++ RET ++ALIGN function_align ++.w32: ++ pshufb m0, m4, [idxq] ++ pshufb m1, m4, [idxq+16] ++ pshufb m2, m4, [idxq+32] ++ pshufb m3, m4, [idxq+48] ++ add idxq, 64 ++ mova [dstq ], m0 ++ mova [dstq+16 ], m1 ++ mova [dstq+strideq ], m2 ++ mova [dstq+strideq+16], m3 ++ lea dstq, [dstq+strideq*2] ++ sub hd, 2 ++ jg .w32 ++ RET ++ALIGN function_align ++.w64: ++ pshufb m0, m4, [idxq] ++ pshufb m1, m4, [idxq+16] ++ pshufb m2, m4, [idxq+32] ++ pshufb m3, m4, [idxq+48] ++ add idxq, 64 ++ mova [dstq ], m0 ++ mova [dstq+16], m1 ++ mova [dstq+32], m2 ++ mova [dstq+48], m3 ++ add dstq, strideq ++ sub hd, 1 ++ jg .w64 ++ RET ++ ++;--------------------------------------------------------------------------------------- ++;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int16_t *ac, const int alpha); ++;--------------------------------------------------------------------------------------- ++%macro IPRED_CFL 1 ; ac in, unpacked pixels out ++ psignw m3, m%1, m1 ++ pabsw m%1, m%1 ++ pmulhrsw m%1, m2 ++ psignw m%1, m3 ++ paddw m%1, m0 ++%endmacro ++ ++%if UNIX64 ++DECLARE_REG_TMP 7 ++%else ++DECLARE_REG_TMP 5 ++%endif ++ ++cglobal ipred_cfl, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ++ movifnidn wd, wm ++ movifnidn hd, hm ++ tzcnt r6d, hd ++ lea t0d, [wq+hq] ++ movd m4, t0d ++ tzcnt t0d, t0d ++ movd m5, t0d ++ LEA t0, ipred_cfl_ssse3_table ++ tzcnt wd, wd ++ movsxd r6, [t0+r6*4] ++ movsxd wq, [t0+wq*4+16] ++ pcmpeqd m3, m3 ++ psrlw m4, 1 ++ add r6, t0 ++ add wq, t0 ++ movifnidn acq, acmp ++ jmp r6 ++.h4: ++ movd m0, [tlq-4] ++ pmaddubsw m0, m3 ++ jmp wq ++.w4: ++ movd m1, [tlq+1] ++ pmaddubsw m1, m3 ++ psubw 
m0, m4 ++ paddw m0, m1 ++ pmaddwd m0, m3 ++ cmp hd, 4 ++ jg .w4_mul ++ psrlw m0, 3 ; dc >>= ctz(width + height); ++ jmp .w4_end ++.w4_mul: ++ punpckhqdq m1, m0, m0 ++ paddw m0, m1 ++ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 ++ paddw m0, m1 ++ psrlw m0, 2 ++ mov r6d, 0x5556 ++ mov r2d, 0x3334 ++ test hd, 8 ++ cmovz r6d, r2d ++ movd m5, r6d ++ pmulhuw m0, m5 ++.w4_end: ++ pshuflw m0, m0, q0000 ++ punpcklqdq m0, m0 ++.s4: ++ movd m1, alpham ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ lea r6, [strideq*3] ++ pabsw m2, m1 ++ psllw m2, 9 ++.s4_loop: ++ mova m4, [acq] ++ mova m5, [acq+16] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ movd [dstq+strideq*0], m4 ++ pshuflw m4, m4, q1032 ++ movd [dstq+strideq*1], m4 ++ punpckhqdq m4, m4 ++ movd [dstq+strideq*2], m4 ++ psrlq m4, 32 ++ movd [dstq+r6 ], m4 ++ lea dstq, [dstq+strideq*4] ++ add acq, 32 ++ sub hd, 4 ++ jg .s4_loop ++ RET ++ALIGN function_align ++.h8: ++ movq m0, [tlq-8] ++ pmaddubsw m0, m3 ++ jmp wq ++.w8: ++ movq m1, [tlq+1] ++ pmaddubsw m1, m3 ++ psubw m4, m0 ++ punpckhqdq m0, m0 ++ psubw m0, m4 ++ paddw m0, m1 ++ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 ++ paddw m0, m1 ++ pmaddwd m0, m3 ++ psrlw m0, m5 ++ cmp hd, 8 ++ je .w8_end ++ mov r6d, 0x5556 ++ mov r2d, 0x3334 ++ cmp hd, 32 ++ cmovz r6d, r2d ++ movd m1, r6d ++ pmulhuw m0, m1 ++.w8_end: ++ pshuflw m0, m0, q0000 ++ punpcklqdq m0, m0 ++.s8: ++ movd m1, alpham ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ lea r6, [strideq*3] ++ pabsw m2, m1 ++ psllw m2, 9 ++.s8_loop: ++ mova m4, [acq] ++ mova m5, [acq+16] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ movq [dstq ], m4 ++ movhps [dstq+strideq ], m4 ++ mova m4, [acq+32] ++ mova m5, [acq+48] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ movq [dstq+strideq*2], m4 ++ movhps [dstq+r6 ], m4 ++ lea dstq, [dstq+strideq*4] ++ add acq, 64 ++ sub hd, 4 ++ jg .s8_loop ++ RET ++ALIGN function_align ++.h16: ++ mova m0, [tlq-16] ++ pmaddubsw m0, m3 ++ jmp wq ++.w16: ++ movu m1, [tlq+1] ++ pmaddubsw m1, m3 ++ paddw m0, m1 ++ psubw m4, m0 ++ punpckhqdq m0, m0 ++ psubw m0, m4 ++ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 ++ paddw m0, m1 ++ pmaddwd m0, m3 ++ psrlw m0, m5 ++ cmp hd, 16 ++ je .w16_end ++ mov r6d, 0x5556 ++ mov r2d, 0x3334 ++ test hd, 8|32 ++ cmovz r6d, r2d ++ movd m1, r6d ++ pmulhuw m0, m1 ++.w16_end: ++ pshuflw m0, m0, q0000 ++ punpcklqdq m0, m0 ++.s16: ++ movd m1, alpham ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ pabsw m2, m1 ++ psllw m2, 9 ++.s16_loop: ++ mova m4, [acq] ++ mova m5, [acq+16] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ mova [dstq], m4 ++ mova m4, [acq+32] ++ mova m5, [acq+48] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ mova [dstq+strideq], m4 ++ lea dstq, [dstq+strideq*2] ++ add acq, 64 ++ sub hd, 2 ++ jg .s16_loop ++ RET ++ALIGN function_align ++.h32: ++ mova m0, [tlq-32] ++ pmaddubsw m0, m3 ++ mova m2, [tlq-16] ++ pmaddubsw m2, m3 ++ paddw m0, m2 ++ jmp wq ++.w32: ++ movu m1, [tlq+1] ++ pmaddubsw m1, m3 ++ movu m2, [tlq+17] ++ pmaddubsw m2, m3 ++ paddw m1, m2 ++ paddw m0, m1 ++ psubw m4, m0 ++ punpckhqdq m0, m0 ++ psubw m0, m4 ++ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 ++ paddw m0, m1 ++ pmaddwd m0, m3 ++ psrlw m0, m5 ++ cmp hd, 32 ++ je .w32_end ++ lea r2d, [hq*2] ++ mov r6d, 0x5556 ++ mov r2d, 0x3334 ++ test hd, 64|16 ++ cmovz r6d, r2d ++ movd m1, r6d ++ pmulhuw m0, m1 ++.w32_end: ++ pshuflw m0, m0, q0000 ++ punpcklqdq m0, m0 ++.s32: ++ movd m1, alpham ++ pshuflw m1, m1, q0000 ++ punpcklqdq m1, m1 ++ pabsw m2, m1 ++ psllw m2, 9 ++.s32_loop: ++ mova m4, [acq] ++ mova m5, 
[acq+16] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ mova [dstq], m4 ++ mova m4, [acq+32] ++ mova m5, [acq+48] ++ IPRED_CFL 4 ++ IPRED_CFL 5 ++ packuswb m4, m5 ++ mova [dstq+16], m4 ++ add dstq, strideq ++ add acq, 64 ++ dec hd ++ jg .s32_loop ++ RET ++ ++;--------------------------------------------------------------------------------------- ++;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int16_t *ac, const int alpha); ++;--------------------------------------------------------------------------------------- ++cglobal ipred_cfl_left, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ++ mov hd, hm ; zero upper half ++ tzcnt r6d, hd ++ sub tlq, hq ++ tzcnt wd, wm ++ movu m0, [tlq] ++ mov t0d, 0x8000 ++ movd m3, t0d ++ movd m2, r6d ++ psrld m3, m2 ++ LEA t0, ipred_cfl_left_ssse3_table ++ movsxd r6, [t0+r6*4] ++ pcmpeqd m2, m2 ++ pmaddubsw m0, m2 ++ add r6, t0 ++ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table ++ movsxd wq, [t0+wq*4] ++ add wq, t0 ++ movifnidn acq, acmp ++ jmp r6 ++.h32: ++ movu m1, [tlq+16] ; unaligned when jumping here from dc_top ++ pmaddubsw m1, m2 ++ paddw m0, m1 ++.h16: ++ pshufd m1, m0, q3232 ; psrlq m1, m0, 16 ++ paddw m0, m1 ++.h8: ++ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 ++ paddw m0, m1 ++.h4: ++ pmaddwd m0, m2 ++ pmulhrsw m0, m3 ++ pshuflw m0, m0, q0000 ++ punpcklqdq m0, m0 ++ jmp wq ++ ++;--------------------------------------------------------------------------------------- ++;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int16_t *ac, const int alpha); ++;--------------------------------------------------------------------------------------- ++cglobal ipred_cfl_top, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ++ LEA t0, ipred_cfl_left_ssse3_table ++ tzcnt wd, wm ++ inc tlq ++ movu m0, [tlq] ++ movifnidn hd, hm ++ mov r6d, 0x8000 ++ movd m3, r6d ++ movd m2, wd ++ psrld m3, m2 ++ movsxd r6, [t0+wq*4] ++ pcmpeqd m2, m2 ++ pmaddubsw m0, m2 ++ add r6, t0 ++ add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table ++ movsxd wq, [t0+wq*4] ++ add wq, t0 ++ movifnidn acq, acmp ++ jmp r6 ++ ++;--------------------------------------------------------------------------------------- ++;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ++; const int width, const int height, const int16_t *ac, const int alpha); ++;--------------------------------------------------------------------------------------- ++cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ++ tzcnt wd, wm ++ movifnidn hd, hm ++ LEA r6, ipred_cfl_splat_ssse3_table ++ movsxd wq, [r6+wq*4] ++ movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] ++ add wq, r6 ++ movifnidn acq, acmp ++ jmp wq +diff --git third_party/dav1d/src/x86/itx_init_tmpl.c third_party/dav1d/src/x86/itx_init_tmpl.c +index c4aa5bc9bcd1..3758380a5144 100644 +--- third_party/dav1d/src/x86/itx_init_tmpl.c ++++ third_party/dav1d/src/x86/itx_init_tmpl.c +@@ -77,10 +77,15 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2); + decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2); + decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2); + +-decl_itx17_fns(4, 4, ssse3); +-decl_itx16_fns(4, 8, ssse3); +-decl_itx16_fns(8, 4, ssse3); +-decl_itx16_fns(8, 8, ssse3); ++decl_itx17_fns( 4, 4, ssse3); ++decl_itx16_fns( 4, 8, ssse3); ++decl_itx16_fns( 8, 4, ssse3); ++decl_itx16_fns( 8, 8, ssse3); 
++decl_itx16_fns( 4, 16, ssse3); ++decl_itx16_fns(16, 4, ssse3); ++decl_itx16_fns( 8, 16, ssse3); ++decl_itx16_fns(16, 8, ssse3); ++decl_itx12_fns(16, 16, ssse3); + + void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { + #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ +@@ -124,10 +129,15 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + #if BITDEPTH == 8 +- assign_itx17_fn(, 4, 4, ssse3); +- assign_itx16_fn(R, 4, 8, ssse3); +- assign_itx16_fn(R, 8, 4, ssse3); +- assign_itx16_fn(, 8, 8, ssse3); ++ assign_itx17_fn(, 4, 4, ssse3); ++ assign_itx16_fn(R, 4, 8, ssse3); ++ assign_itx16_fn(R, 8, 4, ssse3); ++ assign_itx16_fn(, 8, 8, ssse3); ++ assign_itx16_fn(R, 4, 16, ssse3); ++ assign_itx16_fn(R, 16, 4, ssse3); ++ assign_itx16_fn(R, 8, 16, ssse3); ++ assign_itx16_fn(R, 16, 8, ssse3); ++ assign_itx12_fn(, 16, 16, ssse3); + #endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; +diff --git third_party/dav1d/src/x86/itx_ssse3.asm third_party/dav1d/src/x86/itx_ssse3.asm +index 8e69a3b1928d..f3914d21b9ef 100644 +--- third_party/dav1d/src/x86/itx_ssse3.asm ++++ third_party/dav1d/src/x86/itx_ssse3.asm +@@ -54,15 +54,28 @@ COEF_PAIR 1931, 3612 + COEF_PAIR 3166, 2598 + COEF_PAIR 3920, 1189 + COEF_PAIR 3784, 1567 ++COEF_PAIR 995, 3973 ++COEF_PAIR 1751, 3703 ++COEF_PAIR 3513, 2106 ++COEF_PAIR 3857, 1380 ++COEF_PAIR 4017, 799 ++COEF_PAIR 201, 4091 ++COEF_PAIR 2440, 3290 ++COEF_PAIR 3035, 2751 ++COEF_PAIR 4052, 601 ++COEF_PAIR 2276, 3406 + + pd_2048: times 4 dd 2048 + pw_2048: times 8 dw 2048 ++pw_m2048: times 8 dw -2048 + pw_4096: times 8 dw 4096 + pw_16384: times 8 dw 16384 + pw_m16384: times 8 dw -16384 + pw_2896x8: times 8 dw 2896*8 + pw_3344x8: times 8 dw 3344*8 + pw_5793x4: times 8 dw 5793*4 ++pw_8192: times 8 dw 8192 ++pw_m8192: times 8 dw -8192 + + iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424 + iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568 +@@ -112,18 +125,18 @@ SECTION .text + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word + +- paddw m%1, m%3 ;high: dst1 + out1 ;low: dst0 + out0 +- paddw m%2, m%4 ;high: dst3 + out3 ;low: dst2 + out2 ++ paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 ++ paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 + +- packuswb m%1, m%2 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 ++ packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 + +- movd [%%row_adr1], m%1 ;store dst0 + out0 +- pshuflw m%2, m%1, q1032 +- movd [%%row_adr2], m%2 ;store dst1 + out1 +- punpckhqdq m%1, m%1 +- movd [%%row_adr3], m%1 ;store dst2 + out2 +- psrlq m%1, 32 +- movd [%%row_adr4], m%1 ;store dst3 + out3 ++ movd [%%row_adr1], m%3 ;store dst0 + out0 ++ pshuflw m%4, m%3, q1032 ++ movd [%%row_adr2], m%4 ;store dst1 + out1 ++ punpckhqdq m%3, m%3 ++ movd [%%row_adr3], m%3 ;store dst2 + out2 ++ psrlq m%3, 32 ++ movd [%%row_adr4], m%3 ;store dst3 + out3 + %endmacro + + %macro ITX4_END 4-5 2048 ; row[1-4], rnd +@@ -709,9 +722,9 @@ cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + ++.pass1: + call m(idct_8x4_internal).main +- call m(iadst_4x8_internal).inversion +- jmp tx2q ++ jmp m(iadst_4x8_internal).pass1_end + + .pass2: + call .main +@@ -738,8 +751,11 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + ++.pass1: + call 
m(iadst_8x4_internal).main +- call .inversion ++ ++.pass1_end: ++ INV_4X8 + jmp tx2q + + .pass2: +@@ -775,11 +791,6 @@ ALIGN function_align + IADST8_1D_PACKED + ret + +-ALIGN function_align +-.inversion: +- INV_4X8 +- ret +- + INV_TXFM_4X8_FN flipadst, dct, 0 + INV_TXFM_4X8_FN flipadst, adst + INV_TXFM_4X8_FN flipadst, flipadst +@@ -792,6 +803,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + ++.pass1: + call m(iadst_8x4_internal).main + + punpcklwd m4, m3, m2 +@@ -832,6 +844,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + ++.pass1: + mova m5, [o(pw_5793x4)] + paddw m0, m0 + paddw m1, m1 +@@ -842,8 +855,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + +- call m(iadst_4x8_internal).inversion +- jmp tx2q ++ jmp m(iadst_4x8_internal).pass1_end + + .pass2: + mova m4, [o(pw_4096)] +@@ -1137,7 +1149,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + jmp m(iadst_8x4_internal).end + + %macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh +- INV_TXFM_FN %1, %2, %3, 8x8, 8 ++ INV_TXFM_FN %1, %2, %3, 8x8, 8, 16*4 + %ifidn %1_%2, dct_identity + mova m0, [o(pw_2896x8)] + pmulhrsw m0, [coeffq] +@@ -1171,14 +1183,15 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + .end: +- mov r2d, 2 +-.end2: +- lea r3, [strideq*3] ++ mov r3d, 2 ++ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)] + .loop: + WRITE_8X4 0, 0, 0, 0, 1, 2, 3 + lea dstq, [dstq+strideq*2] +- dec r2d ++ dec r3d + jg .loop ++ jmp tx2q ++.end3: + RET + %else ; identity + mova m0, [coeffq+16*0] +@@ -1201,14 +1214,27 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 + %endif + %endmacro + +-%macro ITX_8X8_LOAD_COEFS 0 +- mova m0, [coeffq+16*0] +- mova m1, [coeffq+16*1] +- mova m2, [coeffq+16*2] +- mova m3, [coeffq+16*3] +- mova m4, [coeffq+16*4] +- mova m5, [coeffq+16*5] +- mova m6, [coeffq+16*6] ++%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 ++%if %3 ++ mova m7, [o(pw_2896x8)] ++ pmulhrsw m0, m7, [%1+%2*0] ++ pmulhrsw m1, m7, [%1+%2*1] ++ pmulhrsw m2, m7, [%1+%2*2] ++ pmulhrsw m3, m7, [%1+%2*3] ++ pmulhrsw m4, m7, [%1+%2*4] ++ pmulhrsw m5, m7, [%1+%2*5] ++ pmulhrsw m6, m7, [%1+%2*6] ++ pmulhrsw m7, [%1+%2*7] ++%else ++ mova m0, [%1+%2*0] ++ mova m1, [%1+%2*1] ++ mova m2, [%1+%2*2] ++ mova m3, [%1+%2*3] ++ mova m4, [%1+%2*4] ++ mova m5, [%1+%2*5] ++ mova m6, [%1+%2*6] ++ mova m7, [%1+%2*7] ++%endif + %endmacro + + %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 +@@ -1231,98 +1257,109 @@ INV_TXFM_8X8_FN dct, adst + INV_TXFM_8X8_FN dct, flipadst + + cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +- ITX_8X8_LOAD_COEFS ++ LOAD_8ROWS coeffq, 16 ++ ++.pass1: + call .main + + .pass1_end: +- mova m7, [o(pw_16384)] +- REPX {pmulhrsw x, m7}, m0, m2, m4, m6 +- mova [coeffq+16*6], m6 ++ mova m7, [o(pw_16384)] ++ ++.pass1_end1: ++ REPX {pmulhrsw x, m7}, m0, m2, m4, m6 ++ mova [rsp+gprsize+16*1], m6 + + .pass1_end2: +- REPX {pmulhrsw x, m7}, m1, m3, m5 +- pmulhrsw m7, [coeffq+16*7] ++ REPX {pmulhrsw x, m7}, m1, m3, m5 ++ pmulhrsw m7, [rsp+gprsize+16*0] + + .pass1_end3: +- punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 +- punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 +- punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 +- punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 +- punpckhwd m4, m3, m7 ;34 74 
35 75 36 76 37 77 +- punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 +- punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 +- punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 +- punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 +- punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 +- mova [coeffq+16*5], m6 +- mova m6, [coeffq+16*6] +- punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 +- punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 +- punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 +- punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 +- punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 +- punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 +- +- punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 +- punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 +- mova [coeffq+16*7], m2 +- punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 +- punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 +- punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 +- punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 +- mova m7, [coeffq+16*5] +- punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 +- punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 +- jmp tx2q ++ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 ++ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 ++ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 ++ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 ++ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 ++ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 ++ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 ++ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 ++ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 ++ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 ++ mova [rsp+gprsize+16*2], m6 ++ mova m6, [rsp+gprsize+16*1] ++ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 ++ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 ++ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 ++ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 ++ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 ++ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 ++ ++ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 ++ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 ++ mova [rsp+gprsize+16*0], m2 ++ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 ++ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 ++ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 ++ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 ++ mova m7, [rsp+gprsize+16*2] ++ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 ++ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 ++ mova m7, [rsp+gprsize+16*0] ++ jmp tx2q + + .pass2: ++ lea tx2q, [o(m(idct_8x8_internal).end4)] ++ ++.pass2_main: + call .main + + .end: +- mova m7, [o(pw_2048)] +- REPX {pmulhrsw x, m7}, m0, m2, m4, m6 +- mova [coeffq+16*6], m6 ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m2, m4, m6 ++ mova [rsp+gprsize+16*1], m6 + + .end2: +- REPX {pmulhrsw x, m7}, m1, m3, m5 +- pmulhrsw m7, [coeffq+16*7] +- mova [coeffq+16*5], m5 +- mova [coeffq+16*7], m7 ++ REPX {pmulhrsw x, m7}, m1, m3, m5 ++ pmulhrsw m7, [rsp+gprsize+16*0] ++ mova [rsp+gprsize+16*2], m5 ++ mova [rsp+gprsize+16*0], m7 + + .end3: +- WRITE_8X4 0, 1, 2, 3, 5, 6, 7 +- lea dstq, [dstq+strideq*2] +- WRITE_8X4 4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7 ++ WRITE_8X4 0, 1, 2, 3, 5, 6, 7 ++ lea dstq, [dstq+strideq*2] ++ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 ++ jmp tx2q + +- pxor m7, m7 +- REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ++.end4: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + + ALIGN function_align + .main: +- mova [coeffq+16*6], m3 +- mova [coeffq+16*5], m1 +- mova m7, [o(pd_2048)] +- IDCT4_1D 0, 2, 4, 6, 1, 3, 7 +- mova m3, [coeffq+16*5] +- mova [coeffq+16*5], m2 +- mova m2, 
[coeffq+16*6] +- mova [coeffq+16*6], m4 +- mova m4, [coeffq+16*7] +- mova [coeffq+16*7], m6 +- IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 +- mova m6, [coeffq+16*7] +- psubsw m7, m0, m4 ;out7 +- paddsw m0, m4 ;out0 +- mova [coeffq+16*7], m7 +- mova m1, [coeffq+16*5] +- psubsw m4, m6, m3 ;out4 +- paddsw m3, m6 ;out3 +- mova m7, [coeffq+16*6] +- psubsw m6, m1, m5 ;out6 +- paddsw m1, m5 ;out1 +- psubsw m5, m7, m2 ;out5 +- paddsw m2, m7 ;out2 ++ mova [rsp+gprsize*2+16*0], m7 ++ mova [rsp+gprsize*2+16*1], m3 ++ mova [rsp+gprsize*2+16*2], m1 ++ mova m7, [o(pd_2048)] ++ IDCT4_1D 0, 2, 4, 6, 1, 3, 7 ++ mova m3, [rsp+gprsize*2+16*2] ++ mova [rsp+gprsize*2+16*2], m2 ++ mova m2, [rsp+gprsize*2+16*1] ++ mova [rsp+gprsize*2+16*1], m4 ++ mova m4, [rsp+gprsize*2+16*0] ++ mova [rsp+gprsize*2+16*0], m6 ++ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 ++ mova m6, [rsp+gprsize*2+16*0] ++ psubsw m7, m0, m4 ;out7 ++ paddsw m0, m4 ;out0 ++ mova [rsp+gprsize*2+16*0], m7 ++ mova m1, [rsp+gprsize*2+16*2] ++ psubsw m4, m6, m3 ;out4 ++ paddsw m3, m6 ;out3 ++ mova m7, [rsp+gprsize*2+16*1] ++ psubsw m6, m1, m5 ;out6 ++ paddsw m1, m5 ;out1 ++ psubsw m5, m7, m2 ;out5 ++ paddsw m2, m7 ;out2 + ret + + +@@ -1332,76 +1369,88 @@ INV_TXFM_8X8_FN adst, flipadst + INV_TXFM_8X8_FN adst, identity + + cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +- ITX_8X8_LOAD_COEFS ++ LOAD_8ROWS coeffq, 16 ++ ++.pass1: + call .main +- mova m7, [o(pw_16384)] +- REPX {pmulhrsw x, m7}, m0, m2, m4, m6 +- mova [coeffq+16*6], m6 +- pxor m6, m6 +- psubw m6, m7 +- mova m7, m6 ++ ++.pass1_end: ++ mova m7, [o(pw_16384)] ++ ++.pass1_end1: ++ REPX {pmulhrsw x, m7}, m0, m2, m4, m6 ++ mova [rsp+gprsize+16*1], m6 ++ pxor m6, m6 ++ psubw m6, m7 ++ mova m7, m6 + jmp m(idct_8x8_internal).pass1_end2 + + ALIGN function_align + .pass2: ++ lea tx2q, [o(m(idct_8x8_internal).end4)] ++ ++.pass2_main: + call .main +- mova m7, [o(pw_2048)] +- REPX {pmulhrsw x, m7}, m0, m2, m4, m6 +- mova [coeffq+16*6], m6 +- pxor m6, m6 +- psubw m6, m7 +- mova m7, m6 ++ ++.end: ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m2, m4, m6 ++ mova [rsp+gprsize+16*1], m6 ++ pxor m6, m6 ++ psubw m6, m7 ++ mova m7, m6 + jmp m(idct_8x8_internal).end2 + + ALIGN function_align + .main: +- mova [coeffq+16*6], m3 +- mova [coeffq+16*5], m4 +- mova m7, [o(pd_2048)] +- ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a +- ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a +- paddsw m3, m2, m6 ;t2 +- psubsw m2, m6 ;t6 +- paddsw m4, m5, m1 ;t3 +- psubsw m5, m1 ;t7 +- ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a +- +- mova m6, [coeffq+16*5] +- mova [coeffq+16*5], m5 +- mova m1, [coeffq+16*6] +- mova [coeffq+16*6], m2 +- mova m5, [coeffq+16*7] +- mova [coeffq+16*7], m3 +- ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a +- ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a +- psubsw m2, m0, m6 ;t4 +- paddsw m0, m6 ;t0 +- paddsw m3, m5, m1 ;t1 +- psubsw m5, m1 ;t5 +- ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a +- +- mova m7, [coeffq+16*7] +- paddsw m1, m3, m4 ;-out7 +- psubsw m3, m4 ;t3 +- mova [coeffq+16*7], m1 +- psubsw m4, m0, m7 ;t2 +- paddsw m0, m7 ;out0 +- mova m6, [coeffq+16*5] +- mova m7, [coeffq+16*6] +- paddsw m1, m5, m6 ;-out1 +- psubsw m5, m6 ;t6 +- paddsw m6, m2, m7 ;out6 +- psubsw m2, m7 ;t7 +- paddw m7, m4, m3 ;t2 + t3 +- psubw m4, m3 ;t2 - t3 +- paddw m3, m5, m2 ;t6 + t7 +- psubw m5, m2 ;t6 - t7 +- mova m2, [o(pw_2896x8)] +- pmulhrsw m4, m2 ;out4 +- pmulhrsw m5, m2 ;-out5 +- pmulhrsw m7, m2 ;-out3 +- pmulhrsw m2, m3 ;out2 +- mova m3, m7 ++ mova [rsp+gprsize*2+16*0], m7 ++ 
mova [rsp+gprsize*2+16*1], m3 ++ mova [rsp+gprsize*2+16*2], m4 ++ mova m7, [o(pd_2048)] ++ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a ++ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a ++ paddsw m3, m2, m6 ;t2 ++ psubsw m2, m6 ;t6 ++ paddsw m4, m5, m1 ;t3 ++ psubsw m5, m1 ;t7 ++ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a ++ ++ mova m6, [rsp+gprsize*2+16*2] ++ mova [rsp+gprsize*2+16*2], m5 ++ mova m1, [rsp+gprsize*2+16*1] ++ mova [rsp+gprsize*2+16*1], m2 ++ mova m5, [rsp+gprsize*2+16*0] ++ mova [rsp+gprsize*2+16*0], m3 ++ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a ++ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a ++ psubsw m2, m0, m6 ;t4 ++ paddsw m0, m6 ;t0 ++ paddsw m3, m5, m1 ;t1 ++ psubsw m5, m1 ;t5 ++ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a ++ ++ mova m7, [rsp+gprsize*2+16*0] ++ paddsw m1, m3, m4 ;-out7 ++ psubsw m3, m4 ;t3 ++ mova [rsp+gprsize*2+16*0], m1 ++ psubsw m4, m0, m7 ;t2 ++ paddsw m0, m7 ;out0 ++ mova m6, [rsp+gprsize*2+16*2] ++ mova m7, [rsp+gprsize*2+16*1] ++ paddsw m1, m5, m6 ;-out1 ++ psubsw m5, m6 ;t6 ++ paddsw m6, m2, m7 ;out6 ++ psubsw m2, m7 ;t7 ++ paddw m7, m4, m3 ;t2 + t3 ++ psubw m4, m3 ;t2 - t3 ++ paddw m3, m5, m2 ;t6 + t7 ++ psubw m5, m2 ;t6 - t7 ++ mova m2, [o(pw_2896x8)] ++ pmulhrsw m4, m2 ;out4 ++ pmulhrsw m5, m2 ;-out5 ++ pmulhrsw m7, m2 ;-out3 ++ pmulhrsw m2, m3 ;out2 ++ mova m3, m7 + ret + + INV_TXFM_8X8_FN flipadst, dct +@@ -1410,46 +1459,57 @@ INV_TXFM_8X8_FN flipadst, flipadst + INV_TXFM_8X8_FN flipadst, identity + + cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +- ITX_8X8_LOAD_COEFS ++ LOAD_8ROWS coeffq, 16 ++ ++.pass1: + call m(iadst_8x8_internal).main +- mova m7, [o(pw_m16384)] +- pmulhrsw m1, m7 +- mova [coeffq+16*6], m1 +- mova m1, m6 +- mova m6, m2 +- pmulhrsw m2, m5, m7 +- mova m5, m6 +- mova m6, m4 +- pmulhrsw m4, m3, m7 +- mova m3, m6 +- mova m6, m0 +- mova m0, m7 +- pxor m7, m7 +- psubw m7, m0 +- pmulhrsw m0, [coeffq+16*7] +- REPX {pmulhrsw x, m7}, m1, m3, m5 +- pmulhrsw m7, m6 ++ ++.pass1_end: ++ mova m7, [o(pw_m16384)] ++ ++.pass1_end1: ++ pmulhrsw m1, m7 ++ mova [rsp+gprsize+16*1], m1 ++ mova m1, m6 ++ mova m6, m2 ++ pmulhrsw m2, m5, m7 ++ mova m5, m6 ++ mova m6, m4 ++ pmulhrsw m4, m3, m7 ++ mova m3, m6 ++ mova m6, m0 ++ mova m0, m7 ++ pxor m7, m7 ++ psubw m7, m0 ++ pmulhrsw m0, [rsp+gprsize+16*0] ++ REPX {pmulhrsw x, m7}, m1, m3, m5 ++ pmulhrsw m7, m6 + jmp m(idct_8x8_internal).pass1_end3 + + ALIGN function_align + .pass2: ++ lea tx2q, [o(m(idct_8x8_internal).end4)] ++ ++.pass2_main: + call m(iadst_8x8_internal).main +- mova m7, [o(pw_2048)] +- REPX {pmulhrsw x, m7}, m0, m2, m4, m6 +- mova [coeffq+16*5], m2 +- mova m2, m0 +- pxor m0, m0 +- psubw m0, m7 +- mova m7, m2 +- pmulhrsw m1, m0 +- pmulhrsw m2, m5, m0 +- mova [coeffq+16*6], m1 +- mova m5, m4 +- mova m1, m6 +- pmulhrsw m4, m3, m0 +- pmulhrsw m0, [coeffq+16*7] +- mova m3, m5 +- mova [coeffq+16*7], m7 ++ ++.end: ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m2, m4, m6 ++ mova [rsp+gprsize+16*2], m2 ++ mova m2, m0 ++ pxor m0, m0 ++ psubw m0, m7 ++ mova m7, m2 ++ pmulhrsw m1, m0 ++ pmulhrsw m2, m5, m0 ++ mova [rsp+gprsize+16*1], m1 ++ mova m5, m4 ++ mova m1, m6 ++ pmulhrsw m4, m3, m0 ++ pmulhrsw m0, [rsp+gprsize+16*0] ++ mova m3, m5 ++ mova [rsp+gprsize+16*0], m7 + jmp m(idct_8x8_internal).end3 + + INV_TXFM_8X8_FN identity, dct, 7 +@@ -1458,21 +1518,2146 @@ INV_TXFM_8X8_FN identity, flipadst + INV_TXFM_8X8_FN identity, identity + + cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 +- mova m0, 
[coeffq+16*0] +- mova m1, [coeffq+16*1] +- mova m2, [coeffq+16*2] +- mova m3, [coeffq+16*3] +- mova m4, [coeffq+16*4] +- mova m5, [coeffq+16*5] +- mova m7, [coeffq+16*7] +- jmp m(idct_8x8_internal).pass1_end3 ++ LOAD_8ROWS coeffq, 16 ++ mova [rsp+gprsize+16*1], m6 ++ jmp m(idct_8x8_internal).pass1_end3 + + ALIGN function_align + .pass2: +- mova m7, [o(pw_4096)] ++ lea tx2q, [o(m(idct_8x8_internal).end4)] ++ ++.end: ++ pmulhrsw m7, [o(pw_4096)] ++ mova [rsp+gprsize+16*0], m7 ++ mova m7, [o(pw_4096)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ mova [rsp+gprsize+16*2], m5 ++ mova [rsp+gprsize+16*1], m6 ++ jmp m(idct_8x8_internal).end3 ++ ++ ++%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh ++ INV_TXFM_FN %1, %2, %3, 4x16, 8 ++%if %3 >= 0 ++%ifidn %1_%2, dct_identity ++ mova m0, [o(pw_2896x8)] ++ mova m1, m0 ++ pmulhrsw m0, [coeffq+16*0] ++ pmulhrsw m1, [coeffq+16*1] ++ mova m2, [o(pw_16384)] ++ mova m3, [o(pw_5793x4)] ++ mova m4, [o(pw_2048)] ++ pmulhrsw m0, m2 ++ pmulhrsw m1, m2 ++ psllw m0, 2 ++ psllw m1, 2 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 ++ pmulhrsw m0, m4 ++ pmulhrsw m4, m1 ++ punpckhwd m2, m0, m0 ++ punpcklwd m0, m0 ++ punpckhwd m6, m4, m4 ++ punpcklwd m4, m4 ++ punpckhdq m1, m0, m0 ++ punpckldq m0, m0 ++ punpckhdq m3, m2, m2 ++ punpckldq m2, m2 ++ punpckhdq m5, m4, m4 ++ punpckldq m4, m4 ++ punpckhdq m7, m6, m6 ++ punpckldq m6, m6 ++ mova [coeffq+16*4], m4 ++ TAIL_CALL m(iadst_4x16_internal).end2 ++%elifidn %1_%2, identity_dct ++ movd m0, [coeffq+32*0] ++ punpcklwd m0, [coeffq+32*1] ++ movd m1, [coeffq+32*2] ++ punpcklwd m1, [coeffq+32*3] ++ mova m2, [o(pw_5793x4)] ++ mova m3, [o(pw_16384)] ++ mova m4, [o(pw_2896x8)] ++ punpckldq m0, m1 ++ paddw m0, m0 ++ pmulhrsw m0, m2 ++ pmulhrsw m0, m3 ++ psrlw m3, 3 ; pw_2048 ++ pmulhrsw m0, m4 ++ pmulhrsw m0, m3 ++ punpcklqdq m0, m0 ++ pxor m7, m7 ++ REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3 ++%elifidn %1_%2, dct_dct ++ pshuflw m0, [coeffq], q0000 ++ punpcklwd m0, m0 ++ mova m1, [o(pw_2896x8)] ++ pmulhrsw m0, m1 ++ mov [coeffq], eobd ++ pmulhrsw m0, [o(pw_16384)] ++ pmulhrsw m0, m1 ++ pmulhrsw m0, [o(pw_2048)] ++%else ; adst_dct / flipadst_dct ++ pshuflw m0, [coeffq], q0000 ++ punpcklwd m0, m0 ++%ifidn %1, adst ++ pmulhrsw m0, [o(iadst4_dconly1a)] ++%else ; flipadst ++ pmulhrsw m0, [o(iadst4_dconly1b)] ++%endif ++ mova m1, [o(pw_16384)] ++ mov [coeffq], eobd ++ pmulhrsw m0, m1 ++ psrlw m1, 3 ; pw_2048 ++ pmulhrsw m0, [o(pw_2896x8)] ++ pmulhrsw m0, m1 ++%endif ++.end: ++ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 ++ lea dstq, [dstq+strideq*4] ++ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 ++ lea dstq, [dstq+strideq*4] ++ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 ++ lea dstq, [dstq+strideq*4] ++ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 ++ RET ++%endif ++%endmacro ++ ++INV_TXFM_4X16_FN dct, dct, 0 ++INV_TXFM_4X16_FN dct, identity, 15 ++INV_TXFM_4X16_FN dct, adst ++INV_TXFM_4X16_FN dct, flipadst ++ ++cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(idct_4x8_internal).pass1)] ++ ++.pass1: ++ mova m0, [coeffq+16*1] ++ mova m1, [coeffq+16*3] ++ mova m2, [coeffq+16*5] ++ mova m3, [coeffq+16*7] ++ push tx2q ++ lea tx2q, [o(m(idct_4x16_internal).pass1_2)] ++ jmp r3 ++ ++.pass1_2: ++ mova [coeffq+16*1], m0 ++ mova [coeffq+16*3], m1 ++ mova [coeffq+16*5], m2 ++ mova [coeffq+16*7], m3 ++ mova m0, [coeffq+16*0] ++ mova m1, [coeffq+16*2] ++ mova m2, [coeffq+16*4] ++ mova m3, [coeffq+16*6] ++ lea tx2q, [o(m(idct_4x16_internal).pass1_end)] ++ jmp r3 ++ ++.pass1_end: ++ pop tx2q ++ ++ mova m4, [coeffq+16*1] ++ mova m5, 
[coeffq+16*3] ++ mova m6, [coeffq+16*5] ++ mova m7, [o(pw_16384)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ ++ pmulhrsw m7, [coeffq+16*7] ++ mova [coeffq+16*7], m7 ++ jmp tx2q ++ ++.pass2: ++ call m(idct_16x4_internal).main ++ ++.end: ++ mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] ++ mova [coeffq+16*4], m4 ++ ++.end1: ++ mova [coeffq+16*5], m5 ++ mova [coeffq+16*6], m6 ++ mov r3, coeffq ++ WRITE_4X8 0, 1, 3, 2 ++ ++ mova m0, [r3+16*4] ++ mova m1, [r3+16*5] ++ mova m2, [r3+16*6] ++ mova m3, m7 ++ lea dstq, [dstq+strideq*4] ++ WRITE_4X8 0, 1, 3, 2 ++ ++.end2: ++ pxor m7, m7 ++ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ++ ret ++ ++INV_TXFM_4X16_FN adst, dct, 0 ++INV_TXFM_4X16_FN adst, adst ++INV_TXFM_4X16_FN adst, flipadst ++INV_TXFM_4X16_FN adst, identity ++ ++cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(iadst_4x8_internal).pass1)] ++ jmp m(idct_4x16_internal).pass1 ++ ++.pass2: ++ call m(iadst_16x4_internal).main ++ ++ punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 ++ punpckhqdq m4, m5 ;low: out8 high: out10 ++ punpcklqdq m5, m7, m2 ;low: out4 high: out6 ++ punpckhqdq m2, m7 ;low: -out9 high: -out11 ++ mova [coeffq+16*4], m2 ++ mova [coeffq+16*5], m6 ++ mova m2, [coeffq+16*6] ++ mova m6, [coeffq+16*7] ++ punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 ++ punpcklqdq m0, m6 ;low: out0 high: out2 ++ punpckhqdq m6, m3, m2 ;low: out12 high: out14 ++ punpcklqdq m2, m3 ;low: -out1 high: -out3 ++ ++ mova m7, [o(pw_2048)] ++ ++.end1: ++ REPX {pmulhrsw x, m7}, m0, m5, m4, m6 ++ pxor m3, m3 ++ psubw m3, m7 ++ mova m7, [coeffq+16*4] ++ REPX {pmulhrsw x, m3}, m2, m7, m1 ++ pmulhrsw m3, [coeffq+16*5] ++ mova [coeffq+16*7], m5 ++ ++ punpckhqdq m5, m4, m7 ;low: out10 high: out11 ++ punpcklqdq m4, m7 ;low: out8 high: out9 ++ punpckhqdq m7, m6, m1 ;low: out14 high: out15 ++ punpcklqdq m6, m1 ;low: out12 high: out13 ++ punpckhqdq m1, m0, m2 ;low: out2 high: out3 ++ punpcklqdq m0, m2 ;low: out0 high: out1 ++ mova [coeffq+16*4], m4 ++ mova m4, [coeffq+16*7] ++ punpcklqdq m2, m4, m3 ;low: out4 high: out5 ++ punpckhqdq m4, m3 ;low: out6 high: out7 ++ mova m3, m4 ++ ++.end2: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 ++ mov r3, coeffq ++ WRITE_4X8 0, 1, 2, 3 ++ ++ mova m0, [r3+16*4] ++ mova m1, [r3+16*5] ++ mova m2, [r3+16*6] ++ mova m3, m7 ++ lea dstq, [dstq+strideq*4] ++ WRITE_4X8 0, 1, 2, 3 ++ ++.end3: ++ pxor m7, m7 ++ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ++ ret ++ ++ ++INV_TXFM_4X16_FN flipadst, dct, 0 ++INV_TXFM_4X16_FN flipadst, adst ++INV_TXFM_4X16_FN flipadst, flipadst ++INV_TXFM_4X16_FN flipadst, identity ++ ++cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(iflipadst_4x8_internal).pass1)] ++ jmp m(idct_4x16_internal).pass1 ++ ++.pass2: ++ call m(iadst_16x4_internal).main ++ ++ punpckhqdq m6, m5, m4 ;low: out5 high: out7 ++ punpcklqdq m4, m5 ;low: -out8 high: -out10 ++ punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 ++ punpcklqdq m2, m7 ;low: out9 high: out11 ++ mova [coeffq+16*4], m2 ++ mova [coeffq+16*5], m6 ++ mova m2, [coeffq+16*6] ++ mova m6, [coeffq+16*7] ++ punpcklqdq m1, m6, m0 ;low: out13 high: out15 ++ punpckhqdq m0, m6 ;low: -out0 high: -out2 ++ punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 ++ punpckhqdq m2, m3 ;low: out1 high: out3 ++ ++ mova m7, [o(pw_m2048)] ++ jmp m(iadst_4x16_internal).end1 ++ ++ ++INV_TXFM_4X16_FN identity, dct, 3 ++INV_TXFM_4X16_FN identity, adst ++INV_TXFM_4X16_FN identity, flipadst 
++INV_TXFM_4X16_FN identity, identity ++ ++cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(iidentity_4x8_internal).pass1)] ++ jmp m(idct_4x16_internal).pass1 ++ ++.pass2: ++ mova m7, [o(pw_5793x4)] ++ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6 ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ psllw m7, [coeffq+16*7], 2 ++ pmulhrsw m7, [o(pw_5793x4)] + mova [coeffq+16*7], m7 +- jmp m(idct_8x8_internal).end3 ++ ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ pmulhrsw m7, [coeffq+16*7] ++ mova [coeffq+16*4], m4 ++ jmp m(iadst_4x16_internal).end2 ++ ++ ++%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh ++ INV_TXFM_FN %1, %2, %3, 16x4, 8 ++%if %3 >= 0 ++%ifidn %1_%2, dct_identity ++ mova m3, [o(pw_2896x8)] ++ pmulhrsw m3, [coeffq] ++ mova m0, [o(pw_16384)] ++ pmulhrsw m3, m0 ++ psrlw m0, 3 ; pw_2048 ++ paddw m3, m3 ++ pmulhrsw m3, [o(pw_5793x4)] ++ pmulhrsw m3, m0 ++ punpcklwd m3, m3 ++ pshufd m0, m3, q0000 ++ pshufd m1, m3, q1111 ++ pshufd m2, m3, q2222 ++ pshufd m3, m3, q3333 ++ lea tx2q, [dstq+8] ++ call m(iadst_8x4_internal).end2 ++ add coeffq, 16*4 ++ mov dstq, tx2q ++ TAIL_CALL m(iadst_8x4_internal).end2 ++%elifidn %1_%2, identity_dct ++ mova m5, [o(pw_16384)] ++ mova m6, [o(pw_5793x4)] ++ mova m7, [o(pw_2896x8)] ++ mov r3d, 2 ++.main_loop: ++ mova m0, [coeffq+16*0] ++ mova m1, [coeffq+16*1] ++ mova m2, [coeffq+16*2] ++ mova m3, [coeffq+16*3] ++ punpckhwd m4, m0, m1 ++ punpcklwd m0, m1 ++ punpckhwd m1, m2, m3 ++ punpcklwd m2, m3 ++ punpcklwd m0, m4 ++ punpcklwd m2, m1 ++ punpcklqdq m0, m2 ++ psllw m0, 2 ++ pmulhrsw m0, m6 ++ pmulhrsw m0, m5 ++ psrlw m1, m5, 3 ; pw_2048 ++ pmulhrsw m0, m7 ++ pmulhrsw m0, m1 ++.end: ++ pxor m3, m3 ++ mova [coeffq+16*0], m3 ++ mova [coeffq+16*1], m3 ++ mova [coeffq+16*2], m3 ++ mova [coeffq+16*3], m3 ++ add coeffq, 16*4 ++ lea tx2q, [dstq+8] ++ WRITE_8X4 0, 0, 0, 0, 1, 2, 3 ++ mov dstq, tx2q ++ dec r3d ++ jg .main_loop ++ RET ++%else ++ movd m1, [o(pw_2896x8)] ++ pmulhrsw m0, m1, [coeffq] ++%ifidn %2, dct ++ movd m2, [o(pw_16384)] ++ mov [coeffq], eobd ++ mov r2d, 2 ++ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)] ++.dconly: ++ pmulhrsw m0, m2 ++ movd m2, [o(pw_2048)] ;intentionally rip-relative ++ pmulhrsw m0, m1 ++ pmulhrsw m0, m2 ++ pshuflw m0, m0, q0000 ++ punpcklwd m0, m0 ++ pxor m5, m5 ++.dconly_loop: ++ mova m1, [dstq] ++ mova m3, [dstq+strideq] ++ punpckhbw m2, m1, m5 ++ punpcklbw m1, m5 ++ punpckhbw m4, m3, m5 ++ punpcklbw m3, m5 ++ paddw m2, m0 ++ paddw m1, m0 ++ paddw m4, m0 ++ paddw m3, m0 ++ packuswb m1, m2 ++ packuswb m3, m4 ++ mova [dstq], m1 ++ mova [dstq+strideq], m3 ++ lea dstq, [dstq+strideq*2] ++ dec r2d ++ jg .dconly_loop ++ jmp tx2q ++.end: ++ RET ++%else ; adst / flipadst ++ movd m2, [o(pw_16384)] ++ pmulhrsw m0, m2 ++ pshuflw m0, m0, q0000 ++ punpcklwd m0, m0 ++ mov [coeffq], eobd ++ pmulhrsw m2, m0, [o(iadst4_dconly2b)] ++ pmulhrsw m0, [o(iadst4_dconly2a)] ++ mova m1, [o(pw_2048)] ++ pmulhrsw m0, m1 ++ pmulhrsw m2, m1 ++%ifidn %2, adst ++ punpckhqdq m1, m0, m0 ++ punpcklqdq m0, m0 ++ punpckhqdq m3, m2, m2 ++ punpcklqdq m2, m2 ++%else ; flipadst ++ mova m3, m0 ++ punpckhqdq m0, m2, m2 ++ punpcklqdq m1, m2, m2 ++ punpckhqdq m2, m3, m3 ++ punpcklqdq m3, m3 ++%endif ++ lea tx2q, [dstq+8] ++ call m(iadst_8x4_internal).end3 ++ mov dstq, tx2q ++ TAIL_CALL m(iadst_8x4_internal).end3 ++%endif ++%endif ++%endif ++%endmacro ++ ++%macro LOAD_7ROWS 2 ;src, stride ++ mova m0, [%1+%2*0] ++ mova m1, [%1+%2*1] ++ mova m2, [%1+%2*2] ++ mova m3, [%1+%2*3] ++ 
mova m4, [%1+%2*4] ++ mova m5, [%1+%2*5] ++ mova m6, [%1+%2*6] ++%endmacro ++ ++%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] ++ punpckhwd m%5, m%4, m%1 ;packed in13 in3 ++ punpcklwd m%1, m%4 ;packed in1 in15 ++ punpcklwd m%6, m%3, m%2 ;packed in9 in7 ++ punpckhwd m%2, m%3 ;packed in5 in11 ++ ++ mova m%7, [o(pd_2048)] ++ ITX_MUL2X_PACK %1, %4, %7, 401, 4076, 1 ;low: t8a high: t15a ++ ITX_MUL2X_PACK %6, %4, %7, 3166, 2598, 1 ;low: t9a high: t14a ++ ITX_MUL2X_PACK %2, %4, %7, 1931, 3612, 1 ;low: t10a high: t13a ++ ITX_MUL2X_PACK %5, %4, %7, 3920, 1189, 1 ;low: t11a high: t12a ++ psubsw m%4, m%1, m%6 ;low: t9 high: t14 ++ paddsw m%1, m%6 ;low: t8 high: t15 ++ psubsw m%3, m%5, m%2 ;low: t10 high: t13 ++ paddsw m%2, m%5 ;low: t11 high: t12 ++ punpcklqdq m%5, m%4, m%3 ;low: t9 high: t10 ++ punpckhqdq m%4, m%3 ;low: t14 high: t13 ++ punpcklwd m%6, m%4, m%5 ;packed t14 t9 ++ punpckhwd m%5, m%4 ;packed t10 t13 ++ pxor m%4, m%4 ++ psubw m%4, m%5 ;packed -t10 -t13 ++ ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a ++ ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a ++ psubsw m%3, m%1, m%2 ;low: t11a high: t12a ++ paddsw m%1, m%2 ;low: t8a high: t15a ++ psubsw m%5, m%6, m%4 ;low: t10 high: t13 ++ paddsw m%6, m%4 ;low: t9 high: t14 ++ mova m%7, [o(pw_2896x8)] ++ punpckhqdq m%4, m%3, m%5 ;low: t12a high: t13 ++ punpcklqdq m%3, m%5 ;low: t11a high: t10 ++ psubw m%2, m%4, m%3 ++ paddw m%3, m%4 ++ pmulhrsw m%2, m%7 ;low: t11 high: t10a ++ pmulhrsw m%3, m%7 ;low: t12 high: t13a ++ punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 ++ punpcklqdq m%1, m%6 ;low: t8a high: t9 ++%endmacro ++ ++INV_TXFM_16X4_FN dct, dct, 0 ++INV_TXFM_16X4_FN dct, adst, 0 ++INV_TXFM_16X4_FN dct, flipadst, 0 ++INV_TXFM_16X4_FN dct, identity, 3 ++ ++cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_7ROWS coeffq, 16 ++ call .main ++ ++.pass1_end: ++ punpckhwd m7, m0, m2 ;packed out1, out5 ++ punpcklwd m0, m2 ;packed out0, out4 ++ punpcklwd m2, m1, m3 ;packed out3, out7 ++ punpckhwd m1, m3 ;packed out2, out6 ++ mova [coeffq+16*6], m7 ++ mova m7, [coeffq+16*7] ++ punpckhwd m3, m4, m6 ;packed out9, out13 ++ punpcklwd m4, m6 ;packed out8, out12 ++ punpcklwd m6, m5, m7 ;packed out11, out15 ++ punpckhwd m5, m7 ;packed out10, out14 ++ ++.pass1_end2: ++ mova m7, [o(pw_16384)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ pmulhrsw m7, [coeffq+16*6] ++ mova [coeffq+16*6], m7 ++ ++.pass1_end3: ++ punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high ++ punpcklwd m3, m6 ;packed 9, 10, 13, 15 low ++ punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high ++ punpcklwd m4, m5 ;packed 8, 10, 12, 14 low ++ punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) ++ punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) ++ punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) ++ punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) ++ mova [coeffq+16*7], m3 ++ mova m3, [coeffq+16*6] ++ punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high ++ punpcklwd m3, m2 ;packed 1, 3, 5, 7 low ++ punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high ++ punpcklwd m0, m1 ;packed 0, 2, 4, 6 low ++ punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) ++ punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) ++ punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) ++ punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) ++ jmp tx2q ++ ++.pass2: ++ lea tx2q, [o(m(idct_8x4_internal).pass2)] ++ ++.pass2_end: ++ mova [coeffq+16*4], m4 ++ mova [coeffq+16*5], m5 ++ mova [coeffq+16*6], m6 ++ lea r3, [dstq+8] ++ call tx2q ++ ++ add coeffq, 16*4 ++ mova m0, [coeffq+16*0] ++ mova m1, 
[coeffq+16*1] ++ mova m2, [coeffq+16*2] ++ mova m3, [coeffq+16*3] ++ mov dstq, r3 ++ jmp tx2q ++ ++ALIGN function_align ++.main: ++ punpckhqdq m7, m0, m1 ;low:in1 high:in3 ++ punpcklqdq m0, m1 ++ punpcklqdq m1, m2, m3 ++ punpckhqdq m3, m2 ;low:in7 high:in5 ++ mova [coeffq+16*4], m7 ++ mova [coeffq+16*5], m3 ++ mova m7, [coeffq+16*7] ++ punpcklqdq m2, m4, m5 ++ punpckhqdq m4, m5 ;low:in9 high:in11 ++ punpcklqdq m3, m6, m7 ++ punpckhqdq m7, m6 ;low:in15 high:in13 ++ mova [coeffq+16*6], m4 ++ IDCT8_1D_PACKED ++ mova m6, [coeffq+16*4] ++ mova m4, [coeffq+16*5] ++ mova m5, [coeffq+16*6] ++ mova [coeffq+16*4], m1 ++ mova [coeffq+16*5], m2 ++ mova [coeffq+16*6], m3 ++ ++ IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 ++ ++ mova m1, [coeffq+16*4] ++ psubsw m3, m0, m7 ;low:out15 high:out14 ++ paddsw m0, m7 ;low:out0 high:out1 ++ psubsw m7, m1, m5 ;low:out12 high:out13 ++ paddsw m1, m5 ;low:out3 high:out2 ++ mova [coeffq+16*7], m3 ++ mova m2, [coeffq+16*5] ++ mova m3, [coeffq+16*6] ++ psubsw m5, m2, m4 ;low:out11 high:out10 ++ paddsw m2, m4 ;low:out4 high:out5 ++ psubsw m4, m3, m6 ;low:out8 high:out9 ++ paddsw m3, m6 ;low:out7 high:out6 ++ mova m6, m7 ++ ret ++ ++INV_TXFM_16X4_FN adst, dct ++INV_TXFM_16X4_FN adst, adst ++INV_TXFM_16X4_FN adst, flipadst ++INV_TXFM_16X4_FN adst, identity ++ ++cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_7ROWS coeffq, 16 ++ call .main ++ ++ punpckhwd m6, m7, m0 ;packed -out11, -out15 ++ punpcklwd m0, m7 ;packed out0, out4 ++ punpcklwd m7, m3, m4 ;packed -out3, -out7 ++ punpckhwd m4, m3 ;packed out8, out12 ++ mova m1, [coeffq+16*6] ++ punpcklwd m3, m1, m5 ;packed -out1, -out5 ++ punpckhwd m5, m1 ;packed out10, out14 ++ mova m1, [coeffq+16*7] ++ mova [coeffq+16*6], m3 ++ mova [coeffq+16*7], m7 ++ punpckhwd m3, m2, m1 ;packed -out9, -out13 ++ punpcklwd m1, m2 ;packed out2, out6 ++ ++ mova m7, [o(pw_16384)] ++ ++.pass1_end: ++ REPX {pmulhrsw x, m7}, m0, m1, m4, m5 ++ pxor m2, m2 ++ psubw m2, m7 ++ mova m7, [coeffq+16*6] ++ REPX {pmulhrsw x, m2}, m7, m3, m6 ++ pmulhrsw m2, [coeffq+16*7] ++ mova [coeffq+16*6], m7 ++ jmp m(idct_16x4_internal).pass1_end3 ++ ++.pass2: ++ lea tx2q, [o(m(iadst_8x4_internal).pass2)] ++ jmp m(idct_16x4_internal).pass2_end ++ ++ALIGN function_align ++.main: ++ mova [coeffq+16*6], m0 ++ pshufd m1, m1, q1032 ++ pshufd m2, m2, q1032 ++ punpckhwd m0, m6, m1 ;packed in13, in2 ++ punpcklwd m1, m6 ;packed in3, in12 ++ punpckhwd m6, m5, m2 ;packed in11, in4 ++ punpcklwd m2, m5 ;packed in5, in10 ++ mova m7, [o(pd_2048)] ++ ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3 ++ ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5 ++ ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11 ++ ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13 ++ psubsw m5, m0, m2 ;low:t10a high:t11a ++ paddsw m0, m2 ;low:t2a high:t3a ++ psubsw m2, m6, m1 ;low:t12a high:t13a ++ paddsw m6, m1 ;low:t4a high:t5a ++ punpcklqdq m1, m5 ++ punpckhwd m1, m5 ;packed t10a, t11a ++ punpcklqdq m5, m2 ++ punpckhwd m2, m5 ;packed t13a, t12a ++ ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11 ++ ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13 ++ mova [coeffq+16*4], m0 ++ mova [coeffq+16*5], m6 ++ mova m0, [coeffq+16*6] ++ mova m6, [coeffq+16*7] ++ pshufd m0, m0, q1032 ++ pshufd m3, m3, q1032 ++ punpckhwd m5, m6, m0 ;packed in15, in0 ++ punpcklwd m0, m6 ;packed in1, in14 ++ punpckhwd m6, m4, m3 ;packed in9, in6 ++ punpcklwd m3, m4 ;packed in7, in8 ++ ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1 ++ ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7 ++ 
ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9 ++ ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15 ++ psubsw m4, m5, m3 ;low:t8a high:t9a ++ paddsw m5, m3 ;low:t0a high:t1a ++ psubsw m3, m6, m0 ;low:t14a high:t15a ++ paddsw m6, m0 ;low:t6a high:t7a ++ punpcklqdq m0, m4 ++ punpckhwd m0, m4 ;packed t8a, t9a ++ punpcklqdq m4, m3 ++ punpckhwd m3, m4 ;packed t15a, t14a ++ ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9 ++ ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15 ++ psubsw m4, m0, m2 ;low:t12a high:t13a ++ paddsw m0, m2 ;low:t8a high:t9a ++ psubsw m2, m1, m3 ;low:t14a high:t15a ++ paddsw m1, m3 ;low:t10a high:t11a ++ punpcklqdq m3, m4 ++ punpckhwd m3, m4 ;packed t12a, t13a ++ punpcklqdq m4, m2 ++ punpckhwd m2, m4 ;packed t15a, t14a ++ ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13 ++ ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15 ++ psubsw m4, m0, m1 ;low:t10 high:t11 ++ paddsw m0, m1 ;low:-out1 high:out14 ++ psubsw m1, m3, m2 ;low:t14a high:t15a ++ paddsw m3, m2 ;low:out2 high:-out13 ++ punpckhqdq m2, m4, m1 ;low:t11 high:t15a ++ punpcklqdq m4, m1 ;low:t10 high:t14a ++ psubw m1, m4, m2 ++ paddw m2, m4 ++ mova [coeffq+16*6], m0 ++ mova [coeffq+16*7], m3 ++ mova m0, [coeffq+16*4] ++ mova m3, [coeffq+16*5] ++ psubsw m4, m5, m3 ;low:t4 high:t5 ++ paddsw m5, m3 ;low:t0 high:t1 ++ psubsw m3, m0 ,m6 ;low:t6 high:t7 ++ paddsw m0, m6 ;low:t2 high:t3 ++ punpcklqdq m6, m4 ++ punpckhwd m6, m4 ;packed t4, t5 ++ punpcklqdq m4, m3 ++ punpckhwd m3, m4 ;packed t7, t6 ++ ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a ++ ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a ++ psubsw m4, m5, m0 ;low:t2a high:t3a ++ paddsw m0, m5 ;low:out0 high:-out15 ++ psubsw m5, m6, m3 ;low:t6 high:t7 ++ paddsw m3, m6 ;low:-out3 high:out12 ++ mova m7, [o(pw_2896x8)] ++ punpckhqdq m6, m4, m5 ;low:t3a high:t7 ++ punpcklqdq m4, m5 ;low:t2a high:t6 ++ psubw m5, m4, m6 ++ paddw m4, m6 ++ pmulhrsw m1, m7 ;low:-out9 high:out10 ++ pmulhrsw m2, m7 ;low:out6 high:-out5 ++ pmulhrsw m5, m7 ;low:out8 high:-out11 ++ pmulhrsw m4, m7 ;low:-out7 high:out4 ++ punpckhqdq m7, m4, m5 ;low:out4 high:-out11 ++ punpcklqdq m4, m5 ;low:-out7 high:out8 ++ punpckhqdq m5, m2, m1 ;low:-out5 high:out10 ++ punpcklqdq m2, m1 ;low:out6 high:-out9 ++ ret ++ ++ ++INV_TXFM_16X4_FN flipadst, dct ++INV_TXFM_16X4_FN flipadst, adst ++INV_TXFM_16X4_FN flipadst, flipadst ++INV_TXFM_16X4_FN flipadst, identity ++ ++cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_7ROWS coeffq, 16 ++ call m(iadst_16x4_internal).main ++ ++ punpcklwd m6, m7, m0 ;packed out11, out15 ++ punpckhwd m0, m7 ;packed -out0, -out4 ++ punpckhwd m7, m3, m4 ;packed out3, out7 ++ punpcklwd m4, m3 ;packed -out8, -out12 ++ mova m1, [coeffq+16*6] ++ punpckhwd m3, m1, m5 ;packed out1, out5 ++ punpcklwd m5, m1 ;packed -out10, -out14 ++ mova m1, [coeffq+16*7] ++ mova [coeffq+16*6], m3 ++ mova [coeffq+16*7], m7 ++ punpcklwd m3, m2, m1 ;packed out9, out13 ++ punpckhwd m1, m2 ;packed -out2, -out6 ++ ++ mova m7, [o(pw_m16384)] ++ jmp m(iadst_16x4_internal).pass1_end ++ ++.pass2: ++ lea tx2q, [o(m(iflipadst_8x4_internal).pass2)] ++ jmp m(idct_16x4_internal).pass2_end ++ ++ ++INV_TXFM_16X4_FN identity, dct, 15 ++INV_TXFM_16X4_FN identity, adst ++INV_TXFM_16X4_FN identity, flipadst ++INV_TXFM_16X4_FN identity, identity ++ ++cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_7ROWS coeffq, 16 ++ mova m7, [o(pw_5793x4)] ++ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6 ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, 
m5, m6 ++ punpckhwd m7, m0, m2 ;packed out1, out5 ++ punpcklwd m0, m2 ;packed out0, out4 ++ punpckhwd m2, m1, m3 ;packed out3, out7 ++ punpcklwd m1, m3 ;packed out2, out6 ++ mova [coeffq+16*6], m7 ++ psllw m7, [coeffq+16*7], 2 ++ pmulhrsw m7, [o(pw_5793x4)] ++ punpckhwd m3, m4, m6 ;packed out9, out13 ++ punpcklwd m4, m6 ;packed out8, out12 ++ punpckhwd m6, m5, m7 ;packed out11, out15 ++ punpcklwd m5, m7 ;packed out10, out14 ++ jmp m(idct_16x4_internal).pass1_end2 ++ ++.pass2: ++ lea tx2q, [o(m(iidentity_8x4_internal).pass2)] ++ jmp m(idct_16x4_internal).pass2_end ++ ++ ++%macro SAVE_8ROWS 2 ;src, stride ++ mova [%1+%2*0], m0 ++ mova [%1+%2*1], m1 ++ mova [%1+%2*2], m2 ++ mova [%1+%2*3], m3 ++ mova [%1+%2*4], m4 ++ mova [%1+%2*5], m5 ++ mova [%1+%2*6], m6 ++ mova [%1+%2*7], m7 ++%endmacro ++ ++%macro ITX_8X16_LOAD_STACK_COEFS 0 ++ mova m0, [rsp+gprsize+16*3] ++ mova m1, [rsp+gprsize+16*4] ++ mova m2, [rsp+gprsize+16*5] ++ mova m3, [rsp+gprsize+16*6] ++ mova m4, [rsp+gprsize+16*7] ++ mova m5, [rsp+gprsize+16*8] ++ mova m6, [rsp+gprsize+16*9] ++ mova m7, [rsp+gprsize+32*5] ++%endmacro ++ ++%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh ++ INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12 ++%ifidn %1_%2, dct_dct ++ pshuflw m0, [coeffq], q0000 ++ punpcklwd m0, m0 ++ mova m1, [o(pw_2896x8)] ++ pmulhrsw m0, m1 ++ mova m2, [o(pw_16384)] ++ mov [coeffq], eobd ++ pmulhrsw m0, m1 ++ pmulhrsw m0, m2 ++ psrlw m2, 3 ; pw_2048 ++ pmulhrsw m0, m1 ++ pmulhrsw m0, m2 ++ mov r3d, 4 ++ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)] ++ jmp m(inv_txfm_add_dct_dct_8x8).loop ++.end: ++ RET ++%elifidn %1_%2, dct_identity ++ mov r3d, 2 ++.loop: ++ mova m0, [o(pw_2896x8)] ++ pmulhrsw m7, m0, [coeffq] ++ mova m1, [o(pw_16384)] ++ pxor m2, m2 ++ mova [coeffq], m2 ++ pmulhrsw m7, m0 ++ pmulhrsw m7, m1 ++ psrlw m1, 3 ; pw_2048 ++ psllw m7, 2 ++ pmulhrsw m7, [o(pw_5793x4)] ++ pmulhrsw m7, m1 ++ punpcklwd m0, m7, m7 ++ punpckhwd m7, m7 ++ pshufd m3, m0, q3333 ++ pshufd m2, m0, q2222 ++ pshufd m1, m0, q1111 ++ pshufd m0, m0, q0000 ++ call m(iadst_8x4_internal).end3 ++ pshufd m3, m7, q3333 ++ pshufd m2, m7, q2222 ++ pshufd m1, m7, q1111 ++ pshufd m0, m7, q0000 ++ lea dstq, [dstq+strideq*2] ++ call m(iadst_8x4_internal).end3 ++ ++ add coeffq, 16 ++ lea dstq, [dstq+strideq*2] ++ dec r3d ++ jg .loop ++ RET ++%elifidn %1_%2, identity_dct ++ movd m0, [coeffq+32*0] ++ punpcklwd m0, [coeffq+32*1] ++ movd m2, [coeffq+32*2] ++ punpcklwd m2, [coeffq+32*3] ++ add coeffq, 32*4 ++ movd m1, [coeffq+32*0] ++ punpcklwd m1, [coeffq+32*1] ++ movd m3, [coeffq+32*2] ++ punpcklwd m3, [coeffq+32*3] ++ mova m4, [o(pw_2896x8)] ++ xor eobd, eobd ++ mov [coeffq-32*4], eobd ++ mov [coeffq-32*3], eobd ++ mov [coeffq-32*2], eobd ++ mov [coeffq-32*1], eobd ++ punpckldq m0, m2 ++ punpckldq m1, m3 ++ punpcklqdq m0, m1 ++ pmulhrsw m0, m4 ++ pmulhrsw m0, m4 ++ pmulhrsw m0, [o(pw_2048)] ++ mov [coeffq+32*0], eobd ++ mov [coeffq+32*1], eobd ++ mov [coeffq+32*2], eobd ++ mov [coeffq+32*3], eobd ++ mov r3d, 4 ++ lea tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)] ++ jmp m(inv_txfm_add_dct_dct_8x8).loop ++.end: ++ RET ++%endif ++%endmacro ++ ++INV_TXFM_8X16_FN dct, dct, 0 ++INV_TXFM_8X16_FN dct, identity, 15 ++INV_TXFM_8X16_FN dct, adst ++INV_TXFM_8X16_FN dct, flipadst ++ ++cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(idct_8x8_internal).pass1)] ++ ++.pass1: ++ LOAD_8ROWS coeffq+16*1, 32, 1 ++ mov [rsp+gprsize+16*11], tx2q ++ lea tx2q, [o(m(idct_8x16_internal).pass1_end)] ++ jmp r3 ++ ++.pass1_end: ++ SAVE_8ROWS 
coeffq+16*1, 32 ++ LOAD_8ROWS coeffq+16*0, 32, 1 ++ mov tx2q, [rsp+gprsize+16*11] ++ jmp r3 ++ ++.pass2: ++ lea tx2q, [o(m(idct_8x16_internal).end)] ++ ++.pass2_pre: ++ mova [coeffq+16*2 ], m1 ++ mova [coeffq+16*6 ], m3 ++ mova [coeffq+16*10], m5 ++ mova [coeffq+16*14], m7 ++ mova m1, m2 ++ mova m2, m4 ++ mova m3, m6 ++ mova m4, [coeffq+16*1 ] ++ mova m5, [coeffq+16*5 ] ++ mova m6, [coeffq+16*9 ] ++ mova m7, [coeffq+16*13] ++ ++.pass2_main: ++ call m(idct_8x8_internal).main ++ ++ mova [rsp+gprsize+16*3], m0 ++ mova [rsp+gprsize+16*4], m1 ++ mova [rsp+gprsize+16*5], m2 ++ mova [rsp+gprsize+16*6], m3 ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*9], m6 ++ ++ mova m0, [coeffq+16*2 ] ++ mova m1, [coeffq+16*6 ] ++ mova m2, [coeffq+16*10] ++ mova m3, [coeffq+16*14] ++ mova m4, [coeffq+16*3 ] ++ mova m5, [coeffq+16*7 ] ++ mova m6, [coeffq+16*11] ++ mova m7, [coeffq+16*15] ++ call m(idct_16x8_internal).main ++ ++ mov r3, dstq ++ lea dstq, [dstq+strideq*8] ++ jmp m(idct_8x8_internal).end ++ ++.end: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(idct_8x8_internal).end ++ ++.end1: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ ret ++ ++INV_TXFM_8X16_FN adst, dct ++INV_TXFM_8X16_FN adst, adst ++INV_TXFM_8X16_FN adst, flipadst ++INV_TXFM_8X16_FN adst, identity ++ ++cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(iadst_8x8_internal).pass1)] ++ jmp m(idct_8x16_internal).pass1 ++ ++.pass2: ++ lea tx2q, [o(m(iadst_8x16_internal).end)] ++ ++.pass2_pre: ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*5], m6 ++ mova [rsp+gprsize+16*6], m7 ++ mova m0, m2 ++ mova m1, m3 ++ mova m2, m4 ++ mova m3, m5 ++ ++.pass2_main: ++ mova m4, [coeffq+16*1 ] ++ mova m5, [coeffq+16*3 ] ++ mova m6, [coeffq+16*13] ++ mova m7, [coeffq+16*15] ++ mova [rsp+gprsize+16*3], m4 ++ mova [rsp+gprsize+16*4], m5 ++ mova [rsp+gprsize+16*9], m6 ++ mova [rsp+gprsize+32*5], m7 ++ mova m4, [coeffq+16*5 ] ++ mova m5, [coeffq+16*7 ] ++ mova m6, [coeffq+16*9 ] ++ mova m7, [coeffq+16*11] ++ ++ call m(iadst_16x8_internal).main ++ ++ mov r3, dstq ++ lea dstq, [dstq+strideq*8] ++ jmp m(iadst_8x8_internal).end ++ ++.end: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(iadst_8x8_internal).end ++ ++ ++INV_TXFM_8X16_FN flipadst, dct ++INV_TXFM_8X16_FN flipadst, adst ++INV_TXFM_8X16_FN flipadst, flipadst ++INV_TXFM_8X16_FN flipadst, identity ++ ++cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ lea r3, [o(m(iflipadst_8x8_internal).pass1)] ++ jmp m(idct_8x16_internal).pass1 ++ ++.pass2: ++ lea tx2q, [o(m(iflipadst_8x16_internal).end)] ++ lea r3, [dstq+strideq*8] ++ ++.pass2_pre: ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*5], m6 ++ mova [rsp+gprsize+16*6], m7 ++ mova m0, m2 ++ mova m1, m3 ++ mova m2, m4 ++ mova m3, m5 ++ ++.pass2_main: ++ mova m4, [coeffq+16*1 ] ++ mova m5, [coeffq+16*3 ] ++ mova m6, [coeffq+16*13] ++ mova m7, [coeffq+16*15] ++ mova [rsp+gprsize+16*3], m4 ++ mova [rsp+gprsize+16*4], m5 ++ mova [rsp+gprsize+16*9], m6 ++ mova [rsp+gprsize+32*5], m7 ++ mova m4, [coeffq+16*5 ] ++ mova m5, [coeffq+16*7 ] ++ mova m6, [coeffq+16*9 ] ++ mova m7, [coeffq+16*11] ++ ++ call m(iadst_16x8_internal).main ++ jmp m(iflipadst_8x8_internal).end ++ ++.end: ++ 
ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(iflipadst_8x8_internal).end ++ ++ ++INV_TXFM_8X16_FN identity, dct, 7 ++INV_TXFM_8X16_FN identity, adst ++INV_TXFM_8X16_FN identity, flipadst ++INV_TXFM_8X16_FN identity, identity ++ ++cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_8ROWS coeffq+16*1, 32, 1 ++ mov r3, tx2q ++ lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)] ++ mova [rsp+gprsize+16*1], m6 ++ jmp m(idct_8x8_internal).pass1_end3 ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*1, 32 ++ LOAD_8ROWS coeffq+16*0, 32, 1 ++ mov tx2q, r3 ++ mova [rsp+gprsize+16*1], m6 ++ jmp m(idct_8x8_internal).pass1_end3 ++ ++.pass2: ++ lea tx2q, [o(m(iidentity_8x16_internal).end1)] ++ ++.end: ++ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 ++ pmulhrsw m7, [o(pw_5793x4)] ++ pmulhrsw m7, [o(pw_2048)] ++ mova [rsp+gprsize+16*0], m7 ++ mova m7, [o(pw_5793x4)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ mova [rsp+gprsize+16*1], m6 ++ mova [rsp+gprsize+16*2], m5 ++ jmp m(idct_8x8_internal).end3 ++ ++.end1: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ lea dstq, [dstq+strideq*2] ++ jmp .end ++ ++ ++%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh ++ INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12 ++%ifidn %1_%2, dct_dct ++ movd m1, [o(pw_2896x8)] ++ pmulhrsw m0, m1, [coeffq] ++ movd m2, [o(pw_16384)] ++ mov [coeffq], eobd ++ pmulhrsw m0, m1 ++ mov r2d, 4 ++ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)] ++ jmp m(inv_txfm_add_dct_dct_16x4).dconly ++.end: ++ RET ++%elifidn %1_%2, dct_identity ++ mova m7, [coeffq] ++ mova m0, [o(pw_2896x8)] ++ mova m1, [o(pw_16384)] ++ pxor m2, m2 ++ mova [coeffq], m2 ++ pmulhrsw m7, m0 ++ pmulhrsw m7, m0 ++ pmulhrsw m7, m1 ++ psrlw m1, 2 ; pw_4096 ++ pmulhrsw m7, m1 ++ punpcklwd m3, m7, m7 ++ punpckhwd m7, m7 ++ pshufd m0, m3, q0000 ++ pshufd m1, m3, q1111 ++ pshufd m2, m3, q2222 ++ pshufd m3, m3, q3333 ++ lea r3, [dstq+strideq*4] ++ lea tx2q, [dstq+8] ++ call m(iadst_8x4_internal).end2 ++ add coeffq, 16*4 ++ mov dstq, tx2q ++ call m(iadst_8x4_internal).end2 ++ mov dstq, r3 ++ add coeffq, 16*4 ++ pshufd m0, m7, q0000 ++ pshufd m1, m7, q1111 ++ pshufd m2, m7, q2222 ++ pshufd m3, m7, q3333 ++ lea tx2q, [dstq+8] ++ call m(iadst_8x4_internal).end2 ++ add coeffq, 16*4 ++ mov dstq, tx2q ++ TAIL_CALL m(iadst_8x4_internal).end2 ++%elifidn %1_%2, identity_dct ++ mova m5, [o(pw_16384)] ++ mova m6, [o(pw_5793x4)] ++ mova m7, [o(pw_2896x8)] ++ pxor m4, m4 ++ mov r3d, 2 ++.main_loop: ++ mova m0, [coeffq+16*0] ++ punpcklwd m0, [coeffq+16*1] ++ mova m1, [coeffq+16*2] ++ punpcklwd m1, [coeffq+16*3] ++ mova m2, [coeffq+16*4] ++ punpcklwd m2, [coeffq+16*5] ++ mova m3, [coeffq+16*6] ++ punpcklwd m3, [coeffq+16*7] ++ punpckldq m0, m1 ++ punpckldq m2, m3 ++ punpcklqdq m0, m2 ++ pmulhrsw m0, m7 ++ psllw m0, 2 ++ pmulhrsw m0, m6 ++ pmulhrsw m0, m5 ++ psrlw m1, m5, 3 ; pw_2048 ++ pmulhrsw m0, m7 ++ pmulhrsw m0, m1 ++.end: ++ REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 ++ add coeffq, 16*8 ++ lea tx2q, [dstq+8] ++ WRITE_8X4 0, 0, 0, 0, 1, 2, 3 ++ lea dstq, [dstq+strideq*2] ++ WRITE_8X4 0, 0, 0, 0, 1, 2, 3 ++ mov dstq, tx2q ++ dec r3d ++ jg .main_loop ++ RET ++%endif ++%endmacro ++ ++INV_TXFM_16X8_FN dct, dct, 0 ++INV_TXFM_16X8_FN dct, identity, 7 ++INV_TXFM_16X8_FN dct, adst ++INV_TXFM_16X8_FN dct, flipadst ++ ++cglobal idct_16x8_internal, 0, 0, 0, dst, 
stride, coeff, eob, tx2 ++ LOAD_8ROWS coeffq+16*0, 32, 1 ++ call m(idct_8x8_internal).main ++ mova [rsp+gprsize+16*3], m0 ++ mova [rsp+gprsize+16*4], m1 ++ mova [rsp+gprsize+16*5], m2 ++ mova [rsp+gprsize+16*6], m3 ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*9], m6 ++ ++ LOAD_8ROWS coeffq+16*1, 32, 1 ++ call .main ++ mov r3, tx2q ++ lea tx2q, [o(m(idct_16x8_internal).pass1_end)] ++ jmp m(idct_8x8_internal).pass1_end ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*1, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ jmp m(idct_8x8_internal).pass1_end ++ ++.pass2: ++ lea tx2q, [o(m(idct_16x8_internal).end)] ++ lea r3, [dstq+8] ++ jmp m(idct_8x8_internal).pass2_main ++ ++.end: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(idct_8x8_internal).pass2_main ++ ++ ++ALIGN function_align ++.main: ++ mova [rsp+gprsize*2+16*1], m2 ++ mova [rsp+gprsize*2+16*2], m6 ++ mova [rsp+gprsize*2+32*5], m5 ++ ++ mova m6, [o(pd_2048)] ++ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a ++ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a ++ psubsw m2, m0, m4 ;t9 ++ paddsw m0, m4 ;t8 ++ psubsw m4, m7, m3 ;t14 ++ paddsw m7, m3 ;t15 ++ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a ++ mova m3, [rsp+gprsize*2+16*1] ++ mova m5, [rsp+gprsize*2+32*5] ++ mova [rsp+gprsize*2+16*1], m2 ++ mova [rsp+gprsize*2+32*5], m4 ++ mova m2, [rsp+gprsize*2+16*2] ++ mova [rsp+gprsize*2+16*2], m7 ++ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a ++ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a ++ pxor m4, m4 ++ psubsw m7, m2, m3 ;t10 ++ paddsw m2, m3 ;t11 ++ psubsw m3, m1, m5 ;t13 ++ paddsw m1, m5 ;t12 ++ psubw m4, m7 ++ ITX_MULSUB_2W 4, 3, 7, 5, 6, 1567, 3784 ;t10a, t13a ++ mova m7, [rsp+gprsize*2+32*5] ++ psubsw m6, m0, m2 ;t11a ++ paddsw m0, m2 ;t8a ++ paddsw m2, m7, m4 ;t9 ++ psubsw m7, m4 ;t10 ++ mova m5, [rsp+gprsize*2+16*0] ++ psubsw m4, m5, m0 ;out8 ++ paddsw m0, m5 ;out7 ++ mova [rsp+gprsize*2+32*5], m0 ++ mova m5, [rsp+gprsize*2+16*9] ++ psubsw m0, m5, m2 ;out9 ++ paddsw m2, m5 ;out6 ++ mova [rsp+gprsize*2+16*0], m0 ++ mova [rsp+gprsize*2+16*9], m2 ++ mova m0, [rsp+gprsize*2+16*1] ++ mova m2, [rsp+gprsize*2+16*2] ++ mova [rsp+gprsize*2+16*1], m4 ++ psubsw m4, m0, m3 ;t13 ++ paddsw m0, m3 ;t14 ++ psubsw m3, m2, m1 ;t12a ++ paddsw m1, m2 ;t15a ++ mova m5, [o(pw_2896x8)] ++ psubw m2, m4, m7 ;t13-t10 ++ paddw m7, m4 ;t13+t10 ++ psubw m4, m3, m6 ;t12a-t11a ++ paddw m6, m3 ;t12a+t11a ++ pmulhrsw m7, m5 ;t13a ++ pmulhrsw m4, m5 ;t11 ++ pmulhrsw m6, m5 ;t12 ++ pmulhrsw m5, m2 ;t10a ++ mova m3, [rsp+gprsize*2+16*8] ++ psubsw m2, m3, m5 ;out10 ++ paddsw m3, m5 ;out5 ++ mova m5, [rsp+gprsize*2+16*7] ++ mova [rsp+gprsize*2+16*8], m3 ++ psubsw m3, m5, m4 ;out11 ++ paddsw m5, m4 ;out4 ++ mova m4, [rsp+gprsize*2+16*6] ++ mova [rsp+gprsize*2+16*7], m5 ++ paddsw m5, m4, m6 ;out3 ++ psubsw m4, m6 ;out12 ++ mova m6, [rsp+gprsize*2+16*5] ++ mova [rsp+gprsize*2+16*6], m5 ++ psubsw m5, m6, m7 ;out13 ++ paddsw m6, m7 ;out2 ++ mova m7, [rsp+gprsize*2+16*4] ++ mova [rsp+gprsize*2+16*5], m6 ++ psubsw m6, m7, m0 ;out14 ++ paddsw m7, m0 ;out1 ++ mova m0, [rsp+gprsize*2+16*3] ++ mova [rsp+gprsize*2+16*4], m7 ++ psubsw m7, m0, m1 ;out15 ++ paddsw m0, m1 ;out0 ++ mova [rsp+gprsize*2+16*3], m0 ++ mova m1, [rsp+gprsize*2+16*0] ++ mova m0, [rsp+gprsize*2+16*1] ++ mova [rsp+gprsize*2+16*0], m7 ++ ret ++ ++INV_TXFM_16X8_FN adst, dct ++INV_TXFM_16X8_FN adst, adst ++INV_TXFM_16X8_FN adst, flipadst ++INV_TXFM_16X8_FN adst, 
identity ++ ++cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ mova m7, [o(pw_2896x8)] ++ pmulhrsw m0, m7, [coeffq+16*0 ] ++ pmulhrsw m1, m7, [coeffq+16*1 ] ++ pmulhrsw m2, m7, [coeffq+16*14] ++ pmulhrsw m3, m7, [coeffq+16*15] ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*9], m2 ++ mova [rsp+gprsize+32*5], m3 ++ pmulhrsw m0, m7, [coeffq+16*6 ] ++ pmulhrsw m1, m7, [coeffq+16*7 ] ++ pmulhrsw m2, m7, [coeffq+16*8 ] ++ pmulhrsw m3, m7, [coeffq+16*9 ] ++ mova [rsp+gprsize+16*3], m2 ++ mova [rsp+gprsize+16*4], m3 ++ mova [rsp+gprsize+16*5], m0 ++ mova [rsp+gprsize+16*6], m1 ++ pmulhrsw m0, m7, [coeffq+16*2 ] ++ pmulhrsw m1, m7, [coeffq+16*3 ] ++ pmulhrsw m2, m7, [coeffq+16*4 ] ++ pmulhrsw m3, m7, [coeffq+16*5 ] ++ pmulhrsw m4, m7, [coeffq+16*10] ++ pmulhrsw m5, m7, [coeffq+16*11] ++ pmulhrsw m6, m7, [coeffq+16*12] ++ pmulhrsw m7, [coeffq+16*13] ++ ++ call .main ++ mov r3, tx2q ++ lea tx2q, [o(m(iadst_16x8_internal).pass1_end)] ++ jmp m(iadst_8x8_internal).pass1_end ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*1, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ jmp m(iadst_8x8_internal).pass1_end ++ ++.pass2: ++ lea tx2q, [o(m(iadst_16x8_internal).end)] ++ lea r3, [dstq+8] ++ jmp m(iadst_8x8_internal).pass2_main ++ ++.end: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(iadst_8x8_internal).pass2_main ++ ++ALIGN function_align ++.main: ++ mova [rsp+gprsize*2+16*0], m1 ++ mova [rsp+gprsize*2+16*1], m2 ++ mova [rsp+gprsize*2+16*2], m6 ++ ++ mova m6, [o(pd_2048)] ++ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 ++ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 ++ psubsw m1, m0, m4 ;t10a ++ paddsw m0, m4 ;t2a ++ psubsw m4, m7, m3 ;t11a ++ paddsw m3, m7 ;t3a ++ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 ++ mova m2, [rsp+gprsize*2+16*0] ;in3 ++ mova m7, [rsp+gprsize*2+16*1] ;in4 ++ mova [rsp+gprsize*2+16*0], m1 ;t11 ++ mova [rsp+gprsize*2+16*1], m4 ;t10 ++ mova m1, [rsp+gprsize*2+16*2] ;in12 ++ mova [rsp+gprsize*2+16*2], m0 ;t2a ++ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 ++ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 ++ psubsw m0, m7, m1 ;t12a ++ paddsw m1, m7 ;t4a ++ psubsw m4, m5, m2 ;t13a ++ paddsw m5, m2 ;t5a ++ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 ++ mova m2, [rsp+gprsize*2+16*8] ;in1 ++ mova m7, [rsp+gprsize*2+16*9] ;in14 ++ mova [rsp+gprsize*2+16*8], m4 ;t12 ++ mova [rsp+gprsize*2+16*9], m0 ;t13 ++ mova m4, [rsp+gprsize*2+16*4] ;in9 ++ mova m0, [rsp+gprsize*2+16*5] ;in6 ++ mova [rsp+gprsize*2+16*4], m1 ;t4a ++ mova [rsp+gprsize*2+16*5], m5 ;t5a ++ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 ++ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 ++ psubsw m1, m0, m7 ;t14a ++ paddsw m0, m7 ;t6a ++ psubsw m5, m4, m2 ;t15a ++ paddsw m4, m2 ;t7a ++ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 ++ mova m2, [rsp+gprsize*2+16*2] ;t2a ++ mova [rsp+gprsize*2+16*2], m5 ;t14 ++ psubsw m7, m2, m0 ;t6 ++ paddsw m2, m0 ;t2 ++ psubsw m0, m3, m4 ;t7 ++ paddsw m3, m4 ;t3 ++ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a ++ mova m4, [rsp+gprsize*2+16*7] ;in0 ++ mova m5, [rsp+gprsize*2+32*5] ;in15 ++ mova [rsp+gprsize*2+16*7], m3 ;t3 ++ mova [rsp+gprsize*2+32*5], m1 ;t15 ++ mova m1, [rsp+gprsize*2+16*6] ;in7 ++ mova m3, [rsp+gprsize*2+16*3] ;in8 ++ mova [rsp+gprsize*2+16*6], m7 ;t7a ++ mova [rsp+gprsize*2+16*3], m0 ;t6a ++ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 ++ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 ++ psubsw m0, 
m4, m3 ;t8a ++ paddsw m4, m3 ;t0a ++ psubsw m3, m5, m1 ;t9a ++ paddsw m5, m1 ;t1a ++ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 ++ mova m1, [rsp+gprsize*2+16*4] ;t4a ++ mova m7, [rsp+gprsize*2+16*5] ;t5a ++ mova [rsp+gprsize*2+16*4], m3 ;t8 ++ mova [rsp+gprsize*2+16*5], m0 ;t9 ++ psubsw m0, m4, m1 ;t4 ++ paddsw m4, m1 ;t0 ++ psubsw m3, m5, m7 ;t5 ++ paddsw m5, m7 ;t1 ++ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a ++ mova m7, [rsp+gprsize*2+16*3] ;t6a ++ psubsw m1, m4, m2 ;t2a ++ paddsw m4, m2 ;out0 ++ mova [rsp+gprsize*2+16*3], m4 ;out0 ++ mova m4, [rsp+gprsize*2+16*6] ;t7a ++ psubsw m2, m3, m7 ;t6 ++ paddsw m3, m7 ;-out3 ++ mova [rsp+gprsize*2+16*6], m3 ;-out3 ++ psubsw m3, m0, m4 ;t7 ++ paddsw m0, m4 ;out12 ++ mova m7, [o(pw_2896x8)] ++ psubw m4, m2, m3 ++ paddw m2, m3 ++ mova m3, [rsp+gprsize*2+16*7] ;t3 ++ pmulhrsw m4, m7 ;-out11 ++ pmulhrsw m2, m7 ;out4 ++ mova [rsp+gprsize*2+16*7], m2 ;out4 ++ psubsw m2, m5, m3 ;t3a ++ paddsw m5, m3 ;-out15 ++ psubw m3, m1, m2 ++ paddw m1, m2 ++ mova m2, [rsp+gprsize*2+32*5] ;t15 ++ pmulhrsw m3, m7 ;out8 ++ pmulhrsw m1, m7 ;-out7 ++ mova [rsp+gprsize*2+32*5 ], m1 ;-out7 ++ mova m1, [rsp+gprsize*2+16*0] ;t11 ++ mova [rsp+gprsize*2+16*11], m3 ;out8 ++ mova [rsp+gprsize*2+16*0 ], m5 ;-out15 ++ mova m3, [rsp+gprsize*2+16*1] ;t10 ++ mova [rsp+gprsize*2+16*1 ], m4 ;-out11 ++ mova m4, [rsp+gprsize*2+16*2] ;t14 ++ mova [rsp+gprsize*2+16*2 ], m0 ;out12 ++ psubsw m0, m3, m4 ;t14a ++ paddsw m3, m4 ;t10a ++ psubsw m5, m1, m2 ;t15a ++ paddsw m1, m2 ;t11a ++ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 ++ mova m2, [rsp+gprsize*2+16*4] ;t8 ++ mova m4, [rsp+gprsize*2+16*5] ;t9 ++ mova [rsp+gprsize*2+16*4], m3 ;t10a ++ mova [rsp+gprsize*2+16*5], m1 ;t11a ++ mova m3, [rsp+gprsize*2+16*8] ;t12 ++ mova m1, [rsp+gprsize*2+16*9] ;t13 ++ mova [rsp+gprsize*2+16*8], m5 ;t14 ++ mova [rsp+gprsize*2+16*9], m0 ;t15 ++ psubsw m5, m2, m3 ;t12a ++ paddsw m2, m3 ;t8a ++ psubsw m0, m4, m1 ;t13a ++ paddsw m4, m1 ;t9a ++ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 ++ mova m6, [rsp+gprsize*2+16*4] ;t10a ++ mova m1, [rsp+gprsize*2+16*5] ;t11a ++ psubsw m3, m2, m6 ;t10 ++ paddsw m2, m6 ;-out1 ++ paddsw m6, m4, m1 ;out14 ++ psubsw m4, m1 ;t11 ++ psubw m1, m3, m4 ++ paddw m3, m4 ++ pmulhrsw m1, m7 ;-out9 ++ pmulhrsw m3, m7 ;out6 ++ mova [rsp+gprsize*2+16*4], m2 ;-out1 ++ mova m4, [rsp+gprsize*2+16*8] ;t14 ++ mova m2, [rsp+gprsize*2+16*9] ;t15 ++ mova [rsp+gprsize*2+16*9], m3 ;out6 ++ psubsw m3, m0, m4 ;t14a ++ paddsw m0, m4 ;out2 ++ psubsw m4, m5, m2 ;t15a ++ paddsw m5, m2 ;-out13 ++ psubw m2, m3, m4 ++ paddw m3, m4 ++ mova [rsp+gprsize*2+16*5], m0 ;out2 ++ pmulhrsw m3, m7 ;-out5 ++ pmulhrsw m2, m7 ;out10 ++ mova [rsp+gprsize*2+16*8], m3 ;-out5 ++ mova m0, [rsp+gprsize*2+16*11] ;out8 ++ mova m3, [rsp+gprsize*2+16*1 ] ;-out11 ++ mova m4, [rsp+gprsize*2+16*2 ] ;out12 ++ ret ++ ++INV_TXFM_16X8_FN flipadst, dct ++INV_TXFM_16X8_FN flipadst, adst ++INV_TXFM_16X8_FN flipadst, flipadst ++INV_TXFM_16X8_FN flipadst, identity ++ ++cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ mova m7, [o(pw_2896x8)] ++ pmulhrsw m0, m7, [coeffq+16*0 ] ++ pmulhrsw m1, m7, [coeffq+16*1 ] ++ pmulhrsw m2, m7, [coeffq+16*14] ++ pmulhrsw m3, m7, [coeffq+16*15] ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*9], m2 ++ mova [rsp+gprsize+32*5], m3 ++ pmulhrsw m0, m7, [coeffq+16*6 ] ++ pmulhrsw m1, m7, [coeffq+16*7 ] ++ pmulhrsw m2, m7, [coeffq+16*8 ] ++ pmulhrsw m3, m7, [coeffq+16*9 ] ++ mova [rsp+gprsize+16*3], m2 ++ mova [rsp+gprsize+16*4], 
m3 ++ mova [rsp+gprsize+16*5], m0 ++ mova [rsp+gprsize+16*6], m1 ++ pmulhrsw m0, m7, [coeffq+16*2 ] ++ pmulhrsw m1, m7, [coeffq+16*3 ] ++ pmulhrsw m2, m7, [coeffq+16*4 ] ++ pmulhrsw m3, m7, [coeffq+16*5 ] ++ pmulhrsw m4, m7, [coeffq+16*10] ++ pmulhrsw m5, m7, [coeffq+16*11] ++ pmulhrsw m6, m7, [coeffq+16*12] ++ pmulhrsw m7, [coeffq+16*13] ++ ++ call m(iadst_16x8_internal).main ++ ++ mova m7, [rsp+gprsize+16*0] ++ SAVE_8ROWS coeffq+16*0, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ mov r3, tx2q ++ lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)] ++ jmp m(iflipadst_8x8_internal).pass1_end ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*1, 32 ++ LOAD_8ROWS coeffq+16*0, 32 ++ mova [rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ jmp m(iflipadst_8x8_internal).pass1_end ++ ++.pass2: ++ lea tx2q, [o(m(iflipadst_16x8_internal).end)] ++ lea r3, [dstq+8] ++ jmp m(iflipadst_8x8_internal).pass2_main ++ ++.end: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(iflipadst_8x8_internal).pass2_main ++ ++ ++INV_TXFM_16X8_FN identity, dct, 15 ++INV_TXFM_16X8_FN identity, adst ++INV_TXFM_16X8_FN identity, flipadst ++INV_TXFM_16X8_FN identity, identity ++ ++cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ mova m7, [o(pw_2896x8)] ++ pmulhrsw m0, m7, [coeffq+16*8 ] ++ pmulhrsw m1, m7, [coeffq+16*9 ] ++ pmulhrsw m2, m7, [coeffq+16*10] ++ pmulhrsw m3, m7, [coeffq+16*11] ++ pmulhrsw m4, m7, [coeffq+16*12] ++ pmulhrsw m5, m7, [coeffq+16*13] ++ pmulhrsw m6, m7, [coeffq+16*14] ++ pmulhrsw m7, [coeffq+16*15] ++ ++ mov r3, tx2q ++ lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)] ++ ++.pass1: ++ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 ++ pmulhrsw m7, [o(pw_5793x4)] ++ mova [rsp+gprsize+16*0], m7 ++ ++ mova m7, [o(pw_5793x4)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ ++ jmp m(idct_8x8_internal).pass1_end ++ ++.pass1_end: ++ mova [coeffq+16*9 ], m4 ++ mova [coeffq+16*11], m5 ++ mova [coeffq+16*13], m6 ++ mova [coeffq+16*15], m7 ++ mova m4, [o(pw_2896x8)] ++ pmulhrsw m5, m4, [coeffq+16*5] ++ pmulhrsw m6, m4, [coeffq+16*6] ++ pmulhrsw m7, m4, [coeffq+16*7] ++ mova [coeffq+16*5 ], m2 ++ mova [coeffq+16*7 ], m3 ++ pmulhrsw m2, m4, [coeffq+16*2] ++ pmulhrsw m3, m4, [coeffq+16*3] ++ mova [coeffq+16*3 ], m1 ++ pmulhrsw m1, m4, [coeffq+16*1] ++ mova [coeffq+16*1 ], m0 ++ pmulhrsw m0, m4, [coeffq+16*0] ++ pmulhrsw m4, [coeffq+16*4] ++ ++ mov tx2q, r3 ++ jmp .pass1 ++ ++.pass2: ++ lea tx2q, [o(m(iidentity_16x8_internal).end)] ++ lea r3, [dstq+8] ++ jmp m(iidentity_8x8_internal).end ++ ++.end: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ mov dstq, r3 ++ jmp m(iidentity_8x8_internal).end ++ ++ ++%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh ++ INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12 ++%ifidn %1_%2, dct_dct ++ movd m1, [o(pw_2896x8)] ++ pmulhrsw m0, m1, [coeffq] ++ movd m2, [o(pw_8192)] ++ mov [coeffq], eobd ++ mov r2d, 8 ++ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16).end)] ++ jmp m(inv_txfm_add_dct_dct_16x4).dconly ++.end: ++ RET ++%elifidn %1_%2, dct_identity ++ mova m3, [o(pw_2896x8)] ++ pmulhrsw m2, m3, [coeffq+16*0] ++ pmulhrsw m3, [coeffq+16*1] ++ mova m0, [o(pw_8192)] ++ mova m1, [o(pw_5793x4)] ++ pshuflw m4, [o(deint_shuf)], q0000 ;pb_0_1 ++ punpcklwd m4, m4 ++ pcmpeqb m5, m5 ++ pxor m6, m6 ++ mova [coeffq+16*0], m6 ++ mova [coeffq+16*1], m6 ++ paddb m5, m5 ;pb_m2 ++ pmulhrsw m2, m0 ++ pmulhrsw m3, m0 ++ psrlw m0, 2 ;pw_2048 ++ psllw m2, 2 ++ psllw m3, 2 ++ pmulhrsw m2, 
m1 ++ pmulhrsw m3, m1 ++ pmulhrsw m2, m0 ++ pmulhrsw m3, m0 ++ mov r3d, 8 ++.loop: ++ mova m1, [dstq] ++ pshufb m0, m2, m4 ++ punpckhbw m7, m1, m6 ++ punpcklbw m1, m6 ++ paddw m7, m0 ++ paddw m1, m0 ++ packuswb m1, m7 ++ mova [dstq], m1 ++ mova m1, [dstq+strideq*8] ++ pshufb m0, m3, m4 ++ psubb m4, m5 ; += 2 ++ punpckhbw m7, m1, m6 ++ punpcklbw m1, m6 ++ paddw m7, m0 ++ paddw m1, m0 ++ packuswb m1, m7 ++ mova [dstq+strideq*8], m1 ++ add dstq, strideq ++ dec r3d ++ jg .loop ++ RET ++%elifidn %1_%2, identity_dct ++ mova m4, [o(pw_5793x4)] ++ mova m5, [o(pw_8192)] ++ mova m6, [o(pw_2896x8)] ++ psrlw m7, m5, 2 ;pw_2048 ++ xor eobd, eobd ++ lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)] ++ lea r3, [dstq+8] ++ mov [rsp+16*0], r3 ++.main: ++ movd m0, [coeffq+32*0] ++ punpcklwd m0, [coeffq+32*1] ++ movd m2, [coeffq+32*2] ++ punpcklwd m2, [coeffq+32*3] ++ add coeffq, 32*4 ++ movd m1, [coeffq+32*0] ++ punpcklwd m1, [coeffq+32*1] ++ movd m3, [coeffq+32*2] ++ punpcklwd m3, [coeffq+32*3] ++ xor eobd, eobd ++ mov [coeffq-32*4], eobd ++ mov [coeffq-32*3], eobd ++ mov [coeffq-32*2], eobd ++ mov [coeffq-32*1], eobd ++ punpckldq m0, m2 ++ punpckldq m1, m3 ++ punpcklqdq m0, m1 ++ psllw m0, 2 ++ pmulhrsw m0, m4 ++ pmulhrsw m0, m5 ++ pmulhrsw m0, m6 ++ pmulhrsw m0, m7 ++ mov [coeffq+32*0], eobd ++ mov [coeffq+32*1], eobd ++ mov [coeffq+32*2], eobd ++ mov [coeffq+32*3], eobd ++ mov r3d, 4 ++ jmp m(inv_txfm_add_dct_dct_8x8).loop ++.end: ++ lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end1)] ++ add coeffq, 32*4 ++ mov dstq, [rsp+16*0] ++ jmp .main ++.end1: ++ RET ++%endif ++%endmacro ++ ++INV_TXFM_16X16_FN dct, dct, 0 ++INV_TXFM_16X16_FN dct, identity, 15 ++INV_TXFM_16X16_FN dct, adst ++INV_TXFM_16X16_FN dct, flipadst ++ ++cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ mova m0, [coeffq+16*1 ] ++ mova m1, [coeffq+16*5 ] ++ mova m2, [coeffq+16*9 ] ++ mova m3, [coeffq+16*13] ++ mova m4, [coeffq+16*17] ++ mova m5, [coeffq+16*21] ++ mova m6, [coeffq+16*25] ++ mova m7, [coeffq+16*29] ++ call m(idct_8x8_internal).main ++ mova [rsp+gprsize+16*3], m0 ++ mova [rsp+gprsize+16*4], m1 ++ mova [rsp+gprsize+16*5], m2 ++ mova [rsp+gprsize+16*6], m3 ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*9], m6 ++ mova m0, [coeffq+16*3 ] ++ mova m1, [coeffq+16*7 ] ++ mova m2, [coeffq+16*11] ++ mova m3, [coeffq+16*15] ++ mova m4, [coeffq+16*19] ++ mova m5, [coeffq+16*23] ++ mova m6, [coeffq+16*27] ++ mova m7, [coeffq+16*31] ++ call m(idct_16x8_internal).main ++ mov r3, tx2q ++ lea tx2q, [o(m(idct_16x16_internal).pass1_end)] ++ mova m7, [o(pw_8192)] ++ jmp m(idct_8x8_internal).pass1_end1 ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*17, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_16x16_internal).pass1_end1)] ++ mova m7, [o(pw_8192)] ++ jmp m(idct_8x8_internal).pass1_end1 ++ ++.pass1_end1: ++ SAVE_8ROWS coeffq+16*1, 32 ++ mova m0, [coeffq+16*0 ] ++ mova m1, [coeffq+16*4 ] ++ mova m2, [coeffq+16*8 ] ++ mova m3, [coeffq+16*12] ++ mova m4, [coeffq+16*16] ++ mova m5, [coeffq+16*20] ++ mova m6, [coeffq+16*24] ++ mova m7, [coeffq+16*28] ++ call m(idct_8x8_internal).main ++ mova [rsp+gprsize+16*3], m0 ++ mova [rsp+gprsize+16*4], m1 ++ mova [rsp+gprsize+16*5], m2 ++ mova [rsp+gprsize+16*6], m3 ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*9], m6 ++ mova m0, [coeffq+16*2 ] ++ mova m1, [coeffq+16*6 ] ++ mova m2, [coeffq+16*10] ++ mova m3, [coeffq+16*14] ++ mova m4, [coeffq+16*18] ++ mova m5, 
[coeffq+16*22] ++ mova m6, [coeffq+16*26] ++ mova m7, [coeffq+16*30] ++ call m(idct_16x8_internal).main ++ lea tx2q, [o(m(idct_16x16_internal).pass1_end2)] ++ mova m7, [o(pw_8192)] ++ jmp m(idct_8x8_internal).pass1_end1 ++ ++.pass1_end2: ++ SAVE_8ROWS coeffq+16*16, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ mova m7, [o(pw_8192)] ++ jmp m(idct_8x8_internal).pass1_end1 ++ ++.pass2: ++ lea tx2q, [o(m(idct_16x16_internal).end)] ++ jmp m(idct_8x16_internal).pass2_pre ++ ++.end: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_16x16_internal).end1)] ++ mov dstq, r3 ++ lea r3, [dstq+8] ++ jmp m(idct_8x8_internal).end ++ ++.end1: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ ++ add coeffq, 32*8 ++ mov dstq, r3 ++ ++ mova m0, [coeffq+16*0 ] ++ mova m1, [coeffq+16*4 ] ++ mova m2, [coeffq+16*8 ] ++ mova m3, [coeffq+16*12] ++ mova m4, [coeffq+16*1 ] ++ mova m5, [coeffq+16*5 ] ++ mova m6, [coeffq+16*9 ] ++ mova m7, [coeffq+16*13] ++ lea tx2q, [o(m(idct_8x16_internal).end)] ++ jmp m(idct_8x16_internal).pass2_main ++ ++ ++%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 ++ mova m0, [coeffq+16*1 ] ++ mova m1, [coeffq+16*3 ] ++ mova m2, [coeffq+16*29] ++ mova m3, [coeffq+16*31] ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*9], m2 ++ mova [rsp+gprsize+32*5], m3 ++ mova m0, [coeffq+16*13] ++ mova m1, [coeffq+16*15] ++ mova m2, [coeffq+16*17] ++ mova m3, [coeffq+16*19] ++ mova [rsp+gprsize+16*3], m2 ++ mova [rsp+gprsize+16*4], m3 ++ mova [rsp+gprsize+16*5], m0 ++ mova [rsp+gprsize+16*6], m1 ++ mova m0, [coeffq+16*5 ] ++ mova m1, [coeffq+16*7 ] ++ mova m2, [coeffq+16*9 ] ++ mova m3, [coeffq+16*11] ++ mova m4, [coeffq+16*21] ++ mova m5, [coeffq+16*23] ++ mova m6, [coeffq+16*25] ++ mova m7, [coeffq+16*27] ++%endmacro ++ ++%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 ++ mova m0, [coeffq+16*0 ] ++ mova m1, [coeffq+16*2 ] ++ mova m2, [coeffq+16*28] ++ mova m3, [coeffq+16*30] ++ mova [rsp+gprsize+16*7], m0 ++ mova [rsp+gprsize+16*8], m1 ++ mova [rsp+gprsize+16*9], m2 ++ mova [rsp+gprsize+32*5], m3 ++ mova m0, [coeffq+16*12] ++ mova m1, [coeffq+16*14] ++ mova m2, [coeffq+16*16] ++ mova m3, [coeffq+16*18] ++ mova [rsp+gprsize+16*3], m2 ++ mova [rsp+gprsize+16*4], m3 ++ mova [rsp+gprsize+16*5], m0 ++ mova [rsp+gprsize+16*6], m1 ++ mova m0, [coeffq+16*4 ] ++ mova m1, [coeffq+16*6 ] ++ mova m2, [coeffq+16*8 ] ++ mova m3, [coeffq+16*10] ++ mova m4, [coeffq+16*20] ++ mova m5, [coeffq+16*22] ++ mova m6, [coeffq+16*24] ++ mova m7, [coeffq+16*26] ++%endmacro ++ ++INV_TXFM_16X16_FN adst, dct ++INV_TXFM_16X16_FN adst, adst ++INV_TXFM_16X16_FN adst, flipadst ++ ++cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ ITX_16X16_ADST_LOAD_ODD_COEFS ++ call m(iadst_16x8_internal).main ++ ++ mov r3, tx2q ++ lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] ++ mova m7, [o(pw_8192)] ++ jmp m(iadst_8x8_internal).pass1_end1 ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*17, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(iadst_16x16_internal).pass1_end1)] ++ mova m7, [o(pw_8192)] ++ jmp m(iadst_8x8_internal).pass1_end1 ++ ++.pass1_end1: ++ SAVE_8ROWS coeffq+16*1, 32 ++ ITX_16X16_ADST_LOAD_EVEN_COEFS ++ call m(iadst_16x8_internal).main ++ ++ lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] ++ mova m7, [o(pw_8192)] ++ jmp m(iadst_8x8_internal).pass1_end1 ++ ++.pass1_end2: ++ SAVE_8ROWS coeffq+16*16, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova 
[rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ mova m7, [o(pw_8192)] ++ jmp m(iadst_8x8_internal).pass1_end1 ++ ++.pass2: ++ lea tx2q, [o(m(iadst_16x16_internal).end)] ++ jmp m(iadst_8x16_internal).pass2_pre ++ ++.end: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(iadst_16x16_internal).end1)] ++ mov dstq, r3 ++ lea r3, [dstq+8] ++ jmp m(iadst_8x8_internal).end ++ ++.end1: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ ++ add coeffq, 32*8 ++ mov dstq, r3 ++ ++ mova m4, [coeffq+16*0 ] ++ mova m5, [coeffq+16*2 ] ++ mova m0, [coeffq+16*4 ] ++ mova m1, [coeffq+16*6 ] ++ mova m2, [coeffq+16*8 ] ++ mova m3, [coeffq+16*10] ++ mova m6, [coeffq+16*12] ++ mova m7, [coeffq+16*14] ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*5], m6 ++ mova [rsp+gprsize+16*6], m7 ++ lea tx2q, [o(m(iadst_8x16_internal).end)] ++ jmp m(iadst_8x16_internal).pass2_main ++ ++ ++INV_TXFM_16X16_FN flipadst, dct ++INV_TXFM_16X16_FN flipadst, adst ++INV_TXFM_16X16_FN flipadst, flipadst ++ ++cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ ITX_16X16_ADST_LOAD_ODD_COEFS ++ call m(iadst_16x8_internal).main ++ ++ mov r3, tx2q ++ lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] ++ mova m7, [o(pw_m8192)] ++ jmp m(iflipadst_8x8_internal).pass1_end1 ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*1, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)] ++ mova m7, [o(pw_m8192)] ++ jmp m(iflipadst_8x8_internal).pass1_end1 ++ ++.pass1_end1: ++ SAVE_8ROWS coeffq+16*17, 32 ++ ITX_16X16_ADST_LOAD_EVEN_COEFS ++ call m(iadst_16x8_internal).main ++ ++ mova m7, [rsp+gprsize+16*0] ++ SAVE_8ROWS coeffq+16*0, 32 ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)] ++ mova m7, [o(pw_m8192)] ++ jmp m(iflipadst_8x8_internal).pass1_end1 ++ ++.pass1_end2: ++ SAVE_8ROWS coeffq+16*16, 32 ++ LOAD_8ROWS coeffq+16* 0, 32 ++ mova [rsp+gprsize+16*0], m7 ++ mov tx2q, r3 ++ mova m7, [o(pw_m8192)] ++ jmp m(iflipadst_8x8_internal).pass1_end1 ++ ++.pass2: ++ lea tx2q, [o(m(iflipadst_16x16_internal).end)] ++ lea r3, [dstq+8] ++ jmp m(iflipadst_8x16_internal).pass2_pre ++ ++.end: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(iflipadst_16x16_internal).end1)] ++ lea dstq, [dstq+strideq*2] ++ jmp m(iflipadst_8x8_internal).end ++ ++.end1: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ ++ add coeffq, 32*8 ++ ++ mova m4, [coeffq+16*0 ] ++ mova m5, [coeffq+16*2 ] ++ mova m0, [coeffq+16*4 ] ++ mova m1, [coeffq+16*6 ] ++ mova m2, [coeffq+16*8 ] ++ mova m3, [coeffq+16*10] ++ mova m6, [coeffq+16*12] ++ mova m7, [coeffq+16*14] ++ mova [rsp+gprsize+16*7], m4 ++ mova [rsp+gprsize+16*8], m5 ++ mova [rsp+gprsize+16*5], m6 ++ mova [rsp+gprsize+16*6], m7 ++ ++ lea tx2q, [o(m(iflipadst_16x16_internal).end2)] ++ mov dstq, r3 ++ jmp m(iflipadst_8x16_internal).pass2_main ++ ++.end2: ++ ITX_8X16_LOAD_STACK_COEFS ++ mova [rsp+gprsize+16*0], m7 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ lea dstq, [dstq+strideq*2] ++ jmp m(iflipadst_8x8_internal).end ++ ++ ++INV_TXFM_16X16_FN identity, dct, 15 ++INV_TXFM_16X16_FN identity, identity ++ ++cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ++ LOAD_8ROWS coeffq+16*17, 32 ++ mov r3, tx2q ++ lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)] ++ ++.pass1: ++ 
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 ++ pmulhrsw m7, [o(pw_5793x4)] ++ mova [rsp+gprsize+16*0], m7 ++ ++ mova m7, [o(pw_5793x4)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ ++ mova m7, [o(pw_8192)] ++ jmp m(idct_8x8_internal).pass1_end1 ++ ++.pass1_end: ++ SAVE_8ROWS coeffq+16*17, 32 ++ LOAD_8ROWS coeffq+16* 1, 32 ++ lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)] ++ jmp .pass1 ++ ++.pass1_end1: ++ SAVE_8ROWS coeffq+16* 1, 32 ++ LOAD_8ROWS coeffq+16*16, 32 ++ lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)] ++ jmp .pass1 ++ ++.pass1_end2: ++ SAVE_8ROWS coeffq+16*16, 32 ++ LOAD_8ROWS coeffq+16* 0, 32 ++ mov tx2q, r3 ++ jmp .pass1 ++ ++.pass2: ++ lea r3, [dstq+8] ++ lea tx2q, [o(m(iidentity_16x16_internal).end1)] ++ ++.end: ++ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 ++ pmulhrsw m7, [o(pw_5793x4)] ++ pmulhrsw m7, [o(pw_2048)] ++ mova [rsp+gprsize+16*0], m7 ++ mova m7, [o(pw_5793x4)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ mova m7, [o(pw_2048)] ++ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ++ mova [rsp+gprsize+16*1], m6 ++ mova [rsp+gprsize+16*2], m5 ++ jmp m(idct_8x8_internal).end3 ++ ++.end1: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(iidentity_16x16_internal).end2)] ++ lea dstq, [dstq+strideq*2] ++ jmp .end ++ ++.end2: ++ pxor m7, m7 ++ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ++ ++ add coeffq, 32*8 ++ LOAD_8ROWS coeffq, 32 ++ lea tx2q, [o(m(iidentity_16x16_internal).end3)] ++ mov dstq, r3 ++ jmp .end ++ ++.end3: ++ LOAD_8ROWS coeffq+16*1, 32 ++ lea tx2q, [o(m(idct_8x16_internal).end1)] ++ lea dstq, [dstq+strideq*2] ++ jmp .end +diff --git third_party/dav1d/src/x86/looprestoration.asm third_party/dav1d/src/x86/looprestoration.asm +index a98e12e3e7f3..bf59a9bb0f9e 100644 +--- third_party/dav1d/src/x86/looprestoration.asm ++++ third_party/dav1d/src/x86/looprestoration.asm +@@ -42,14 +42,12 @@ pw_2048: times 2 dw 2048 + pw_16380: times 2 dw 16380 + pw_0_128: dw 0, 128 + pw_5_6: dw 5, 6 +-pw_82: times 2 dw 82 +-pw_91_5: dw 91, 5 + pd_6: dd 6 +-pd_255: dd 255 + pd_1024: dd 1024 +-pd_0x80000: dd 0x80000 ++pd_0xf0080029: dd 0xf0080029 ++pd_0xf00801c7: dd 0xf00801c7 + +-cextern sgr_x_by_xplus1 ++cextern sgr_x_by_x + + SECTION .text + +@@ -477,76 +475,65 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, + RET + + INIT_YMM avx2 +-cglobal sgr_calc_ab1, 4, 6, 14, a, b, w, h, s ++cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +- lea r5, [sgr_x_by_xplus1] +- pxor m6, m6 +- vpbroadcastd m7, [pw_91_5] ++ lea r5, [sgr_x_by_x-0xf03] + %ifidn sd, sm +- movd xm8, sd +- vpbroadcastd m8, xm8 ++ movd xm6, sd ++ vpbroadcastd m6, xm6 + %else +- vpbroadcastd m8, sm ++ vpbroadcastd m6, sm + %endif +- vpbroadcastd m9, [pd_0x80000] +- vpbroadcastd m10, [pd_255] +- psrad m12, m9, 8 ; pd_2048 +- psrad m11, m9, 11 ; pd_256 +- pcmpeqb m13, m13 ++ vpbroadcastd m8, [pd_0xf00801c7] ++ vpbroadcastd m9, [pw_256] ++ pcmpeqb m7, m7 ++ psrld m10, m9, 13 ; pd_2048 + DEFINE_ARGS a, b, w, h, x ++ + .loop_y: + mov xq, -2 + .loop_x: +- movu xm0, [aq+xq*4+ 0] +- movu xm1, [aq+xq*4+16] +- vinserti128 m0, [aq+xq*4+ 0+(384+16)*4], 1 +- vinserti128 m1, [aq+xq*4+16+(384+16)*4], 1 +- movu xm2, [bq+xq*2] +- vinserti128 m2, [bq+xq*2+(384+16)*2], 1 +- pslld m3, m0, 3 +- pslld m4, m1, 3 +- paddd m3, m0 ; aa * 9 [first half] +- paddd m4, m1 ; aa * 9 [second half] +- punpcklwd m0, m6, m2 +- punpckhwd m2, m6, m2 +- pmaddwd m1, m0, m0 +- pmaddwd 
m5, m2, m2 +- pmaddwd m0, m7 +- pmaddwd m2, m7 +- psubd m3, m1 ; p = aa * 9 - bb * bb [first half] +- psubd m4, m5 ; p = aa * 9 - bb * bb [second half] +- pmulld m3, m8 +- pmulld m4, m8 +- paddd m3, m9 +- paddd m4, m9 +- psrld m3, 20 ; z [first half] +- psrld m4, 20 ; z [second half] +- pminsd m3, m10 +- pminsd m4, m10 +- mova m5, m13 +- vpgatherdd m1, [r5+m3*4], m5 ; xx [first half] +- mova m5, m13 +- vpgatherdd m3, [r5+m4*4], m5 ; xx [second half] +- psubd m5, m11, m1 +- psubd m4, m11, m3 +- packssdw m1, m3 +- pmullw m5, m7 +- pmullw m4, m7 +- pmaddwd m5, m0 +- pmaddwd m4, m2 +- paddd m5, m12 +- paddd m4, m12 +- psrad m5, 12 +- psrad m4, 12 +- movu [bq+xq*2], xm1 +- vextracti128 [bq+xq*2+(384+16)*2], m1, 1 +- movu [aq+xq*4+ 0], xm5 +- movu [aq+xq*4+16], xm4 +- vextracti128 [aq+xq*4+ 0+(384+16)*4], m5, 1 +- vextracti128 [aq+xq*4+16+(384+16)*4], m4, 1 +- ++ pmovzxwd m0, [bq+xq*2] ++ pmovzxwd m1, [bq+xq*2+(384+16)*2] ++ movu m2, [aq+xq*4] ++ movu m3, [aq+xq*4+(384+16)*4] ++ pslld m4, m2, 3 ++ pslld m5, m3, 3 ++ paddd m2, m4 ; aa * 9 ++ paddd m3, m5 ++ pmaddwd m4, m0, m0 ++ pmaddwd m5, m1, m1 ++ pmaddwd m0, m8 ++ pmaddwd m1, m8 ++ psubd m2, m4 ; p = aa * 9 - bb * bb ++ psubd m3, m5 ++ pmulld m2, m6 ++ pmulld m3, m6 ++ paddusw m2, m8 ++ paddusw m3, m8 ++ psrld m2, 20 ; z ++ psrld m3, 20 ++ mova m5, m7 ++ vpgatherdd m4, [r5+m2], m5 ; xx ++ mova m5, m7 ++ vpgatherdd m2, [r5+m3], m5 ++ psrld m4, 24 ++ psrld m2, 24 ++ pmulld m0, m4 ++ pmulld m1, m2 ++ packssdw m4, m2 ++ psubw m4, m9, m4 ++ vpermq m4, m4, q3120 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrld m0, 12 ++ psrld m1, 12 ++ movu [bq+xq*2], xm4 ++ vextracti128 [bq+xq*2+(384+16)*2], m4, 1 ++ movu [aq+xq*4], m0 ++ movu [aq+xq*4+(384+16)*4], m1 + add xd, 8 + cmp xd, wd + jl .loop_x +@@ -815,7 +802,7 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, + mov ylimd, edged + and ylimd, 8 ; have_bottom + shr ylimd, 2 +- sub ylimd, 3 ; -2 if have_bottom=0, else 0 ++ sub ylimd, 3 ; -3 if have_bottom=0, else -1 + .loop_x: + lea yd, [hd+ylimd+2] + lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] +@@ -903,78 +890,67 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, + jmp .loop_y_noload + + INIT_YMM avx2 +-cglobal sgr_calc_ab2, 4, 6, 14, a, b, w, h, s ++cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s + sub aq, (384+16-1)*4 + sub bq, (384+16-1)*2 + add hd, 2 +- lea r5, [sgr_x_by_xplus1] +- pxor m6, m6 +- vpbroadcastd m7, [pw_82] ++ lea r5, [sgr_x_by_x-0xf03] + %ifidn sd, sm +- movd xm8, sd +- vpbroadcastd m8, xm8 ++ movd xm6, sd ++ vpbroadcastd m6, xm6 + %else +- vpbroadcastd m8, sm ++ vpbroadcastd m6, sm + %endif +- vpbroadcastd m9, [pd_0x80000] +- vpbroadcastd m10, [pd_255] +- psrad m12, m9, 8 ; pd_2048 +- psrad m11, m9, 11 ; pd_256 +- pcmpeqb m13, m13 ++ vpbroadcastd m8, [pd_0xf0080029] ++ vpbroadcastd m9, [pw_256] ++ pcmpeqb m7, m7 ++ psrld m10, m9, 15 ; pd_512 + DEFINE_ARGS a, b, w, h, x + .loop_y: + mov xq, -2 + .loop_x: +- movu xm0, [aq+xq*4+ 0] +- movu xm1, [aq+xq*4+16] +- vinserti128 m0, [aq+xq*4+32], 1 +- vinserti128 m1, [aq+xq*4+48], 1 +- movu m2, [bq+xq*2] +- pslld m3, m0, 5 ; aa * 32 [first half] +- pslld m4, m1, 5 ; aa * 32 [second half] +- paddd m3, m0 ; aa * 33 [first half] +- paddd m4, m1 ; aa * 33 [first half] +- pslld m0, 3 ; aa * 8 [first half] +- pslld m1, 3 ; aa * 8 [second half] +- psubd m3, m0 ; aa * 25 [first half] +- psubd m4, m1 ; aa * 25 [second half] +- punpcklwd m0, m2, m6 +- punpckhwd m2, m6 +- pmaddwd m1, m0, m0 +- pmaddwd m5, m2, m2 +- paddw m0, m0 +- paddw m2, m2 +- 
psubd m3, m1 ; p = aa * 25 - bb * bb [first half] +- psubd m4, m5 ; p = aa * 25 - bb * bb [second half] +- pmulld m3, m8 +- pmulld m4, m8 +- paddd m3, m9 +- paddd m4, m9 +- psrld m3, 20 ; z [first half] +- psrld m4, 20 ; z [second half] +- pminsd m3, m10 +- pminsd m4, m10 +- mova m5, m13 +- vpgatherdd m1, [r5+m3*4], m5 ; xx [first half] +- mova m5, m13 +- vpgatherdd m3, [r5+m4*4], m5 ; xx [second half] +- psubd m5, m11, m1 +- psubd m4, m11, m3 +- packssdw m1, m3 +- pmullw m5, m7 +- pmullw m4, m7 +- pmaddwd m5, m0 +- pmaddwd m4, m2 +- paddd m5, m12 +- paddd m4, m12 +- psrad m5, 12 +- psrad m4, 12 +- movu [bq+xq*2], m1 +- movu [aq+xq*4+ 0], xm5 +- movu [aq+xq*4+16], xm4 +- vextracti128 [aq+xq*4+32], m5, 1 +- vextracti128 [aq+xq*4+48], m4, 1 +- ++ pmovzxwd m0, [bq+xq*2+ 0] ++ pmovzxwd m1, [bq+xq*2+16] ++ movu m2, [aq+xq*4+ 0] ++ movu m3, [aq+xq*4+32] ++ pslld m4, m2, 3 ; aa * 8 ++ pslld m5, m3, 3 ++ paddd m2, m4 ; aa * 9 ++ paddd m3, m5 ++ paddd m4, m4 ; aa * 16 ++ paddd m5, m5 ++ paddd m2, m4 ; aa * 25 ++ paddd m3, m5 ++ pmaddwd m4, m0, m0 ++ pmaddwd m5, m1, m1 ++ psubd m2, m4 ; p = aa * 25 - bb * bb ++ psubd m3, m5 ++ pmulld m2, m6 ++ pmulld m3, m6 ++ paddusw m2, m8 ++ paddusw m3, m8 ++ psrld m2, 20 ; z ++ psrld m3, 20 ++ mova m5, m7 ++ vpgatherdd m4, [r5+m2], m5 ; xx ++ mova m5, m7 ++ vpgatherdd m2, [r5+m3], m5 ++ psrld m4, 24 ++ psrld m2, 24 ++ packssdw m3, m4, m2 ++ pmullw m4, m8 ++ pmullw m2, m8 ++ psubw m3, m9, m3 ++ vpermq m3, m3, q3120 ++ pmaddwd m0, m4 ++ pmaddwd m1, m2 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrld m0, 10 ++ psrld m1, 10 ++ movu [bq+xq*2], m3 ++ movu [aq+xq*4+ 0], m0 ++ movu [aq+xq*4+32], m1 + add xd, 16 + cmp xd, wd + jl .loop_x +diff --git third_party/dav1d/src/x86/looprestoration_init_tmpl.c third_party/dav1d/src/x86/looprestoration_init_tmpl.c +index d4fb71209d29..9068008f347d 100644 +--- third_party/dav1d/src/x86/looprestoration_init_tmpl.c ++++ third_party/dav1d/src/x86/looprestoration_init_tmpl.c +@@ -32,15 +32,6 @@ + #include "common/intops.h" + #include "src/tables.h" + +-#if BITDEPTH == 8 && ARCH_X86_64 +-void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4], +- const pixel *src, ptrdiff_t stride, +- const int16_t fh[7], const intptr_t w, +- int h, enum LrEdgeFlags edges); +-void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride, +- const int16_t *mid, int w, int h, +- const int16_t fv[7], enum LrEdgeFlags edges); +- + // Future potential optimizations: + // - special chroma versions which don't filter [0]/[6]; + // - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top +@@ -50,159 +41,186 @@ void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride, + // to run 32 (like filter_h_avx2), and then all vpermqs can go; + // - maybe split out the top/bottom filter_h_avx2 from the main body filter_h_avx2, + // since then the have_left condition can be inlined; +-// - consider having the wrapper (wiener_filter_avx2) also in hand-written ++// - consider having the wrapper (wiener_filter_${ext}) also in hand-written + // assembly, so the setup overhead is minimized. 
+ +-static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride, +- const pixel (*const left)[4], +- const pixel *lpf, const ptrdiff_t lpf_stride, +- const int w, const int h, const int16_t fh[7], +- const int16_t fv[7], const enum LrEdgeFlags edges) +-{ +- ALIGN_STK_32(int16_t, mid, 68 * 384,); +- +- // horizontal filter +- dav1d_wiener_filter_h_avx2(&mid[2 * 384], left, dst, dst_stride, +- fh, w, h, edges); +- if (edges & LR_HAVE_TOP) +- dav1d_wiener_filter_h_avx2(mid, NULL, lpf, lpf_stride, +- fh, w, 2, edges); +- if (edges & LR_HAVE_BOTTOM) +- dav1d_wiener_filter_h_avx2(&mid[(2 + h) * 384], NULL, +- lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride, +- fh, w, 2, edges); +- +- dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges); +-} +- +-void dav1d_sgr_box3_h_avx2(int32_t *sumsq, int16_t *sum, +- const pixel (*left)[4], +- const pixel *src, const ptrdiff_t stride, +- const int w, const int h, +- const enum LrEdgeFlags edges); +-void dav1d_sgr_box3_v_avx2(int32_t *sumsq, int16_t *sum, +- const int w, const int h, +- const enum LrEdgeFlags edges); +-void dav1d_sgr_calc_ab1_avx2(int32_t *a, int16_t *b, +- const int w, const int h, const int strength); +-void dav1d_sgr_finish_filter1_avx2(coef *tmp, +- const pixel *src, const ptrdiff_t stride, +- const int32_t *a, const int16_t *b, +- const int w, const int h); +- +-// filter with a 3x3 box (radius=1) +-static void dav1d_sgr_filter1_avx2(coef *tmp, +- const pixel *src, const ptrdiff_t stride, +- const pixel (*left)[4], +- const pixel *lpf, const ptrdiff_t lpf_stride, +- const int w, const int h, const int strength, +- const enum LrEdgeFlags edges) +-{ +- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); +- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; +- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); +- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; +- +- dav1d_sgr_box3_h_avx2(sumsq, sum, left, src, stride, w, h, edges); +- if (edges & LR_HAVE_TOP) +- dav1d_sgr_box3_h_avx2(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], +- NULL, lpf, lpf_stride, w, 2, edges); +- +- if (edges & LR_HAVE_BOTTOM) +- dav1d_sgr_box3_h_avx2(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), +- lpf_stride, w, 2, edges); +- +- dav1d_sgr_box3_v_avx2(sumsq, sum, w, h, edges); +- dav1d_sgr_calc_ab1_avx2(a, b, w, h, strength); +- dav1d_sgr_finish_filter1_avx2(tmp, src, stride, a, b, w, h); ++#define WIENER_FILTER(ext) \ ++\ ++void dav1d_wiener_filter_h_##ext(int16_t *dst, const pixel (*left)[4], \ ++ const pixel *src, ptrdiff_t stride, \ ++ const int16_t fh[7], const intptr_t w, \ ++ int h, enum LrEdgeFlags edges); \ ++void dav1d_wiener_filter_v_##ext(pixel *dst, ptrdiff_t stride, \ ++ const int16_t *mid, int w, int h, \ ++ const int16_t fv[7], enum LrEdgeFlags edges); \ ++\ ++static void wiener_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int16_t fh[7], \ ++ const int16_t fv[7], const enum LrEdgeFlags edges) \ ++{ \ ++ ALIGN_STK_32(int16_t, mid, 68 * 384,); \ ++\ ++ /* horizontal filter */ \ ++ dav1d_wiener_filter_h_##ext(&mid[2 * 384], left, dst, dst_stride, \ ++ fh, w, h, edges); \ ++ if (edges & LR_HAVE_TOP) \ ++ dav1d_wiener_filter_h_##ext(mid, NULL, lpf, lpf_stride, \ ++ fh, w, 2, edges); \ ++ if (edges & LR_HAVE_BOTTOM) \ ++ dav1d_wiener_filter_h_##ext(&mid[(2 + h) * 384], NULL, \ ++ lpf + 6 * 
PXSTRIDE(lpf_stride), lpf_stride, \ ++ fh, w, 2, edges); \ ++\ ++ dav1d_wiener_filter_v_##ext(dst, dst_stride, &mid[2*384], w, h, fv, edges); \ + } + +-void dav1d_sgr_box5_h_avx2(int32_t *sumsq, int16_t *sum, +- const pixel (*left)[4], +- const pixel *src, const ptrdiff_t stride, +- const int w, const int h, +- const enum LrEdgeFlags edges); +-void dav1d_sgr_box5_v_avx2(int32_t *sumsq, int16_t *sum, +- const int w, const int h, +- const enum LrEdgeFlags edges); +-void dav1d_sgr_calc_ab2_avx2(int32_t *a, int16_t *b, +- const int w, const int h, const int strength); +-void dav1d_sgr_finish_filter2_avx2(coef *tmp, +- const pixel *src, const ptrdiff_t stride, +- const int32_t *a, const int16_t *b, +- const int w, const int h); +- +-// filter with a 5x5 box (radius=2) +-static void dav1d_sgr_filter2_avx2(coef *tmp, +- const pixel *src, const ptrdiff_t stride, +- const pixel (*left)[4], +- const pixel *lpf, const ptrdiff_t lpf_stride, +- const int w, const int h, const int strength, +- const enum LrEdgeFlags edges) +-{ +- ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); +- int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; +- ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); +- int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; +- +- dav1d_sgr_box5_h_avx2(sumsq, sum, left, src, stride, w, h, edges); +- if (edges & LR_HAVE_TOP) +- dav1d_sgr_box5_h_avx2(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], +- NULL, lpf, lpf_stride, w, 2, edges); +- +- if (edges & LR_HAVE_BOTTOM) +- dav1d_sgr_box5_h_avx2(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], +- NULL, lpf + 6 * PXSTRIDE(lpf_stride), +- lpf_stride, w, 2, edges); +- +- dav1d_sgr_box5_v_avx2(sumsq, sum, w, h, edges); +- dav1d_sgr_calc_ab2_avx2(a, b, w, h, strength); +- dav1d_sgr_finish_filter2_avx2(tmp, src, stride, a, b, w, h); ++#define SGR_FILTER(ext) \ ++\ ++void dav1d_sgr_box3_h_##ext(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void dav1d_sgr_box3_v_##ext(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void dav1d_sgr_calc_ab1_##ext(int32_t *a, int16_t *b, \ ++ const int w, const int h, const int strength); \ ++void dav1d_sgr_finish_filter1_##ext(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ ++\ ++/* filter with a 3x3 box (radius=1) */ \ ++static void dav1d_sgr_filter1_##ext(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ ++{ \ ++ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ ++ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ ++ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ ++ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ ++\ ++ dav1d_sgr_box3_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ if (edges & LR_HAVE_TOP) \ ++ dav1d_sgr_box3_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ ++\ ++ if (edges & LR_HAVE_BOTTOM) \ ++ dav1d_sgr_box3_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ ++\ ++ dav1d_sgr_box3_v_##ext(sumsq, sum, w, h, edges); \ 
++ dav1d_sgr_calc_ab1_##ext(a, b, w, h, strength); \ ++ dav1d_sgr_finish_filter1_##ext(tmp, src, stride, a, b, w, h); \ ++} \ ++\ ++void dav1d_sgr_box5_h_##ext(int32_t *sumsq, int16_t *sum, \ ++ const pixel (*left)[4], \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void dav1d_sgr_box5_v_##ext(int32_t *sumsq, int16_t *sum, \ ++ const int w, const int h, \ ++ const enum LrEdgeFlags edges); \ ++void dav1d_sgr_calc_ab2_##ext(int32_t *a, int16_t *b, \ ++ const int w, const int h, const int strength); \ ++void dav1d_sgr_finish_filter2_##ext(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const int32_t *a, const int16_t *b, \ ++ const int w, const int h); \ ++\ ++/* filter with a 5x5 box (radius=2) */ \ ++static void dav1d_sgr_filter2_##ext(coef *tmp, \ ++ const pixel *src, const ptrdiff_t stride, \ ++ const pixel (*left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int strength, \ ++ const enum LrEdgeFlags edges) \ ++{ \ ++ ALIGN_STK_32(int32_t, sumsq_mem, (384 + 16) * 68 + 8,); \ ++ int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq; \ ++ ALIGN_STK_32(int16_t, sum_mem, (384 + 16) * 68 + 16,); \ ++ int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum; \ ++\ ++ dav1d_sgr_box5_h_##ext(sumsq, sum, left, src, stride, w, h, edges); \ ++ if (edges & LR_HAVE_TOP) \ ++ dav1d_sgr_box5_h_##ext(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)], \ ++ NULL, lpf, lpf_stride, w, 2, edges); \ ++\ ++ if (edges & LR_HAVE_BOTTOM) \ ++ dav1d_sgr_box5_h_##ext(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)], \ ++ NULL, lpf + 6 * PXSTRIDE(lpf_stride), \ ++ lpf_stride, w, 2, edges); \ ++\ ++ dav1d_sgr_box5_v_##ext(sumsq, sum, w, h, edges); \ ++ dav1d_sgr_calc_ab2_##ext(a, b, w, h, strength); \ ++ dav1d_sgr_finish_filter2_##ext(tmp, src, stride, a, b, w, h); \ ++} \ ++\ ++void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const int w, const int h, \ ++ const int wt); \ ++void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ ++ const coef *t1, const coef *t2, \ ++ const int w, const int h, \ ++ const int16_t wt[2]); \ ++\ ++static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ ++ const pixel (*const left)[4], \ ++ const pixel *lpf, const ptrdiff_t lpf_stride, \ ++ const int w, const int h, const int sgr_idx, \ ++ const int16_t sgr_wt[7], const enum LrEdgeFlags edges) \ ++{ \ ++ if (!dav1d_sgr_params[sgr_idx][0]) { \ ++ ALIGN_STK_32(coef, tmp, 64 * 384,); \ ++ dav1d_sgr_filter1_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ ++ dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); \ ++ } else if (!dav1d_sgr_params[sgr_idx][1]) { \ ++ ALIGN_STK_32(coef, tmp, 64 * 384,); \ ++ dav1d_sgr_filter2_##ext(tmp, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ ++ dav1d_sgr_weighted1_##ext(dst, dst_stride, tmp, w, h, sgr_wt[0]); \ ++ } else { \ ++ ALIGN_STK_32(coef, tmp1, 64 * 384,); \ ++ ALIGN_STK_32(coef, tmp2, 64 * 384,); \ ++ dav1d_sgr_filter2_##ext(tmp1, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ ++ dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ ++ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ ++ const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \ ++ dav1d_sgr_weighted2_##ext(dst, 
dst_stride, tmp1, tmp2, w, h, wt); \ ++ } \ + } + +-void dav1d_sgr_weighted1_avx2(pixel *dst, const ptrdiff_t stride, +- const coef *t1, const int w, const int h, +- const int wt); +-void dav1d_sgr_weighted2_avx2(pixel *dst, const ptrdiff_t stride, +- const coef *t1, const coef *t2, +- const int w, const int h, +- const int16_t wt[2]); ++#define DEF_LR_FILTERS(ext) \ ++WIENER_FILTER(ext) \ ++SGR_FILTER(ext) + +-static void sgr_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride, +- const pixel (*const left)[4], +- const pixel *lpf, const ptrdiff_t lpf_stride, +- const int w, const int h, const int sgr_idx, +- const int16_t sgr_wt[7], const enum LrEdgeFlags edges) +-{ +- if (!dav1d_sgr_params[sgr_idx][0]) { +- ALIGN_STK_32(coef, tmp, 64 * 384,); +- dav1d_sgr_filter1_avx2(tmp, dst, dst_stride, left, lpf, lpf_stride, +- w, h, dav1d_sgr_params[sgr_idx][3], edges); +- dav1d_sgr_weighted1_avx2(dst, dst_stride, tmp, w, h, (1 << 7) - sgr_wt[1]); +- } else if (!dav1d_sgr_params[sgr_idx][1]) { +- ALIGN_STK_32(coef, tmp, 64 * 384,); +- dav1d_sgr_filter2_avx2(tmp, dst, dst_stride, left, lpf, lpf_stride, +- w, h, dav1d_sgr_params[sgr_idx][2], edges); +- dav1d_sgr_weighted1_avx2(dst, dst_stride, tmp, w, h, sgr_wt[0]); +- } else { +- ALIGN_STK_32(coef, tmp1, 64 * 384,); +- ALIGN_STK_32(coef, tmp2, 64 * 384,); +- dav1d_sgr_filter2_avx2(tmp1, dst, dst_stride, left, lpf, lpf_stride, +- w, h, dav1d_sgr_params[sgr_idx][2], edges); +- dav1d_sgr_filter1_avx2(tmp2, dst, dst_stride, left, lpf, lpf_stride, +- w, h, dav1d_sgr_params[sgr_idx][3], edges); +- const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; +- dav1d_sgr_weighted2_avx2(dst, dst_stride, tmp1, tmp2, w, h, wt); +- } +-} ++#if BITDEPTH == 8 ++DEF_LR_FILTERS(ssse3) ++# if ARCH_X86_64 ++DEF_LR_FILTERS(avx2) ++# endif + #endif + + void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + +- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; ++ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; ++#if BITDEPTH == 8 ++ c->wiener = wiener_filter_ssse3; ++ c->selfguided = sgr_filter_ssse3; ++#endif + ++ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + #if BITDEPTH == 8 && ARCH_X86_64 + c->wiener = wiener_filter_avx2; + c->selfguided = sgr_filter_avx2; +diff --git third_party/dav1d/src/x86/looprestoration_ssse3.asm third_party/dav1d/src/x86/looprestoration_ssse3.asm +new file mode 100644 +index 000000000000..e9c83cc05963 +--- /dev/null ++++ third_party/dav1d/src/x86/looprestoration_ssse3.asm +@@ -0,0 +1,1826 @@ ++; Copyright © 2018, VideoLAN and dav1d authors ++; Copyright © 2018, Two Orioles, LLC ++; Copyright © 2018, VideoLabs ++; All rights reserved. ++; ++; Redistribution and use in source and binary forms, with or without ++; modification, are permitted provided that the following conditions are met: ++; ++; 1. Redistributions of source code must retain the above copyright notice, this ++; list of conditions and the following disclaimer. ++; ++; 2. Redistributions in binary form must reproduce the above copyright notice, ++; this list of conditions and the following disclaimer in the documentation ++; and/or other materials provided with the distribution. ++; ++; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ++; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++%include "config.asm" ++%include "ext/x86/x86inc.asm" ++ ++SECTION_RODATA 16 ++ ++pb_right_ext_mask: times 16 db 0xff ++ times 16 db 0 ++pb_14x0_1_2: times 14 db 0 ++ db 1, 2 ++pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 ++ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 ++pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 ++pb_0: times 16 db 0 ++pb_2: times 16 db 2 ++pb_3: times 16 db 3 ++pb_4: times 16 db 4 ++pb_15: times 16 db 15 ++pb_0_1: times 8 db 0, 1 ++pb_6_7: times 8 db 6, 7 ++pb_14_15: times 8 db 14, 15 ++pb_0_1_2_3: times 4 db 0, 1, 2, 3 ++pb_4_5_6_7: times 4 db 4, 5, 6, 7 ++pw_1: times 8 dw 1 ++pw_16: times 8 dw 16 ++pw_128: times 8 dw 128 ++pw_255: times 8 dw 255 ++pw_256: times 8 dw 256 ++pw_2048: times 8 dw 2048 ++pw_16380: times 8 dw 16380 ++pw_5_6: times 4 dw 5, 6 ++pw_0_128: times 4 dw 0, 128 ++pd_1024: times 4 dd 1024 ++%if ARCH_X86_32 ++pd_256: times 4 dd 256 ++pd_512: times 4 dd 512 ++pd_2048: times 4 dd 2048 ++%endif ++pd_0xF0080029: times 4 dd 0xF0080029 ++pd_0xF00801C7: times 4 dd 0XF00801C7 ++ ++cextern sgr_x_by_x ++ ++SECTION .text ++ ++%if ARCH_X86_32 ++ %define PIC_base_offset $$ ++ ++ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg ++ %assign pic_reg_stk_off 4 ++ %xdefine PIC_reg %1 ++ %if %2 == 1 ++ mov [esp], %1 ++ %endif ++ LEA PIC_reg, PIC_base_offset ++ %if %3 == 1 ++ XCHG_PIC_REG ++ %endif ++ %endmacro ++ ++ %macro XCHG_PIC_REG 0 ++ mov [esp+pic_reg_stk_off], PIC_reg ++ %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 ++ mov PIC_reg, [esp+pic_reg_stk_off] ++ %endmacro ++ ++ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) ++ ++%else ++ %macro XCHG_PIC_REG 0 ++ %endmacro ++ ++ %define PIC_sym(sym) (sym) ++%endif ++ ++;;;;;;;;;;;;;;;;;;;;;; ++;; wiener ;; ++;;;;;;;;;;;;;;;;;;;;;; ++ ++INIT_XMM ssse3 ++%if ARCH_X86_64 ++cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge ++ mov edged, edgem ++ movifnidn wd, wm ++ mov hd, hm ++ movq m15, [fhq] ++ pshufb m12, m15, [pb_6_7] ++ pshufb m13, m15, [pb_4] ++ pshufb m14, m15, [pb_2] ++ pshufb m15, m15, [pb_0] ++ mova m11, [pw_2048] ++ mova m10, [pw_16380] ++ lea r11, [pb_right_ext_mask] ++ ++ DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim ++%else ++cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge ++ mov wd, edgem ++ mov [esp+12], wd ++ mov wd, wm ++ mov hd, hm ++ SETUP_PIC hd ++ movq m0, [fhq] ++ pshufb m3, m0, [PIC_sym(pb_6_7)] ++ pshufb m2, m0, [PIC_sym(pb_4)] ++ pshufb m1, m0, [PIC_sym(pb_2)] ++ pshufb m0, m0, [PIC_sym(pb_0)] ++ ++ DEFINE_ARGS dst, left, src, stride, x, w, h, edge ++ ++ %define srcptrq srcq ++ %define dstptrq dstq ++ %define hd dword [esp] ++ %define edged dword [esp+12] ++ %define xlimd dword [esp+16] ++ ++ %define m10 [PIC_sym(pw_16380)] ++ %define m11 [PIC_sym(pw_2048)] ++ %define m12 [esp+0x14] ++ %define m13 [esp+0x24] ++ %define m14 [esp+0x34] ++ %define m15 [esp+0x44] ++ ++ mova m15, m0 ++ mova m14, m1 ++ 
mova m13, m2 ++ mova m12, m3 ++%endif ++ ++ ; if (edge & has_right) align_w_to_16 ++ ; else w -= 3, and use that as limit in x loop ++ test edged, 2 ; has_right ++ jnz .align ++ mov xlimd, -3 ++ jmp .loop ++.align: ++ add wd, 15 ++ and wd, ~15 ++%if ARCH_X86_64 ++ xor xlimd, xlimd ++%else ++ mov xlimd, 0 ++%endif ++ ++ ; main y loop for vertical filter ++.loop: ++%if ARCH_X86_64 ++ mov srcptrq, srcq ++ mov dstptrq, dstq ++ lea xd, [wq+xlimq] ++%else ++ mov [esp+8], srcq ++ mov [esp+4], dstq ++ mov xd, xlimd ++ add xd, wd ++%endif ++ ++ ; load left edge pixels ++ test edged, 1 ; have_left ++ jz .emu_left ++ test leftq, leftq ; left == NULL for the edge-extended bottom/top ++ jz .load_left_combined ++ movd m0, [leftq] ++ movd m1, [srcq] ++ punpckldq m0, m1 ++ pslldq m0, 9 ++ add leftq, 4 ++ jmp .left_load_done ++.load_left_combined: ++ movq m0, [srcq-3] ++ pslldq m0, 10 ++ jmp .left_load_done ++.emu_left: ++ movd m0, [srcq] ++ pshufb m0, [PIC_sym(pb_14x0_1_2)] ++ ++ ; load right edge pixels ++.left_load_done: ++ cmp xd, 16 ++ jg .main_load ++ test xd, xd ++ jg .load_and_splat ++ je .splat_right ++ ++ ; for very small images (w=[1-2]), edge-extend the original cache, ++ ; ugly, but only runs in very odd cases ++ add wd, wd ++%if ARCH_X86_64 ++ pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] ++%else ++ pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16] ++%endif ++ shr wd, 1 ++ ++ ; main x loop, mostly this starts in .main_load ++.splat_right: ++ ; no need to load new pixels, just extend them from the (possibly previously ++ ; extended) previous load into m0 ++ pshufb m1, m0, [PIC_sym(pb_15)] ++ jmp .main_loop ++.load_and_splat: ++ ; load new pixels and extend edge for right-most ++ movu m1, [srcptrq+3] ++%if ARCH_X86_64 ++ sub r11, xq ++ movu m2, [r11+16] ++ add r11, xq ++%else ++ sub PIC_reg, xd ++ movu m2, [PIC_sym(pb_right_ext_mask)+16] ++ add PIC_reg, xd ++%endif ++ movd m3, [srcptrq+2+xq] ++ pshufb m3, [PIC_sym(pb_0)] ++ pand m1, m2 ++ pxor m2, [PIC_sym(pb_right_ext_mask)] ++ pand m3, m2 ++ pxor m2, [PIC_sym(pb_right_ext_mask)] ++ por m1, m3 ++ jmp .main_loop ++.main_load: ++ ; load subsequent line ++ movu m1, [srcptrq+3] ++.main_loop: ++ palignr m2, m1, m0, 10 ++ palignr m3, m1, m0, 11 ++ palignr m4, m1, m0, 12 ++ palignr m5, m1, m0, 13 ++ palignr m6, m1, m0, 14 ++ palignr m7, m1, m0, 15 ++ ++%if ARCH_X86_32 ++ mova [esp+0x54], m1 ++ %define m8 m1 ++%endif ++ punpcklbw m0, m2, m1 ++ punpckhbw m2, m1 ++ punpcklbw m8, m3, m7 ++ punpckhbw m3, m7 ++ punpcklbw m7, m4, m6 ++ punpckhbw m4, m6 ++ pmaddubsw m0, m15 ++ pmaddubsw m2, m15 ++ pmaddubsw m8, m14 ++ pmaddubsw m3, m14 ++ pmaddubsw m7, m13 ++ pmaddubsw m4, m13 ++ paddw m0, m8 ++ paddw m2, m3 ++ pxor m3, m3 ++ punpcklbw m6, m5, m3 ++ punpckhbw m5, m3 ++ psllw m8, m6, 7 ++ psllw m3, m5, 7 ++ psubw m8, m10 ++ psubw m3, m10 ++ pmullw m6, m12 ++ pmullw m5, m12 ++ paddw m0, m7 ++ paddw m2, m4 ++ paddw m0, m6 ++ paddw m2, m5 ++ paddsw m0, m8 ++ paddsw m2, m3 ++ psraw m0, 3 ++ psraw m2, 3 ++ paddw m0, m11 ++ paddw m2, m11 ++ mova [dstptrq+ 0], m0 ++ mova [dstptrq+16], m2 ++ ++%if ARCH_X86_64 ++ mova m0, m1 ++%else ++ mova m0, [esp+0x54] ++%endif ++ add srcptrq, 16 ++ add dstptrq, 32 ++ sub xd, 16 ++ cmp xd, 16 ++ jg .main_load ++ test xd, xd ++ jg .load_and_splat ++ cmp xd, xlimd ++ jg .splat_right ++ ++%if ARCH_X86_32 ++ mov srcq, [esp+8] ++ mov dstq, [esp+4] ++%endif ++ add srcq, strideq ++ add dstq, 384*2 ++ dec hd ++ jg .loop ++ RET ++ ++%if ARCH_X86_64 ++cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge ++ mov 
edged, edgem ++ movifnidn fvq, fvmp ++ movifnidn hd, hm ++ movq m15, [fvq] ++ pshufb m14, m15, [pb_4_5_6_7] ++ pshufb m15, m15, [pb_0_1_2_3] ++ paddw m14, [pw_0_128] ++ movd m12, [pd_1024] ++ pshufd m12, m12, 0 ++ ++ DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr ++ ++ mov ylimd, edged ++ and ylimd, 8 ; have_bottom ++ shr ylimd, 2 ++ sub ylimd, 3 ++%else ++cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge ++ %define ylimd [esp+12] ++ ++ mov r5d, edgem ++ and r5d, 8 ++ shr r5d, 2 ++ sub r5d, 3 ++ mov ylimd, r5d ++ mov fvq, fvmp ++ mov edged, edgem ++ ++ SETUP_PIC edged ++ ++ movq m0, [fvq] ++ pshufb m1, m0, [PIC_sym(pb_4_5_6_7)] ++ pshufb m0, m0, [PIC_sym(pb_0_1_2_3)] ++ paddw m1, [PIC_sym(pw_0_128)] ++ mova [esp+0x50], m0 ++ mova [esp+0x40], m1 ++ ++ DEFINE_ARGS dst, stride, mid, w, h, y, edge ++ %define mptrq midq ++ %define dstptrq dstq ++ %define edged dword [esp] ++%endif ++ ++ ; main x loop for vertical filter, does one column of 16 pixels ++.loop_x: ++ mova m3, [midq] ; middle line ++ ++ ; load top pixels ++ test edged, 4 ; have_top ++ jz .emu_top ++ mova m0, [midq-384*4] ++ mova m2, [midq-384*2] ++ mova m1, m0 ++ jmp .load_bottom_pixels ++.emu_top: ++ mova m0, m3 ++ mova m1, m3 ++ mova m2, m3 ++ ++ ; load bottom pixels ++.load_bottom_pixels: ++ mov yd, hd ++%if ARCH_X86_64 ++ mov mptrq, midq ++ mov dstptrq, dstq ++ add yd, ylimd ++%else ++ mov [esp+8], midq ++ mov [esp+4], dstq ++ add yd, ylimd ++%endif ++ jg .load_threelines ++ ++ ; the remainder here is somewhat messy but only runs in very weird ++ ; circumstances at the bottom of the image in very small blocks (h=[1-3]), ++ ; so performance is not terribly important here... ++ je .load_twolines ++ cmp yd, -1 ++ je .load_oneline ++ ; h == 1 case ++ mova m5, m3 ++ mova m4, m3 ++ mova m6, m3 ++ jmp .loop ++.load_oneline: ++ ; h == 2 case ++ mova m4, [midq+384*2] ++ mova m5, m4 ++ mova m6, m4 ++ jmp .loop ++.load_twolines: ++ ; h == 3 case ++ mova m4, [midq+384*2] ++ mova m5, [midq+384*4] ++ mova m6, m5 ++ jmp .loop ++.load_threelines: ++ ; h > 3 case ++ mova m4, [midq+384*2] ++ mova m5, [midq+384*4] ++ ; third line loaded in main loop below ++ ++ ; main y loop for vertical filter ++.loop_load: ++ ; load one line into m6. if that pixel is no longer available, do ++ ; nothing, since m6 still has the data from the previous line in it. 
We ++ ; try to structure the loop so that the common case is evaluated fastest ++ mova m6, [mptrq+384*6] ++.loop: ++%if ARCH_X86_64 ++ paddw m7, m0, m6 ++ paddw m8, m1, m5 ++ paddw m9, m2, m4 ++ punpcklwd m10, m7, m8 ++ punpckhwd m7, m8 ++ punpcklwd m11, m9, m3 ++ punpckhwd m9, m3 ++ pmaddwd m10, m15 ++ pmaddwd m7, m15 ++ pmaddwd m11, m14 ++ pmaddwd m9, m14 ++ paddd m10, m12 ++ paddd m7, m12 ++ paddd m10, m11 ++ paddd m7, m9 ++ psrad m10, 11 ++ psrad m7, 11 ++ packssdw m10, m7 ++ packuswb m10, m10 ++ movq [dstptrq], m10 ++%else ++ mova [esp+0x30], m1 ++ mova [esp+0x20], m2 ++ mova [esp+0x10], m3 ++ paddw m0, m6 ++ paddw m1, m5 ++ paddw m2, m4 ++ punpcklwd m7, m2, m3 ++ punpckhwd m2, m3 ++ punpcklwd m3, m0, m1 ++ punpckhwd m0, m1 ++ mova m1, [esp+0x50] ++ pmaddwd m3, m1 ++ pmaddwd m0, m1 ++ mova m1, [esp+0x40] ++ pmaddwd m7, m1 ++ pmaddwd m2, m1 ++ paddd m3, [PIC_sym(pd_1024)] ++ paddd m0, [PIC_sym(pd_1024)] ++ paddd m3, m7 ++ paddd m0, m2 ++ psrad m3, 11 ++ psrad m0, 11 ++ packssdw m3, m0 ++ packuswb m3, m3 ++ movq [dstq], m3 ++ mova m1, [esp+0x30] ++ mova m2, [esp+0x20] ++ mova m3, [esp+0x10] ++%endif ++ ; shift pixels one position ++ mova m0, m1 ++ mova m1, m2 ++ mova m2, m3 ++ mova m3, m4 ++ mova m4, m5 ++ mova m5, m6 ++ add dstptrq, strideq ++ add mptrq, 384*2 ++ dec yd ++ jg .loop_load ++ ; for the bottom pixels, continue using m6 (as extended edge) ++ cmp yd, ylimd ++ jg .loop ++ ++%if ARCH_X86_32 ++ mov midq, [esp+8] ++ mov dstq, [esp+4] ++%endif ++ add dstq, 8 ++ add midq, 16 ++ sub wd, 8 ++ jg .loop_x ++ RET ++ ++;;;;;;;;;;;;;;;;;;;;;;;;;; ++;; self-guided ;; ++;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++%macro MULLD 2 ++ pmulhuw m5, %1, %2 ++ pmullw %1, %2 ++ pslld m5, 16 ++ paddd %1, m5 ++%endmacro ++ ++%macro GATHERDD 2 ++ mova m5, m7 ++ movd r6d, %2 ++ %if ARCH_X86_64 ++ movd %1, [r5+r6] ++ pextrw r6d, %2, 2 ++ pinsrw m5, [r5+r6+2], 3 ++ pextrw r6d, %2, 4 ++ pinsrw %1, [r5+r6+2], 5 ++ pextrw r6d, %2, 6 ++ pinsrw m5, [r5+r6+2], 7 ++ %else ++ movd %1, [PIC_sym(sgr_x_by_x-0xF03)+r6] ++ pextrw r6d, %2, 2 ++ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 3 ++ pextrw r6d, %2, 4 ++ pinsrw %1, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 5 ++ pextrw r6d, %2, 6 ++ pinsrw m5, [PIC_sym(sgr_x_by_x-0xF03)+r6+2], 7 ++ %endif ++ por %1, m5 ++%endmacro ++ ++%if ARCH_X86_64 ++cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++ mov xlimd, edgem ++ movifnidn xd, xm ++ mov hd, hm ++ mov edged, xlimd ++ and xlimd, 2 ; have_right ++ add xd, xlimd ++ xor xlimd, 2 ; 2*!have_right ++%else ++cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim ++ %define wq r0m ++ %define xlimd r1m ++ %define hd hmp ++ %define edged edgemp ++ ++ mov r6, edgem ++ and r6, 2 ; have_right ++ add xd, r6 ++ xor r6, 2 ; 2*!have_right ++ mov xlimd, r6 ++ SETUP_PIC r6, 0 ++%endif ++ ++ jnz .no_right ++ add xd, 7 ++ and xd, ~7 ++.no_right: ++ pxor m1, m1 ++ lea srcq, [srcq+xq] ++ lea sumq, [sumq+xq*2-2] ++ lea sumsqq, [sumsqq+xq*4-4] ++ neg xq ++ mov wq, xq ++%if ARCH_X86_64 ++ lea r10, [pb_right_ext_mask+16] ++%endif ++.loop_y: ++ mov xq, wq ++ ++ ; load left ++ test edged, 1 ; have_left ++ jz .no_left ++ test leftq, leftq ++ jz .load_left_from_main ++ movd m0, [leftq] ++ pslldq m0, 12 ++ add leftq, 4 ++ jmp .expand_x ++.no_left: ++ movd m0, [srcq+xq] ++ pshufb m0, [PIC_sym(pb_0)] ++ jmp .expand_x ++.load_left_from_main: ++ movd m0, [srcq+xq-2] ++ pslldq m0, 14 ++.expand_x: ++ punpckhbw xm0, xm1 ++ ++ ; when we reach this, m0 contains left two px in highest words ++ cmp xq, -8 ++ jle .loop_x 
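The MULLD helper defined just above is worth calling out: SSSE3 has no packed 32-bit multiply (pmulld is SSE4.1), so the macro rebuilds the low 32 bits of a 32-bit by 16-bit product from two 16-bit multiplies (pmullw for the low halves, pmulhuw for the carry into the upper half). A scalar model of the idea, with an illustrative function name:

    #include <stdint.h>

    /* Scalar model of MULLD: multiply a 32-bit value by a 16-bit factor
     * using only 16-bit multiplies. */
    static uint32_t mulld(uint32_t a, uint16_t b)
    {
        const uint32_t lo = (a & 0xffffu) * b;   /* low 16 bits of a, full product   */
        const uint32_t hi = (a >> 16) * b;       /* high 16 bits of a, low half kept */
        return lo + (hi << 16);                  /* same as (uint32_t)(a * b)        */
    }

GATHERDD plays a similar role for the missing per-lane gather, emulating the table lookups with scalar extracts and inserts.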
++.partial_load_and_extend: ++ movd m3, [srcq-4] ++ pshufb m3, [PIC_sym(pb_3)] ++ movq m2, [srcq+xq] ++ punpcklbw m2, m1 ++ punpcklbw m3, m1 ++%if ARCH_X86_64 ++ movu m4, [r10+xq*2] ++%else ++ movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] ++%endif ++ pand m2, m4 ++ pandn m4, m3 ++ por m2, m4 ++ jmp .loop_x_noload ++.right_extend: ++ pshufb m2, m0, [PIC_sym(pb_14_15)] ++ jmp .loop_x_noload ++ ++.loop_x: ++ movq m2, [srcq+xq] ++ punpcklbw m2, m1 ++.loop_x_noload: ++ palignr m3, m2, m0, 12 ++ palignr m4, m2, m0, 14 ++ ++ punpcklwd m5, m3, m2 ++ punpckhwd m6, m3, m2 ++ paddw m3, m4 ++ punpcklwd m7, m4, m1 ++ punpckhwd m4, m1 ++ pmaddwd m5, m5 ++ pmaddwd m6, m6 ++ pmaddwd m7, m7 ++ pmaddwd m4, m4 ++ paddd m5, m7 ++ paddd m6, m4 ++ paddw m3, m2 ++ movu [sumq+xq*2], m3 ++ movu [sumsqq+xq*4+ 0], m5 ++ movu [sumsqq+xq*4+16], m6 ++ ++ mova m0, m2 ++ add xq, 8 ++ ++ ; if x <= -8 we can reload more pixels ++ ; else if x < 0 we reload and extend (this implies have_right=0) ++ ; else if x < xlimd we extend from previous load (this implies have_right=0) ++ ; else we are done ++ ++ cmp xd, -8 ++ jle .loop_x ++ test xd, xd ++ jl .partial_load_and_extend ++ cmp xd, xlimd ++ jl .right_extend ++ ++ add sumsqq, (384+16)*4 ++ add sumq, (384+16)*2 ++ add srcq, strideq ++ dec hd ++ jg .loop_y ++ RET ++ ++%if ARCH_X86_64 ++cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim ++ movifnidn edged, edgem ++%else ++cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y ++ %define sumsq_baseq dword [esp+0] ++ %define sum_baseq dword [esp+4] ++ %define ylimd dword [esp+8] ++ %define m8 [esp+12] ++%endif ++ mov xq, -2 ++%if ARCH_X86_64 ++ mov ylimd, edged ++ and ylimd, 8 ; have_bottom ++ shr ylimd, 2 ++ sub ylimd, 2 ; -2 if have_bottom=0, else 0 ++ mov sumsq_baseq, sumsqq ++ mov sum_baseq, sumq ++.loop_x: ++ mov sumsqq, sumsq_baseq ++ mov sumq, sum_baseq ++ lea yd, [hd+ylimd+2] ++%else ++ mov yd, edged ++ and yd, 8 ; have_bottom ++ shr yd, 2 ++ sub yd, 2 ; -2 if have_bottom=0, else 0 ++ mov sumsq_baseq, sumsqq ++ mov sum_baseq, sumq ++ mov ylimd, yd ++.loop_x: ++ mov sumsqd, sumsq_baseq ++ mov sumd, sum_baseq ++ lea yd, [hd+2] ++ add yd, ylimd ++%endif ++ lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] ++ lea sumq, [sumq+xq*2+2-(384+16)*2] ++ test edged, 4 ; have_top ++ jnz .load_top ++ movu m0, [sumsqq+(384+16)*4*1] ++ movu m1, [sumsqq+(384+16)*4*1+16] ++ mova m2, m0 ++ mova m3, m1 ++ mova m4, m0 ++ mova m5, m1 ++ movu m6, [sumq+(384+16)*2*1] ++ mova m7, m6 ++ mova m8, m6 ++ jmp .loop_y_noload ++.load_top: ++ movu m0, [sumsqq-(384+16)*4*1] ; l2sq [left] ++ movu m1, [sumsqq-(384+16)*4*1+16] ; l2sq [right] ++ movu m2, [sumsqq-(384+16)*4*0] ; l1sq [left] ++ movu m3, [sumsqq-(384+16)*4*0+16] ; l1sq [right] ++ movu m6, [sumq-(384+16)*2*1] ; l2 ++ movu m7, [sumq-(384+16)*2*0] ; l1 ++.loop_y: ++%if ARCH_X86_64 ++ movu m8, [sumq+(384+16)*2*1] ; l0 ++%else ++ movu m4, [sumq+(384+16)*2*1] ; l0 ++ mova m8, m4 ++%endif ++ movu m4, [sumsqq+(384+16)*4*1] ; l0sq [left] ++ movu m5, [sumsqq+(384+16)*4*1+16] ; l0sq [right] ++.loop_y_noload: ++ paddd m0, m2 ++ paddd m1, m3 ++ paddw m6, m7 ++ paddd m0, m4 ++ paddd m1, m5 ++ paddw m6, m8 ++ movu [sumsqq+ 0], m0 ++ movu [sumsqq+16], m1 ++ movu [sumq], m6 ++ ++ ; shift position down by one ++ mova m0, m2 ++ mova m1, m3 ++ mova m2, m4 ++ mova m3, m5 ++ mova m6, m7 ++ mova m7, m8 ++ add sumsqq, (384+16)*4 ++ add sumq, (384+16)*2 ++ dec yd ++ jg .loop_y ++ cmp yd, ylimd ++ jg .loop_y_noload ++ add xd, 8 ++ cmp xd, wd ++ jl .loop_x ++ RET ++ ++cglobal sgr_calc_ab1, 
4, 7, 12, a, b, w, h, s ++ movifnidn sd, sm ++ sub aq, (384+16-1)*4 ++ sub bq, (384+16-1)*2 ++ add hd, 2 ++%if ARCH_X86_64 ++ LEA r5, sgr_x_by_x-0xF03 ++%else ++ SETUP_PIC r5, 0 ++%endif ++ movd m6, sd ++ pshuflw m6, m6, q0000 ++ punpcklqdq m6, m6 ++ pxor m7, m7 ++ DEFINE_ARGS a, b, w, h, x ++%if ARCH_X86_64 ++ mova m8, [pd_0xF00801C7] ++ mova m9, [pw_256] ++ psrld m10, m9, 13 ; pd_2048 ++ mova m11, [pb_unpcklwdw] ++%else ++ %define m8 [PIC_sym(pd_0xF00801C7)] ++ %define m9 [PIC_sym(pw_256)] ++ %define m10 [PIC_sym(pd_2048)] ++ %define m11 [PIC_sym(pb_unpcklwdw)] ++%endif ++.loop_y: ++ mov xq, -2 ++.loop_x: ++ movq m0, [bq+xq*2] ++ movq m1, [bq+xq*2+(384+16)*2] ++ punpcklwd m0, m7 ++ punpcklwd m1, m7 ++ movu m2, [aq+xq*4] ++ movu m3, [aq+xq*4+(384+16)*4] ++ pslld m4, m2, 3 ++ pslld m5, m3, 3 ++ paddd m2, m4 ; aa * 9 ++ paddd m3, m5 ++ pmaddwd m4, m0, m0 ++ pmaddwd m5, m1, m1 ++ pmaddwd m0, m8 ++ pmaddwd m1, m8 ++ psubd m2, m4 ; p = aa * 9 - bb * bb ++ psubd m3, m5 ++ MULLD m2, m6 ++ MULLD m3, m6 ++ paddusw m2, m8 ++ paddusw m3, m8 ++ psrld m2, 20 ; z ++ psrld m3, 20 ++ GATHERDD m4, m2 ; xx ++ GATHERDD m2, m3 ++ psrld m4, 24 ++ psrld m2, 24 ++ packssdw m3, m4, m2 ++ pshufb m4, m11 ++ MULLD m0, m4 ++ pshufb m2, m11 ++ MULLD m1, m2 ++ psubw m5, m9, m3 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrld m0, 12 ++ psrld m1, 12 ++ movq [bq+xq*2], m5 ++ psrldq m5, 8 ++ movq [bq+xq*2+(384+16)*2], m5 ++ movu [aq+xq*4], m0 ++ movu [aq+xq*4+(384+16)*4], m1 ++ add xd, 4 ++ cmp xd, wd ++ jl .loop_x ++ add aq, (384+16)*4*2 ++ add bq, (384+16)*2*2 ++ sub hd, 2 ++ jg .loop_y ++ RET ++ ++%if ARCH_X86_64 ++cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ ++ tmp_base, src_base, a_base, b_base, x, y ++ movifnidn wd, wm ++ mov hd, hm ++ mova m15, [pw_16] ++ mov tmp_baseq, tq ++ mov src_baseq, srcq ++ mov a_baseq, aq ++ mov b_baseq, bq ++ xor xd, xd ++%else ++cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y ++ %define tmp_baseq [esp+8] ++ %define src_baseq [esp+12] ++ %define a_baseq [esp+16] ++ %define b_baseq [esp+20] ++ %define wd [esp+24] ++ %define hd [esp+28] ++ mov tmp_baseq, tq ++ mov src_baseq, srcq ++ mov a_baseq, aq ++ mov b_baseq, bq ++ mov wd, xd ++ mov hd, yd ++ xor xd, xd ++ SETUP_PIC yd, 1, 1 ++ jmp .loop_start ++%endif ++ ++.loop_x: ++ mov tq, tmp_baseq ++ mov srcq, src_baseq ++ mov aq, a_baseq ++ mov bq, b_baseq ++%if ARCH_X86_32 ++.loop_start: ++ movu m0, [bq+xq*2-(384+16)*2-2] ++ movu m2, [bq+xq*2-(384+16)*2+2] ++ mova m1, [bq+xq*2-(384+16)*2] ; b:top ++ paddw m0, m2 ; b:tl+tr ++ movu m2, [bq+xq*2-2] ++ movu m3, [bq+xq*2+2] ++ paddw m1, [bq+xq*2] ; b:top+ctr ++ paddw m2, m3 ; b:l+r ++ mova [esp+0x80], m0 ++ mova [esp+0x70], m1 ++ mova [esp+0x60], m2 ++%endif ++ movu m0, [aq+xq*4-(384+16)*4-4] ++ movu m2, [aq+xq*4-(384+16)*4+4] ++ mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] ++ paddd m0, m2 ; a:tl+tr [first half] ++ movu m2, [aq+xq*4-(384+16)*4-4+16] ++ movu m4, [aq+xq*4-(384+16)*4+4+16] ++ mova m3, [aq+xq*4-(384+16)*4+16] ; a:top [second half] ++ paddd m2, m4 ; a:tl+tr [second half] ++ movu m4, [aq+xq*4-4] ++ movu m5, [aq+xq*4+4] ++ paddd m1, [aq+xq*4] ; a:top+ctr [first half] ++ paddd m4, m5 ; a:l+r [first half] ++ movu m5, [aq+xq*4+16-4] ++ movu m6, [aq+xq*4+16+4] ++ paddd m3, [aq+xq*4+16] ; a:top+ctr [second half] ++ paddd m5, m6 ; a:l+r [second half] ++%if ARCH_X86_64 ++ movu m6, [bq+xq*2-(384+16)*2-2] ++ movu m8, [bq+xq*2-(384+16)*2+2] ++ mova m7, [bq+xq*2-(384+16)*2] ; b:top ++ paddw m6, m8 ; b:tl+tr ++ movu m8, [bq+xq*2-2] ++ movu m9, [bq+xq*2+2] 
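The comments in sgr_calc_ab1 just above ("p = aa * 9 - bb * bb", the ">> 20 ; z", the gathered "xx") describe the usual self-guided-restoration per-pixel step. A hedged scalar sketch of that step follows; the function and table parameter names are illustrative, the SSSE3 code folds the rounding and the 255 clamp into the saturating pd_0xF00801C7 add rather than writing them out, and the later conversion of the looked-up value into the final a/b coefficients is omitted here.

    #include <stdint.h>

    /* Sketch, not the dav1d reference: turn the 3x3 box sums into the
     * table index used for the a/b coefficients.  x_by_x stands in for
     * the sgr_x_by_x table referenced by the assembly. */
    static int sgr_box3_index(int32_t sumsq, int32_t sum, int strength,
                              const uint8_t x_by_x[256])
    {
        int64_t p = (int64_t)9 * sumsq - (int64_t)sum * sum;  /* "aa*9 - bb*bb" */
        if (p < 0) p = 0;
        int z = (int)((p * strength + (1 << 19)) >> 20);      /* scaled, then ">> 20 ; z" */
        if (z > 255) z = 255;
        return x_by_x[z];                                     /* GATHERDD lookup */
    }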
++ paddw m7, [bq+xq*2] ; b:top+ctr ++ paddw m8, m9 ; b:l+r ++%endif ++ ++ lea tq, [tq+xq*2] ++ lea srcq, [srcq+xq*1] ++ lea aq, [aq+xq*4+(384+16)*4] ++ lea bq, [bq+xq*2+(384+16)*2] ++ mov yd, hd ++.loop_y: ++%if ARCH_X86_64 ++ movu m9, [bq-2] ++ movu m10, [bq+2] ++ paddw m7, [bq] ; b:top+ctr+bottom ++ paddw m9, m10 ; b:bl+br ++ paddw m10, m7, m8 ; b:top+ctr+bottom+l+r ++ paddw m6, m9 ; b:tl+tr+bl+br ++ psubw m7, [bq-(384+16)*2*2] ; b:ctr+bottom ++ paddw m10, m6 ++ psllw m10, 2 ++ psubw m10, m6 ; aa ++ pxor m14, m14 ++ movq m12, [srcq] ++ punpcklbw m12, m14 ++ punpcklwd m6, m10, m15 ++ punpckhwd m10, m15 ++ punpcklwd m13, m12, m15 ++ punpckhwd m12, m15 ++ pmaddwd m6, m13 ; aa*src[x]+256 [first half] ++ pmaddwd m10, m12 ; aa*src[x]+256 [second half] ++%else ++ paddd m1, [aq] ; a:top+ctr+bottom [first half] ++ paddd m3, [aq+16] ; a:top+ctr+bottom [second half] ++ mova [esp+0x50], m1 ++ mova [esp+0x40], m3 ++ mova [esp+0x30], m4 ++ movu m6, [aq-4] ++ movu m7, [aq+4] ++ paddd m1, m4 ; a:top+ctr+bottom+l+r [first half] ++ paddd m3, m5 ; a:top+ctr+bottom+l+r [second half] ++ paddd m6, m7 ; a:bl+br [first half] ++ movu m7, [aq+16-4] ++ movu m4, [aq+16+4] ++ paddd m7, m4 ; a:bl+br [second half] ++ paddd m0, m6 ; a:tl+tr+bl+br [first half] ++ paddd m2, m7 ; a:tl+tr+bl+br [second half] ++ paddd m1, m0 ++ paddd m3, m2 ++ pslld m1, 2 ++ pslld m3, 2 ++ psubd m1, m0 ; bb [first half] ++ psubd m3, m2 ; bb [second half] ++%endif ++ ++%if ARCH_X86_64 ++ movu m11, [aq-4] ++ movu m12, [aq+4] ++ paddd m1, [aq] ; a:top+ctr+bottom [first half] ++ paddd m11, m12 ; a:bl+br [first half] ++ movu m12, [aq+16-4] ++ movu m13, [aq+16+4] ++ paddd m3, [aq+16] ; a:top+ctr+bottom [second half] ++ paddd m12, m13 ; a:bl+br [second half] ++ paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] ++ paddd m14, m3, m5 ; a:top+ctr+bottom+l+r [second half] ++ paddd m0, m11 ; a:tl+tr+bl+br [first half] ++ paddd m2, m12 ; a:tl+tr+bl+br [second half] ++ paddd m13, m0 ++ paddd m14, m2 ++ pslld m13, 2 ++ pslld m14, 2 ++ psubd m13, m0 ; bb [first half] ++ psubd m14, m2 ; bb [second half] ++ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] ++ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] ++%else ++ mova m4, [esp+0x80] ++ mova [esp+0x80], m5 ++ mova m5, [esp+0x70] ++ mova [esp+0x70], m6 ++ mova m6, [esp+0x60] ++ mova [esp+0x60], m7 ++ mova [esp+0x20], m1 ++ movu m7, [bq-2] ++ movu m1, [bq+2] ++ paddw m5, [bq] ; b:top+ctr+bottom ++ paddw m7, m1 ++ paddw m1, m5, m6 ; b:top+ctr+bottom+l+r ++ paddw m4, m7 ; b:tl+tr+bl+br ++ psubw m5, [bq-(384+16)*2*2] ; b:ctr+bottom ++ paddw m1, m4 ++ psllw m1, 2 ++ psubw m1, m4 ; aa ++ movq m0, [srcq] ++ XCHG_PIC_REG ++ punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16] ++ punpcklwd m4, m1, [PIC_sym(pw_16)] ++ punpckhwd m1, [PIC_sym(pw_16)] ++ punpcklwd m2, m0, [PIC_sym(pw_16)] ++ punpckhwd m0, [PIC_sym(pw_16)] ++ XCHG_PIC_REG ++ pmaddwd m4, m2 ; aa*src[x]+256 [first half] ++ pmaddwd m1, m0 ; aa*src[x]+256 [second half] ++%endif ++ ++%if ARCH_X86_64 ++ paddd m6, m13 ++ paddd m10, m14 ++ psrad m6, 9 ++ psrad m10, 9 ++ packssdw m6, m10 ++ mova [tq], m6 ++%else ++ paddd m4, [esp+0x20] ++ paddd m1, m3 ++ psrad m4, 9 ++ psrad m1, 9 ++ packssdw m4, m1 ++ mova [tq], m4 ++%endif ++ ++ ; shift to next row ++%if ARCH_X86_64 ++ mova m0, m4 ++ mova m2, m5 ++ mova m4, m11 ++ mova m5, m12 ++ mova m6, m8 ++ mova m8, m9 ++%else ++ mova m1, [esp+0x50] ++ mova m3, [esp+0x40] ++ mova m0, [esp+0x30] ++ mova m2, [esp+0x80] ++ mova m4, [esp+0x70] ++ mova [esp+0x70], m5 ++ mova m5, [esp+0x60] ++ mova [esp+0x80], m6 
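The shift-by-2-and-subtract-corners sequence above amounts to weighting the 3x3 neighbourhood with 4 for the centre and edge-adjacent positions and 3 for the corners, and the "aa*src[x]+256 ... psrad 9" comments give the final combine. A scalar sketch of that combine, with illustrative names for the two coefficient planes produced by calc_ab (the 16-bit per-pixel multipliers and the 32-bit offsets):

    #include <stdint.h>

    /* Sketch of the radius-1 finish step for one pixel: A holds the
     * per-pixel multipliers, B the offsets, both weighted 4 (centre,
     * edge-adjacent) / 3 (corners) over the 3x3 neighbourhood. */
    static int16_t sgr_finish1_px(const int16_t A[3][3], const int32_t B[3][3],
                                  uint8_t src)
    {
        int32_t a = 0, b = 0;
        for (int dy = 0; dy < 3; dy++)
            for (int dx = 0; dx < 3; dx++) {
                const int wt = (dy != 1 && dx != 1) ? 3 : 4;
                a += wt * A[dy][dx];
                b += wt * B[dy][dx];
            }
        return (int16_t)((a * src + b + (1 << 8)) >> 9);   /* "aa*src[x]+256", >> 9 */
    }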
++ mova [esp+0x60], m7 ++ psubd m1, [aq-(384+16)*4*2] ; a:ctr+bottom [first half] ++ psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] ++%endif ++ ++ add aq, (384+16)*4 ++ add bq, (384+16)*2 ++ add tq, 384*2 ++ add srcq, strideq ++ dec yd ++ jg .loop_y ++ add xd, 8 ++ cmp xd, wd ++ jl .loop_x ++ RET ++ ++cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt ++ movifnidn hd, hm ++%if ARCH_X86_32 ++ SETUP_PIC r6, 0 ++%endif ++ movd m0, wtm ++ pshufb m0, [PIC_sym(pb_0_1)] ++ psllw m0, 4 ++ pxor m7, m7 ++ DEFINE_ARGS dst, stride, t, w, h, idx ++.loop_y: ++ xor idxd, idxd ++.loop_x: ++ mova m1, [tq+idxq*2+ 0] ++ mova m4, [tq+idxq*2+16] ++ mova m5, [dstq+idxq] ++ punpcklbw m2, m5, m7 ++ punpckhbw m5, m7 ++ psllw m3, m2, 4 ++ psllw m6, m5, 4 ++ psubw m1, m3 ++ psubw m4, m6 ++ pmulhrsw m1, m0 ++ pmulhrsw m4, m0 ++ paddw m1, m2 ++ paddw m4, m5 ++ packuswb m1, m4 ++ mova [dstq+idxq], m1 ++ add idxd, 16 ++ cmp idxd, wd ++ jl .loop_x ++ add dstq, strideq ++ add tq, 384 * 2 ++ dec hd ++ jg .loop_y ++ RET ++ ++%if ARCH_X86_64 ++cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim ++ mov edged, edgem ++ movifnidn wd, wm ++ mov hd, hm ++ mova m10, [pb_0] ++ mova m11, [pb_0_1] ++%else ++cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge ++ %define edged edgemp ++ %define wd xd ++ %define wq wd ++ %define wm r5m ++ %define strideq r4m ++ SUB esp, 8 ++ SETUP_PIC sumsqd, 1, 1 ++ ++ %define m10 [PIC_sym(pb_0)] ++ %define m11 [PIC_sym(pb_0_1)] ++%endif ++ ++ test edged, 2 ; have_right ++ jz .no_right ++ xor xlimd, xlimd ++ add wd, 2 ++ add wd, 15 ++ and wd, ~15 ++ jmp .right_done ++.no_right: ++ mov xlimd, 3 ++ dec wd ++.right_done: ++ pxor m1, m1 ++ lea srcq, [srcq+wq+1] ++ lea sumq, [sumq+wq*2-2] ++ lea sumsqq, [sumsqq+wq*4-4] ++ neg wq ++%if ARCH_X86_64 ++ lea r10, [pb_right_ext_mask+16] ++%else ++ mov wm, xd ++ %define wq wm ++%endif ++ ++.loop_y: ++ mov xq, wq ++ ; load left ++ test edged, 1 ; have_left ++ jz .no_left ++ test leftq, leftq ++ jz .load_left_from_main ++ movd m0, [leftq] ++ movd m2, [srcq+xq-1] ++ pslldq m2, 4 ++ por m0, m2 ++ pslldq m0, 11 ++ add leftq, 4 ++ jmp .expand_x ++.no_left: ++ movd m0, [srcq+xq-1] ++ XCHG_PIC_REG ++ pshufb m0, m10 ++ XCHG_PIC_REG ++ jmp .expand_x ++.load_left_from_main: ++ movd m0, [srcq+xq-4] ++ pslldq m0, 12 ++.expand_x: ++ punpckhbw m0, m1 ++ ++ ; when we reach this, m0 contains left two px in highest words ++ cmp xq, -8 ++ jle .loop_x ++ test xq, xq ++ jge .right_extend ++.partial_load_and_extend: ++ XCHG_PIC_REG ++ movd m3, [srcq-1] ++ movq m2, [srcq+xq] ++ pshufb m3, m10 ++ punpcklbw m3, m1 ++ punpcklbw m2, m1 ++%if ARCH_X86_64 ++ movu m4, [r10+xq*2] ++%else ++ movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] ++ XCHG_PIC_REG ++%endif ++ pand m2, m4 ++ pandn m4, m3 ++ por m2, m4 ++ jmp .loop_x_noload ++.right_extend: ++ psrldq m2, m0, 14 ++ XCHG_PIC_REG ++ pshufb m2, m11 ++ XCHG_PIC_REG ++ jmp .loop_x_noload ++ ++.loop_x: ++ movq m2, [srcq+xq] ++ punpcklbw m2, m1 ++.loop_x_noload: ++ palignr m3, m2, m0, 8 ++ palignr m4, m2, m0, 10 ++ palignr m5, m2, m0, 12 ++ palignr m6, m2, m0, 14 ++ ++%if ARCH_X86_64 ++ paddw m0, m3, m2 ++ punpcklwd m7, m3, m2 ++ punpckhwd m3, m2 ++ paddw m0, m4 ++ punpcklwd m8, m4, m5 ++ punpckhwd m4, m5 ++ paddw m0, m5 ++ punpcklwd m9, m6, m1 ++ punpckhwd m5, m6, m1 ++ paddw m0, m6 ++ pmaddwd m7, m7 ++ pmaddwd m3, m3 ++ pmaddwd m8, m8 ++ pmaddwd m4, m4 ++ pmaddwd m9, m9 ++ pmaddwd m5, m5 ++ paddd m7, m8 ++ paddd m3, m4 ++ paddd m7, m9 ++ paddd m3, m5 ++ movu [sumq+xq*2], m0 ++ movu 
[sumsqq+xq*4+ 0], m7 ++ movu [sumsqq+xq*4+16], m3 ++%else ++ paddw m0, m3, m2 ++ paddw m0, m4 ++ paddw m0, m5 ++ paddw m0, m6 ++ movu [sumq+xq*2], m0 ++ punpcklwd m7, m3, m2 ++ punpckhwd m3, m2 ++ punpcklwd m0, m4, m5 ++ punpckhwd m4, m5 ++ punpckhwd m5, m6, m1 ++ pmaddwd m7, m7 ++ pmaddwd m3, m3 ++ pmaddwd m0, m0 ++ pmaddwd m4, m4 ++ pmaddwd m5, m5 ++ paddd m7, m0 ++ paddd m3, m4 ++ paddd m3, m5 ++ punpcklwd m0, m6, m1 ++ pmaddwd m0, m0 ++ paddd m7, m0 ++ movu [sumsqq+xq*4+ 0], m7 ++ movu [sumsqq+xq*4+16], m3 ++%endif ++ ++ mova m0, m2 ++ add xq, 8 ++ ++ ; if x <= -8 we can reload more pixels ++ ; else if x < 0 we reload and extend (this implies have_right=0) ++ ; else if x < xlimd we extend from previous load (this implies have_right=0) ++ ; else we are done ++ ++ cmp xq, -8 ++ jle .loop_x ++ test xq, xq ++ jl .partial_load_and_extend ++ cmp xq, xlimq ++ jl .right_extend ++ ++ add sumsqq, (384+16)*4 ++ add sumq, (384+16)*2 ++ add srcq, strideq ++ dec hd ++ jg .loop_y ++%if ARCH_X86_32 ++ ADD esp, 8 ++%endif ++ RET ++ ++%if ARCH_X86_64 ++cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim ++ movifnidn edged, edgem ++ mov ylimd, edged ++%else ++cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr ++ %define wm [esp+0] ++ %define hm [esp+4] ++ %define edgem [esp+8] ++ mov wm, xd ++ mov hm, yd ++ mov edgem, ylimd ++%endif ++ ++ and ylimd, 8 ; have_bottom ++ shr ylimd, 2 ++ sub ylimd, 3 ; -3 if have_bottom=0, else -1 ++ mov xq, -2 ++%if ARCH_X86_64 ++.loop_x: ++ lea yd, [hd+ylimd+2] ++ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] ++ lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] ++ test edged, 4 ; have_top ++ jnz .load_top ++ movu m0, [sumsq_ptrq+(384+16)*4*1] ++ movu m1, [sumsq_ptrq+(384+16)*4*1+16] ++ mova m2, m0 ++ mova m3, m1 ++ mova m4, m0 ++ mova m5, m1 ++ mova m6, m0 ++ mova m7, m1 ++ movu m10, [sum_ptrq+(384+16)*2*1] ++ mova m11, m10 ++ mova m12, m10 ++ mova m13, m10 ++ jmp .loop_y_second_load ++.load_top: ++ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] ++ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] ++ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] ++ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] ++ mova m2, m0 ++ mova m3, m1 ++ movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 ++ movu m12, [sum_ptrq-(384+16)*2*0] ; l2 ++ mova m11, m10 ++.loop_y: ++ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] ++ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] ++ movu m13, [sum_ptrq+(384+16)*2*1] ; l1 ++.loop_y_second_load: ++ test yd, yd ++ jle .emulate_second_load ++ movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] ++ movu m9, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] ++ movu m14, [sum_ptrq+(384+16)*2*2] ; l0 ++.loop_y_noload: ++ paddd m0, m2 ++ paddd m1, m3 ++ paddw m10, m11 ++ paddd m0, m4 ++ paddd m1, m5 ++ paddw m10, m12 ++ paddd m0, m6 ++ paddd m1, m7 ++ paddw m10, m13 ++ paddd m0, m8 ++ paddd m1, m9 ++ paddw m10, m14 ++ movu [sumsq_ptrq+ 0], m0 ++ movu [sumsq_ptrq+16], m1 ++ movu [sum_ptrq], m10 ++ ++ ; shift position down by one ++ mova m0, m4 ++ mova m1, m5 ++ mova m2, m6 ++ mova m3, m7 ++ mova m4, m8 ++ mova m5, m9 ++ mova m10, m12 ++ mova m11, m13 ++ mova m12, m14 ++ add sumsq_ptrq, (384+16)*4*2 ++ add sum_ptrq, (384+16)*2*2 ++ sub yd, 2 ++ jge .loop_y ++ ; l1 = l0 ++ mova m6, m8 ++ mova m7, m9 ++ mova m13, m14 ++ cmp yd, ylimd ++ jg .loop_y_noload ++ add xd, 8 ++ cmp xd, wd ++ jl .loop_x ++ RET ++.emulate_second_load: ++ mova m8, m6 ++ mova m9, m7 ++ mova m14, m13 ++ jmp .loop_y_noload ++%else ++.sumsq_loop_x: ++ 
lea yd, [ylimd+2] ++ add yd, hm ++ lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] ++ test dword edgem, 4 ; have_top ++ jnz .sumsq_load_top ++ movu m0, [sumsq_ptrq+(384+16)*4*1] ++ movu m1, [sumsq_ptrq+(384+16)*4*1+16] ++ mova m4, m0 ++ mova m5, m1 ++ mova m6, m0 ++ mova m7, m1 ++ mova [esp+0x1c], m0 ++ mova [esp+0x0c], m1 ++ jmp .sumsq_loop_y_second_load ++.sumsq_load_top: ++ movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] ++ movu m1, [sumsq_ptrq-(384+16)*4*1+16] ; l3/4sq [right] ++ movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] ++ movu m5, [sumsq_ptrq-(384+16)*4*0+16] ; l2sq [right] ++ mova [esp+0x1c], m0 ++ mova [esp+0x0c], m1 ++.sumsq_loop_y: ++ movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] ++ movu m7, [sumsq_ptrq+(384+16)*4*1+16] ; l1sq [right] ++.sumsq_loop_y_second_load: ++ test yd, yd ++ jle .sumsq_emulate_second_load ++ movu m2, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] ++ movu m3, [sumsq_ptrq+(384+16)*4*2+16] ; l0sq [right] ++.sumsq_loop_y_noload: ++ paddd m0, [esp+0x1c] ++ paddd m1, [esp+0x0c] ++ paddd m0, m4 ++ paddd m1, m5 ++ paddd m0, m6 ++ paddd m1, m7 ++ paddd m0, m2 ++ paddd m1, m3 ++ movu [sumsq_ptrq+ 0], m0 ++ movu [sumsq_ptrq+16], m1 ++ ++ ; shift position down by one ++ mova m0, m4 ++ mova m1, m5 ++ mova m4, m2 ++ mova m5, m3 ++ mova [esp+0x1c], m6 ++ mova [esp+0x0c], m7 ++ add sumsq_ptrq, (384+16)*4*2 ++ sub yd, 2 ++ jge .sumsq_loop_y ++ ; l1 = l0 ++ mova m6, m2 ++ mova m7, m3 ++ cmp yd, ylimd ++ jg .sumsq_loop_y_noload ++ add xd, 8 ++ cmp xd, wm ++ jl .sumsq_loop_x ++ ++ mov xd, -2 ++.sum_loop_x: ++ lea yd, [ylimd+2] ++ add yd, hm ++ lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] ++ test dword edgem, 4 ; have_top ++ jnz .sum_load_top ++ movu m0, [sum_ptrq+(384+16)*2*1] ++ mova m1, m0 ++ mova m2, m0 ++ mova m3, m0 ++ jmp .sum_loop_y_second_load ++.sum_load_top: ++ movu m0, [sum_ptrq-(384+16)*2*1] ; l3/4 ++ movu m2, [sum_ptrq-(384+16)*2*0] ; l2 ++ mova m1, m0 ++.sum_loop_y: ++ movu m3, [sum_ptrq+(384+16)*2*1] ; l1 ++.sum_loop_y_second_load: ++ test yd, yd ++ jle .sum_emulate_second_load ++ movu m4, [sum_ptrq+(384+16)*2*2] ; l0 ++.sum_loop_y_noload: ++ paddw m0, m1 ++ paddw m0, m2 ++ paddw m0, m3 ++ paddw m0, m4 ++ movu [sum_ptrq], m0 ++ ++ ; shift position down by one ++ mova m0, m2 ++ mova m1, m3 ++ mova m2, m4 ++ add sum_ptrq, (384+16)*2*2 ++ sub yd, 2 ++ jge .sum_loop_y ++ ; l1 = l0 ++ mova m3, m4 ++ cmp yd, ylimd ++ jg .sum_loop_y_noload ++ add xd, 8 ++ cmp xd, wm ++ jl .sum_loop_x ++ RET ++.sumsq_emulate_second_load: ++ mova m2, m6 ++ mova m3, m7 ++ jmp .sumsq_loop_y_noload ++.sum_emulate_second_load: ++ mova m4, m3 ++ jmp .sum_loop_y_noload ++%endif ++ ++cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s ++ movifnidn sd, sm ++ sub aq, (384+16-1)*4 ++ sub bq, (384+16-1)*2 ++ add hd, 2 ++%if ARCH_X86_64 ++ LEA r5, sgr_x_by_x-0xF03 ++%else ++ SETUP_PIC r5, 0 ++%endif ++ movd m6, sd ++ pshuflw m6, m6, q0000 ++ punpcklqdq m6, m6 ++ pxor m7, m7 ++ DEFINE_ARGS a, b, w, h, x ++%if ARCH_X86_64 ++ mova m8, [pd_0xF0080029] ++ mova m9, [pw_256] ++ psrld m10, m9, 15 ; pd_512 ++%else ++ %define m8 [PIC_sym(pd_0xF0080029)] ++ %define m9 [PIC_sym(pw_256)] ++ %define m10 [PIC_sym(pd_512)] ++%endif ++.loop_y: ++ mov xq, -2 ++.loop_x: ++ movq m0, [bq+xq*2+0] ++ movq m1, [bq+xq*2+8] ++ punpcklwd m0, m7 ++ punpcklwd m1, m7 ++ movu m2, [aq+xq*4+ 0] ++ movu m3, [aq+xq*4+16] ++ pslld m4, m2, 3 ; aa * 8 ++ pslld m5, m3, 3 ++ paddd m2, m4 ; aa * 9 ++ paddd m3, m5 ++ paddd m4, m4 ; aa * 16 ++ paddd m5, m5 ++ paddd m2, m4 ; aa * 25 ++ paddd m3, m5 ++ pmaddwd m4, m0, m0 ++ pmaddwd m5, m1, m1 ++ psubd 
m2, m4 ; p = aa * 25 - bb * bb ++ psubd m3, m5 ++ MULLD m2, m6 ++ MULLD m3, m6 ++ paddusw m2, m8 ++ paddusw m3, m8 ++ psrld m2, 20 ; z ++ psrld m3, 20 ++ GATHERDD m4, m2 ; xx ++ GATHERDD m2, m3 ++ psrld m4, 24 ++ psrld m2, 24 ++ packssdw m3, m4, m2 ++ pmullw m4, m8 ++ pmullw m2, m8 ++ psubw m5, m9, m3 ++ pmaddwd m0, m4 ++ pmaddwd m1, m2 ++ paddd m0, m10 ++ paddd m1, m10 ++ psrld m0, 10 ++ psrld m1, 10 ++ movu [bq+xq*2], m5 ++ movu [aq+xq*4+ 0], m0 ++ movu [aq+xq*4+16], m1 ++ add xd, 8 ++ cmp xd, wd ++ jl .loop_x ++ add aq, (384+16)*4*2 ++ add bq, (384+16)*2*2 ++ sub hd, 2 ++ jg .loop_y ++ RET ++ ++%if ARCH_X86_64 ++cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ ++ tmp_base, src_base, a_base, b_base, x, y ++ movifnidn wd, wm ++ mov hd, hm ++ mov tmp_baseq, tq ++ mov src_baseq, srcq ++ mov a_baseq, aq ++ mov b_baseq, bq ++ mova m9, [pw_5_6] ++ mova m12, [pw_256] ++ psrlw m10, m12, 8 ; pw_1 ++ psrlw m11, m12, 1 ; pw_128 ++ pxor m13, m13 ++%else ++cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y ++ %define tmp_baseq r0m ++ %define src_baseq r1m ++ %define a_baseq r3m ++ %define b_baseq r4m ++ %define wd r5m ++ %define hd r6m ++ ++ SUB esp, 8 ++ SETUP_PIC yd ++ ++ %define m8 m5 ++ %define m9 [PIC_sym(pw_5_6)] ++ %define m10 [PIC_sym(pw_1)] ++ %define m11 [PIC_sym(pw_128)] ++ %define m12 [PIC_sym(pw_256)] ++ %define m13 m0 ++%endif ++ xor xd, xd ++.loop_x: ++ mov tq, tmp_baseq ++ mov srcq, src_baseq ++ mov aq, a_baseq ++ mov bq, b_baseq ++ movu m0, [aq+xq*4-(384+16)*4-4] ++ mova m1, [aq+xq*4-(384+16)*4] ++ movu m2, [aq+xq*4-(384+16)*4+4] ++ movu m3, [aq+xq*4-(384+16)*4-4+16] ++ mova m4, [aq+xq*4-(384+16)*4+16] ++ movu m5, [aq+xq*4-(384+16)*4+4+16] ++ paddd m0, m2 ++ paddd m3, m5 ++ paddd m0, m1 ++ paddd m3, m4 ++ pslld m2, m0, 2 ++ pslld m5, m3, 2 ++ paddd m2, m0 ++ paddd m5, m3 ++ paddd m0, m2, m1 ; prev_odd_b [first half] ++ paddd m1, m5, m4 ; prev_odd_b [second half] ++ movu m3, [bq+xq*2-(384+16)*2-2] ++ mova m4, [bq+xq*2-(384+16)*2] ++ movu m5, [bq+xq*2-(384+16)*2+2] ++ paddw m3, m5 ++ punpcklwd m5, m3, m4 ++ punpckhwd m3, m4 ++ pmaddwd m5, m9 ++ pmaddwd m3, m9 ++ mova m2, m5 ++ packssdw m2, m3 ; prev_odd_a ++ lea tq, [tq+xq*2] ++ lea srcq, [srcq+xq*1] ++ lea aq, [aq+xq*4+(384+16)*4] ++ lea bq, [bq+xq*2+(384+16)*2] ++%if ARCH_X86_32 ++ mov [esp], PIC_reg ++%endif ++ mov yd, hd ++ XCHG_PIC_REG ++.loop_y: ++ movu m3, [aq-4] ++ mova m4, [aq] ++ movu m5, [aq+4] ++ paddd m3, m5 ++ paddd m3, m4 ++ pslld m5, m3, 2 ++ paddd m5, m3 ++ paddd m5, m4 ; cur_odd_b [first half] ++ movu m3, [aq+16-4] ++ mova m6, [aq+16] ++ movu m7, [aq+16+4] ++ paddd m3, m7 ++ paddd m3, m6 ++ pslld m7, m3, 2 ++ paddd m7, m3 ++ paddd m4, m7, m6 ; cur_odd_b [second half] ++ movu m3, [bq-2] ++ mova m6, [bq] ++ movu m7, [bq+2] ++ paddw m3, m7 ++ punpcklwd m7, m3, m6 ++ punpckhwd m3, m6 ++ pmaddwd m7, m9 ++ pmaddwd m3, m9 ++ packssdw m6, m7, m3 ; cur_odd_a ++ ++ paddd m0, m5 ; cur_even_b [first half] ++ paddd m1, m4 ; cur_even_b [second half] ++ paddw m2, m6 ; cur_even_a ++ ++ movq m3, [srcq] ++%if ARCH_X86_64 ++ punpcklbw m3, m13 ++%else ++ mova [td], m5 ++ pxor m7, m7 ++ punpcklbw m3, m7 ++%endif ++ punpcklwd m7, m3, m10 ++ punpckhwd m3, m10 ++ punpcklwd m8, m2, m12 ++ punpckhwd m2, m12 ++ pmaddwd m7, m8 ++ pmaddwd m3, m2 ++ paddd m7, m0 ++ paddd m3, m1 ++ psrad m7, 9 ++ psrad m3, 9 ++ ++%if ARCH_X86_32 ++ pxor m13, m13 ++%endif ++ movq m8, [srcq+strideq] ++ punpcklbw m8, m13 ++ punpcklwd m0, m8, m10 ++ punpckhwd m8, m10 ++ punpcklwd m1, m6, m11 ++ punpckhwd m2, m6, m11 ++ pmaddwd m0, m1 ++ 
pmaddwd m8, m2 ++%if ARCH_X86_64 ++ paddd m0, m5 ++%else ++ paddd m0, [td] ++%endif ++ paddd m8, m4 ++ psrad m0, 8 ++ psrad m8, 8 ++ ++ packssdw m7, m3 ++ packssdw m0, m8 ++%if ARCH_X86_32 ++ mova m5, [td] ++%endif ++ mova [tq+384*2*0], m7 ++ mova [tq+384*2*1], m0 ++ ++ mova m0, m5 ++ mova m1, m4 ++ mova m2, m6 ++ add aq, (384+16)*4*2 ++ add bq, (384+16)*2*2 ++ add tq, 384*2*2 ++ lea srcq, [srcq+strideq*2] ++%if ARCH_X86_64 ++ sub yd, 2 ++%else ++ sub dword [esp+4], 2 ++%endif ++ jg .loop_y ++ add xd, 8 ++ cmp xd, wd ++ jl .loop_x ++%if ARCH_X86_32 ++ ADD esp, 8 ++%endif ++ RET ++ ++cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt ++ movifnidn wd, wm ++ mov wtq, wtmp ++%if ARCH_X86_64 ++ movifnidn hd, hm ++ mova m10, [pd_1024] ++ pxor m11, m11 ++%else ++ SETUP_PIC hd, 0 ++ %define m10 [PIC_sym(pd_1024)] ++ %define m11 m7 ++%endif ++ movd m0, [wtq] ++ pshufd m0, m0, 0 ++ DEFINE_ARGS dst, stride, t1, t2, w, h, idx ++%if ARCH_X86_32 ++ %define hd hmp ++%endif ++ ++.loop_y: ++ xor idxd, idxd ++.loop_x: ++ mova m1, [t1q+idxq*2+ 0] ++ mova m2, [t1q+idxq*2+16] ++ mova m3, [t2q+idxq*2+ 0] ++ mova m4, [t2q+idxq*2+16] ++ mova m6, [dstq+idxq] ++%if ARCH_X86_32 ++ pxor m11, m11 ++%endif ++ punpcklbw m5, m6, m11 ++ punpckhbw m6, m11 ++ psllw m7, m5, 4 ++ psubw m1, m7 ++ psubw m3, m7 ++ psllw m7, m6, 4 ++ psubw m2, m7 ++ psubw m4, m7 ++ punpcklwd m7, m1, m3 ++ punpckhwd m1, m3 ++ punpcklwd m3, m2, m4 ++ punpckhwd m2, m4 ++ pmaddwd m7, m0 ++ pmaddwd m1, m0 ++ pmaddwd m3, m0 ++ pmaddwd m2, m0 ++ paddd m7, m10 ++ paddd m1, m10 ++ paddd m3, m10 ++ paddd m2, m10 ++ psrad m7, 11 ++ psrad m1, 11 ++ psrad m3, 11 ++ psrad m2, 11 ++ packssdw m7, m1 ++ packssdw m3, m2 ++ paddw m7, m5 ++ paddw m3, m6 ++ packuswb m7, m3 ++ mova [dstq+idxq], m7 ++ add idxd, 16 ++ cmp idxd, wd ++ jl .loop_x ++ add dstq, strideq ++ add t1q, 384 * 2 ++ add t2q, 384 * 2 ++ dec hd ++ jg .loop_y ++ RET +diff --git third_party/dav1d/src/x86/mc.asm third_party/dav1d/src/x86/mc.asm +index d609d2e3be15..26130eeac993 100644 +--- third_party/dav1d/src/x86/mc.asm ++++ third_party/dav1d/src/x86/mc.asm +@@ -62,13 +62,12 @@ deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 + blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 + + pb_64: times 4 db 64 +-pw_8: times 2 dw 8 +-pw_26: times 2 dw 26 + pw_34: times 2 dw 34 + pw_258: times 2 dw 258 + pw_512: times 2 dw 512 + pw_1024: times 2 dw 1024 + pw_2048: times 2 dw 2048 ++pw_6903: times 2 dw 6903 + pw_8192: times 2 dw 8192 + pd_32: dd 32 + pd_512: dd 512 +@@ -92,6 +91,8 @@ BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 + BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 + BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 + BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 ++BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 + BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 + BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 + BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 +@@ -265,7 +266,6 @@ INIT_YMM avx2 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 0xff01 + vbroadcasti128 m4, [bilin_h_shuf8] +- WIN64_SPILL_XMM 7 + add mxyd, 16 << 8 + movd xm5, mxyd + mov mxyd, r7m ; my +@@ -273,7 +273,7 @@ INIT_YMM avx2 + test mxyd, mxyd + jnz .hv + movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)] +- vpbroadcastd m6, [pw_2048] ++ vpbroadcastd m3, [pw_2048] + add wq, t2 + jmp wq + .h_w2: +@@ -282,7 +282,7 @@ INIT_YMM avx2 + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +- 
pmulhrsw xm0, xm6 ++ pmulhrsw xm0, xm3 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 2 +@@ -298,7 +298,7 @@ INIT_YMM avx2 + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +- pmulhrsw xm0, xm6 ++ pmulhrsw xm0, xm3 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 +@@ -314,8 +314,8 @@ INIT_YMM avx2 + pshufb xm1, xm4 + pmaddubsw xm0, xm5 + pmaddubsw xm1, xm5 +- pmulhrsw xm0, xm6 +- pmulhrsw xm1, xm6 ++ pmulhrsw xm0, xm3 ++ pmulhrsw xm1, xm3 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 +@@ -333,8 +333,8 @@ INIT_YMM avx2 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 +@@ -350,8 +350,8 @@ INIT_YMM avx2 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq +@@ -361,25 +361,25 @@ INIT_YMM avx2 + .h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] +- movu m2, [srcq+8*4] +- movu m3, [srcq+8*5] +- add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 +- pshufb m2, m4 +- pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmaddubsw m2, m5 +- pmaddubsw m3, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 +- pmulhrsw m2, m6 +- pmulhrsw m3, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 +- packuswb m2, m3 ++ movu m1, [srcq+8*4] ++ movu m2, [srcq+8*5] ++ add srcq, ssq ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ pmulhrsw m1, m3 ++ pmulhrsw m2, m3 ++ packuswb m1, m2 + mova [dstq+32*0], m0 +- mova [dstq+32*1], m2 ++ mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w64 +@@ -393,8 +393,8 @@ INIT_YMM avx2 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+t1+32*3], m0 + add t1, 32 +@@ -406,14 +406,12 @@ INIT_YMM avx2 + RET + .v: + movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)] +- %assign stack_offset stack_offset - stack_size_padded +- WIN64_SPILL_XMM 8 + imul mxyd, 0xff01 +- vpbroadcastd m7, [pw_2048] ++ vpbroadcastd m5, [pw_2048] + add mxyd, 16 << 8 + add wq, t2 +- movd xm6, mxyd +- vpbroadcastw m6, xm6 ++ movd xm4, mxyd ++ vpbroadcastw m4, xm4 + jmp wq + .v_w2: + movd xm0, [srcq+ssq*0] +@@ -423,8 +421,8 @@ INIT_YMM avx2 + pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xm1, xm1, q2301 ; 1 0 + punpcklbw xm1, xm0, xm1 +- pmaddubsw xm1, xm6 +- pmulhrsw xm1, xm7 ++ pmaddubsw xm1, xm4 ++ pmulhrsw xm1, xm5 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 1 + pextrw [dstq+dsq*1], xm1, 0 +@@ -441,8 +439,8 @@ INIT_YMM avx2 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm1, xm1, xm0, 0x02 ; 1 2 + punpcklbw xm1, xm2 +- pmaddubsw xm1, xm6 +- pmulhrsw xm1, xm7 ++ pmaddubsw xm1, xm4 ++ pmulhrsw xm1, xm5 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 +@@ -453,20 +451,18 @@ INIT_YMM avx2 + .v_w8: + movq xm0, [srcq+ssq*0] + .v_w8_loop: +- vpbroadcastq xm1, [srcq+ssq*1] ++ movq xm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +- vpblendd xm2, xm1, xm0, 0x03 ; 0 1 +- vpbroadcastq xm0, [srcq+ssq*0] +- vpblendd xm1, xm1, xm0, 0x0c ; 1 2 +- punpcklbw xm3, xm1, xm2 +- punpckhbw xm1, xm2 +- pmaddubsw xm3, xm6 +- pmaddubsw xm1, xm6 +- pmulhrsw xm3, xm7 +- pmulhrsw xm1, xm7 +- packuswb xm3, xm1 +- movq [dstq+dsq*0], xm3 +- movhps [dstq+dsq*1], xm3 ++ punpcklbw xm1, xm3, xm0 ++ movq 
xm0, [srcq+ssq*0] ++ punpcklbw xm2, xm0, xm3 ++ pmaddubsw xm1, xm4 ++ pmaddubsw xm2, xm4 ++ pmulhrsw xm1, xm5 ++ pmulhrsw xm2, xm5 ++ packuswb xm1, xm2 ++ movq [dstq+dsq*0], xm1 ++ movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +@@ -481,10 +477,10 @@ INIT_YMM avx2 + vpblendd m2, m2, m0, 0xf0 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 +- pmaddubsw m1, m6 +- pmaddubsw m2, m6 +- pmulhrsw m1, m7 +- pmulhrsw m2, m7 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 +@@ -496,25 +492,25 @@ INIT_YMM avx2 + %macro PUT_BILIN_V_W32 0 + movu m0, [srcq+ssq*0] + %%loop: +- movu m4, [srcq+ssq*1] ++ movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +- punpcklbw m1, m4, m0 +- punpckhbw m3, m4, m0 ++ punpcklbw m1, m3, m0 ++ punpckhbw m2, m3, m0 + movu m0, [srcq+ssq*0] +- punpcklbw m2, m0, m4 +- punpckhbw m4, m0, m4 +- pmaddubsw m1, m6 +- pmaddubsw m3, m6 +- pmaddubsw m2, m6 +- pmaddubsw m4, m6 +- pmulhrsw m1, m7 +- pmulhrsw m3, m7 +- pmulhrsw m2, m7 +- pmulhrsw m4, m7 +- packuswb m1, m3 +- packuswb m2, m4 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 ++ packuswb m1, m2 + mova [dstq+dsq*0], m1 +- mova [dstq+dsq*1], m2 ++ punpcklbw m1, m0, m3 ++ punpckhbw m2, m0, m3 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 ++ packuswb m1, m2 ++ mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +@@ -527,25 +523,25 @@ INIT_YMM avx2 + .v_w64_loop: + add srcq, ssq + movu m3, [srcq+32*0] +- movu m4, [srcq+32*1] + punpcklbw m2, m3, m0 +- punpckhbw m5, m3, m0 +- pmaddubsw m2, m6 +- pmaddubsw m5, m6 ++ punpckhbw m0, m3, m0 ++ pmaddubsw m2, m4 ++ pmaddubsw m0, m4 ++ pmulhrsw m2, m5 ++ pmulhrsw m0, m5 ++ packuswb m2, m0 + mova m0, m3 +- pmulhrsw m2, m7 +- pmulhrsw m5, m7 +- packuswb m2, m5 +- punpcklbw m3, m4, m1 +- punpckhbw m5, m4, m1 +- pmaddubsw m3, m6 +- pmaddubsw m5, m6 +- mova m1, m4 +- pmulhrsw m3, m7 +- pmulhrsw m5, m7 +- packuswb m3, m5 ++ movu m3, [srcq+32*1] + mova [dstq+32*0], m2 +- mova [dstq+32*1], m3 ++ punpcklbw m2, m3, m1 ++ punpckhbw m1, m3, m1 ++ pmaddubsw m2, m4 ++ pmaddubsw m1, m4 ++ pmulhrsw m2, m5 ++ pmulhrsw m1, m5 ++ packuswb m2, m1 ++ mova m1, m3 ++ mova [dstq+32*1], m2 + add dstq, dsq + dec hd + jg .v_w64_loop +@@ -568,7 +564,6 @@ INIT_YMM avx2 + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)] +- %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_2048] +@@ -684,7 +679,14 @@ INIT_YMM avx2 + jg .hv_w16_loop + RET + .hv_w32: +-%macro PUT_BILIN_HV_W32 0 ++ xor t2d, t2d ++.hv_w32gt: ++ mov t0, dstq ++ mov t1, srcq ++%if WIN64 ++ movaps r4m, xmm8 ++%endif ++.hv_w32_loop0: + movu m0, [srcq+8*0] + vinserti128 m0, m0, [srcq+8*2], 1 + movu m1, [srcq+8*1] +@@ -693,10 +695,7 @@ INIT_YMM avx2 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +-%if WIN64 +- movaps r4m, xmm8 +-%endif +-%%loop: ++.hv_w32_loop: + add srcq, ssq + movu xm2, [srcq+8*1] + vinserti128 m2, m2, [srcq+8*3], 1 +@@ -722,41 +721,24 @@ INIT_YMM avx2 + mova [dstq], m3 + add dstq, dsq + dec hd +- jg %%loop +-%if WIN64 +- movaps xmm8, r4m +-%endif +-%endmacro +- PUT_BILIN_HV_W32 +- RET +-.hv_w64: +- mov t0, dstq +- mov t1, srcq +- lea t2d, [hq+(1<<8)] +-.hv_w64_loop: +- 
PUT_BILIN_HV_W32 +- mov hb, t2b ++ jg .hv_w32_loop ++ movzx hd, t2b + add t0, 32 + add t1, 32 + mov dstq, t0 + mov srcq, t1 + sub t2d, 1<<8 +- jg .hv_w64_loop ++ jg .hv_w32_loop0 ++%if WIN64 ++ movaps xmm8, r4m ++%endif + RET ++.hv_w64: ++ lea t2d, [hq+(1<<8)] ++ jmp .hv_w32gt + .hv_w128: +- mov t0, dstq +- mov t1, srcq + lea t2d, [hq+(3<<8)] +-.hv_w128_loop: +- PUT_BILIN_HV_W32 +- mov hb, t2b +- add t0, 32 +- add t1, 32 +- mov dstq, t0 +- mov srcq, t1 +- sub t2d, 1<<8 +- jg .hv_w128_loop +- RET ++ jmp .hv_w32gt + + DECLARE_REG_TMP 3, 5, 6 + cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 +@@ -2997,13 +2979,12 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + %macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 +- ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 +- mova m0, [tmp2q+(%1+0)*mmsize] +- psubw m2, m0, [tmp1q+(%1+0)*mmsize] +- mova m1, [tmp2q+(%1+1)*mmsize] +- psubw m3, m1, [tmp1q+(%1+1)*mmsize] +- paddw m2, m2 ; compensate for the weight only being half +- paddw m3, m3 ; of what it should be ++ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ++ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 ++ mova m0, [tmp1q+(%1+0)*mmsize] ++ psubw m2, m0, [tmp2q+(%1+0)*mmsize] ++ mova m1, [tmp1q+(%1+1)*mmsize] ++ psubw m3, m1, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 +@@ -3019,13 +3000,19 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + lea r6, [w_avg_avx2_table] + tzcnt wd, wm + movifnidn hd, hm +- vpbroadcastw m0, r6m ; weight ++ vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] +- pxor m4, m4 +- psllw m0, 11 ; can't shift by 12, sign bit must be preserved +- psubw m4, m0 + vpbroadcastd m5, [pw_2048+r6-w_avg_avx2_table] ++ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 ++ cmp dword r6m, 7 ++ jg .weight_gt7 ++ mov r6, tmp1q ++ pxor m0, m0 ++ mov tmp1q, tmp2q ++ psubw m4, m0, m4 ; -weight ++ mov tmp2q, r6 ++.weight_gt7: + BIDIR_FN W_AVG + + %macro MASK 1 ; src_offset +@@ -3069,14 +3056,13 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 + add wq, r7 + BIDIR_FN MASK + +-%macro W_MASK_420 2 ; src_offset, mask_out ++%macro W_MASK 2-3 0 ; src_offset, mask_out, 4:4:4 + mova m0, [tmp1q+(%1+0)*mmsize] + mova m1, [tmp2q+(%1+0)*mmsize] + psubw m1, m0 + pabsw m%2, m1 +- paddw m%2, m6 +- psrlw m%2, 8 ; (abs(tmp1 - tmp2) + 8) >> 8 +- psubusw m%2, m7, m%2 ; 64 - min(m, 64) ++ psubusw m%2, m6, m%2 ++ psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m0, m1 +@@ -3084,33 +3070,39 @@ cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 + mova m2, [tmp2q+(%1+1)*mmsize] + psubw m2, m1 + pabsw m3, m2 +- paddw m3, m6 ++ psubusw m3, m6, m3 + psrlw m3, 8 +- psubusw m3, m7, m3 ++%if %3 ++ packuswb m%2, m3 ++ psubb m%2, m5, m%2 ++ vpermq m%2, m%2, q3120 ++%else + phaddw m%2, m3 ++%endif + psllw m3, 10 + pmulhw m2, m3 + paddw m1, m2 +- pmulhrsw m0, m8 +- pmulhrsw m1, m8 ++ pmulhrsw m0, m7 ++ pmulhrsw m1, m7 + packuswb m0, m1 + %endmacro + +-cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp +- vpbroadcastw m0, r7m ; sign ++ movd xm0, r7m ; sign + movsxd wq, dword [r7+wq*4] +- vpbroadcastd m6, [pw_8 +r7-w_mask_420_avx2_table] +- 
vpbroadcastd m7, [pw_26 +r7-w_mask_420_avx2_table] ; 64 - 38 +- vpbroadcastd m8, [pw_2048 +r7-w_mask_420_avx2_table] +- vpbroadcastd m9, [pw_258 +r7-w_mask_420_avx2_table] ; 64 * 4 + 2 +- pmovzxbd m10, [deint_shuf4+r7-w_mask_420_avx2_table] +- psubw m9, m0 ++ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 ++ vpbroadcastd m7, [base+pw_2048] ++ movd xm8, [base+pw_258] ; 64 * 4 + 2 ++ pmovzxbd m9, [base+deint_shuf4] ++ psubw xm8, xm0 + add wq, r7 +- W_MASK_420 0, 4 ++ vpbroadcastw m8, xm8 ++ W_MASK 0, 4 + lea stride3q, [strideq*3] + jmp wq + .w4: +@@ -3119,34 +3111,33 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 +- cmp hd, 4 +- je .w4_end ++ cmp hd, 8 ++ jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +- cmp hd, 8 + jg .w4_h16 + .w4_end: + vextracti128 xm0, m4, 1 + vpblendd xm1, xm4, xm0, 0x05 + vpblendd xm4, xm4, xm0, 0x0a + pshufd xm1, xm1, q2301 +- psubw xm4, xm9, xm4 ++ psubw xm4, xm8, xm4 + psubw xm4, xm1 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [maskq], xm4 + RET + .w4_h16: +- W_MASK_420 2, 5 ++ W_MASK 2, 5 + lea dstq, [dstq+strideq*4] + phaddd m4, m5 + vextracti128 xm1, m0, 1 +- psubw m4, m9, m4 ++ psubw m4, m8, m4 + psrlw m4, 2 +- vpermd m4, m10, m4 ++ vpermd m4, m9, m4 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + movd [dstq ], xm0 +@@ -3163,13 +3154,13 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 + .w8_loop: + add tmp1q, 2*32 + add tmp2q, 2*32 +- W_MASK_420 0, 4 ++ W_MASK 0, 4 + lea dstq, [dstq+strideq*4] + add maskq, 8 + .w8: + vextracti128 xm2, m4, 1 + vextracti128 xm1, m0, 1 +- psubw xm4, xm9, xm4 ++ psubw xm4, xm8, xm4 + psubw xm4, xm2 + psrlw xm4, 2 + packuswb xm4, xm4 +@@ -3184,22 +3175,22 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 + .w16_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 +- W_MASK_420 0, 4 ++ W_MASK 0, 4 + lea dstq, [dstq+strideq*4] + add maskq, 16 + .w16: + vpermq m0, m0, q3120 + mova [dstq ], xm0 + vextracti128 [dstq+strideq*1], m0, 1 +- W_MASK_420 2, 5 ++ W_MASK 2, 5 + punpckhqdq m1, m4, m5 + punpcklqdq m4, m5 +- psubw m1, m9, m1 ++ psubw m1, m8, m1 + psubw m1, m4 + psrlw m1, 2 + vpermq m0, m0, q3120 + packuswb m1, m1 +- vpermd m1, m10, m1 ++ vpermd m1, m9, m1 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], xm1 +@@ -3209,101 +3200,406 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 + .w32_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 +- W_MASK_420 0, 4 ++ W_MASK 0, 4 + lea dstq, [dstq+strideq*2] + add maskq, 16 + .w32: + vpermq m0, m0, q3120 + mova [dstq], m0 +- W_MASK_420 2, 5 +- psubw m4, m9, m4 ++ W_MASK 2, 5 ++ psubw m4, m8, m4 + psubw m4, m5 + psrlw m4, 2 + vpermq m0, m0, q3120 + packuswb m4, m4 +- vpermd m4, m10, m4 ++ vpermd m4, m9, m4 + mova [dstq+strideq*1], m0 + mova [maskq], xm4 + sub hd, 2 + jg .w32_loop + RET + .w64_loop_even: +- psubw m11, m9, m4 +- psubw m12, m9, m5 ++ psubw m10, m8, m4 ++ psubw m11, m8, m5 + dec hd + .w64_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 +- W_MASK_420 0, 4 ++ W_MASK 0, 4 + add dstq, strideq + .w64: + vpermq m0, m0, q3120 + mova [dstq], m0 +- W_MASK_420 2, 5 ++ W_MASK 2, 5 + vpermq m0, m0, q3120 + mova [dstq+32], m0 + test hd, 1 + jz .w64_loop_even +- psubw m4, m11, m4 +- psubw m5, m12, m5 ++ psubw m4, m10, m4 ++ psubw m5, m11, m5 + 
psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 +- vpermd m4, m10, m4 ++ vpermd m4, m9, m4 + mova [maskq], m4 + add maskq, 32 + dec hd + jg .w64_loop + RET + .w128_loop_even: +- psubw m13, m9, m4 +- psubw m14, m9, m5 ++ psubw m12, m8, m4 ++ psubw m13, m8, m5 + dec hd + .w128_loop: +- W_MASK_420 0, 4 ++ W_MASK 0, 4 + add dstq, strideq + .w128: + vpermq m0, m0, q3120 + mova [dstq+0*32], m0 +- W_MASK_420 2, 5 ++ W_MASK 2, 5 + vpermq m0, m0, q3120 + mova [dstq+1*32], m0 + add tmp1q, 8*32 + add tmp2q, 8*32 + test hd, 1 + jz .w128_even +- psubw m4, m11, m4 +- psubw m5, m12, m5 ++ psubw m4, m10, m4 ++ psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 +- vpermd m4, m10, m4 ++ vpermd m4, m9, m4 + mova [maskq], m4 + jmp .w128_odd + .w128_even: +- psubw m11, m9, m4 +- psubw m12, m9, m5 ++ psubw m10, m8, m4 ++ psubw m11, m8, m5 + .w128_odd: +- W_MASK_420 -4, 4 ++ W_MASK -4, 4 + vpermq m0, m0, q3120 + mova [dstq+2*32], m0 +- W_MASK_420 -2, 5 ++ W_MASK -2, 5 + vpermq m0, m0, q3120 + mova [dstq+3*32], m0 + test hd, 1 + jz .w128_loop_even +- psubw m4, m13, m4 +- psubw m5, m14, m5 ++ psubw m4, m12, m4 ++ psubw m5, m13, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 +- vpermd m4, m10, m4 ++ vpermd m4, m9, m4 + mova [maskq+32], m4 + add maskq, 64 + dec hd + jg .w128_loop + RET + ++cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_422_avx2_table ++ lea r7, [w_mask_422_avx2_table] ++ tzcnt wd, wm ++ movifnidn hd, hm ++ mov maskq, maskmp ++ movd xm0, r7m ; sign ++ pxor m9, m9 ++ movsxd wq, dword [r7+wq*4] ++ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 ++ vpbroadcastd m7, [base+pw_2048] ++ pmovzxbd m10, [base+deint_shuf4] ++ add wq, r7 ++ psrlw xm8, xm7, 4 ; pw_128 ++ psubb xm8, xm0 ++ vpbroadcastb m8, xm8 ++ W_MASK 0, 4 ++ lea stride3q, [strideq*3] ++ jmp wq ++.w4: ++ vextracti128 xm1, m0, 1 ++ movd [dstq+strideq*0], xm0 ++ pextrd [dstq+strideq*1], xm0, 1 ++ movd [dstq+strideq*2], xm1 ++ pextrd [dstq+stride3q ], xm1, 1 ++ cmp hd, 8 ++ jl .w4_end ++ lea dstq, [dstq+strideq*4] ++ pextrd [dstq+strideq*0], xm0, 2 ++ pextrd [dstq+strideq*1], xm0, 3 ++ pextrd [dstq+strideq*2], xm1, 2 ++ pextrd [dstq+stride3q ], xm1, 3 ++ jg .w4_h16 ++.w4_end: ++ vextracti128 xm5, m4, 1 ++ packuswb xm4, xm5 ++ psubb xm5, xm8, xm4 ++ pavgb xm5, xm9 ++ pshufd xm5, xm5, q3120 ++ mova [maskq], xm5 ++ RET ++.w4_h16: ++ W_MASK 2, 5 ++ lea dstq, [dstq+strideq*4] ++ packuswb m4, m5 ++ psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermd m5, m10, m5 ++ vextracti128 xm1, m0, 1 ++ movd [dstq+strideq*0], xm0 ++ pextrd [dstq+strideq*1], xm0, 1 ++ movd [dstq+strideq*2], xm1 ++ pextrd [dstq+stride3q ], xm1, 1 ++ lea dstq, [dstq+strideq*4] ++ pextrd [dstq+strideq*0], xm0, 2 ++ pextrd [dstq+strideq*1], xm0, 3 ++ pextrd [dstq+strideq*2], xm1, 2 ++ pextrd [dstq+stride3q ], xm1, 3 ++ mova [maskq], m5 ++ RET ++.w8_loop: ++ add tmp1q, 32*2 ++ add tmp2q, 32*2 ++ W_MASK 0, 4 ++ lea dstq, [dstq+strideq*4] ++ add maskq, 16 ++.w8: ++ vextracti128 xm5, m4, 1 ++ vextracti128 xm1, m0, 1 ++ packuswb xm4, xm5 ++ psubb xm5, xm8, xm4 ++ pavgb xm5, xm9 ++ pshufd xm5, xm5, q3120 ++ movq [dstq+strideq*0], xm0 ++ movq [dstq+strideq*1], xm1 ++ movhps [dstq+strideq*2], xm0 ++ movhps [dstq+stride3q ], xm1 ++ mova [maskq], xm5 ++ sub hd, 4 ++ jg .w8_loop ++ RET ++.w16_loop: ++ add tmp1q, 32*4 ++ add tmp2q, 32*4 ++ W_MASK 0, 4 ++ lea dstq, [dstq+strideq*4] ++ add maskq, 32 ++.w16: ++ vpermq m0, m0, q3120 ++ mova [dstq+strideq*0], xm0 ++ vextracti128 [dstq+strideq*1], m0, 1 ++ W_MASK 2, 5 ++ packuswb m4, m5 ++ 
psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermq m0, m0, q3120 ++ vpermd m5, m10, m5 ++ mova [dstq+strideq*2], xm0 ++ vextracti128 [dstq+stride3q ], m0, 1 ++ mova [maskq], m5 ++ sub hd, 4 ++ jg .w16_loop ++ RET ++.w32_loop: ++ add tmp1q, 32*4 ++ add tmp2q, 32*4 ++ W_MASK 0, 4 ++ lea dstq, [dstq+strideq*2] ++ add maskq, 32 ++.w32: ++ vpermq m0, m0, q3120 ++ mova [dstq+strideq*0], m0 ++ W_MASK 2, 5 ++ packuswb m4, m5 ++ psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermq m0, m0, q3120 ++ vpermd m5, m10, m5 ++ mova [dstq+strideq*1], m0 ++ mova [maskq], m5 ++ sub hd, 2 ++ jg .w32_loop ++ RET ++.w64_loop: ++ add tmp1q, 32*4 ++ add tmp2q, 32*4 ++ W_MASK 0, 4 ++ add dstq, strideq ++ add maskq, 32 ++.w64: ++ vpermq m0, m0, q3120 ++ mova [dstq+32*0], m0 ++ W_MASK 2, 5 ++ packuswb m4, m5 ++ psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermq m0, m0, q3120 ++ vpermd m5, m10, m5 ++ mova [dstq+32*1], m0 ++ mova [maskq], m5 ++ dec hd ++ jg .w64_loop ++ RET ++.w128_loop: ++ add tmp1q, 32*8 ++ add tmp2q, 32*8 ++ W_MASK 0, 4 ++ add dstq, strideq ++ add maskq, 32*2 ++.w128: ++ vpermq m0, m0, q3120 ++ mova [dstq+32*0], m0 ++ W_MASK 2, 5 ++ packuswb m4, m5 ++ psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermq m0, m0, q3120 ++ vpermd m5, m10, m5 ++ mova [dstq+32*1], m0 ++ mova [maskq+32*0], m5 ++ W_MASK 4, 4 ++ vpermq m0, m0, q3120 ++ mova [dstq+32*2], m0 ++ W_MASK 6, 5 ++ packuswb m4, m5 ++ psubb m5, m8, m4 ++ pavgb m5, m9 ++ vpermq m0, m0, q3120 ++ vpermd m5, m10, m5 ++ mova [dstq+32*3], m0 ++ mova [maskq+32*1], m5 ++ dec hd ++ jg .w128_loop ++ RET ++ ++cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 ++%define base r7-w_mask_444_avx2_table ++ lea r7, [w_mask_444_avx2_table] ++ tzcnt wd, wm ++ movifnidn hd, hm ++ mov maskq, maskmp ++ movsxd wq, dword [r7+wq*4] ++ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 ++ vpbroadcastd m7, [base+pw_2048] ++ vpbroadcastd m5, [base+pb_64] ++ add wq, r7 ++ W_MASK 0, 4, 1 ++ lea stride3q, [strideq*3] ++ jmp wq ++.w4: ++ vextracti128 xm1, m0, 1 ++ movd [dstq+strideq*0], xm0 ++ pextrd [dstq+strideq*1], xm0, 1 ++ movd [dstq+strideq*2], xm1 ++ pextrd [dstq+stride3q ], xm1, 1 ++ mova [maskq+32*0], m4 ++ cmp hd, 8 ++ jl .w4_end ++ lea dstq, [dstq+strideq*4] ++ pextrd [dstq+strideq*0], xm0, 2 ++ pextrd [dstq+strideq*1], xm0, 3 ++ pextrd [dstq+strideq*2], xm1, 2 ++ pextrd [dstq+stride3q ], xm1, 3 ++ je .w4_end ++ W_MASK 2, 4, 1 ++ lea dstq, [dstq+strideq*4] ++ vextracti128 xm1, m0, 1 ++ movd [dstq+strideq*0], xm0 ++ pextrd [dstq+strideq*1], xm0, 1 ++ movd [dstq+strideq*2], xm1 ++ pextrd [dstq+stride3q ], xm1, 1 ++ lea dstq, [dstq+strideq*4] ++ pextrd [dstq+strideq*0], xm0, 2 ++ pextrd [dstq+strideq*1], xm0, 3 ++ pextrd [dstq+strideq*2], xm1, 2 ++ pextrd [dstq+stride3q ], xm1, 3 ++ mova [maskq+32*1], m4 ++.w4_end: ++ RET ++.w8_loop: ++ add tmp1q, 32*2 ++ add tmp2q, 32*2 ++ W_MASK 0, 4, 1 ++ lea dstq, [dstq+strideq*4] ++ add maskq, 32 ++.w8: ++ vextracti128 xm1, m0, 1 ++ movq [dstq+strideq*0], xm0 ++ movq [dstq+strideq*1], xm1 ++ movhps [dstq+strideq*2], xm0 ++ movhps [dstq+stride3q ], xm1 ++ mova [maskq], m4 ++ sub hd, 4 ++ jg .w8_loop ++ RET ++.w16_loop: ++ add tmp1q, 32*2 ++ add tmp2q, 32*2 ++ W_MASK 0, 4, 1 ++ lea dstq, [dstq+strideq*2] ++ add maskq, 32 ++.w16: ++ vpermq m0, m0, q3120 ++ mova [dstq+strideq*0], xm0 ++ vextracti128 [dstq+strideq*1], m0, 1 ++ mova [maskq], m4 ++ sub hd, 2 ++ jg .w16_loop ++ RET ++.w32_loop: ++ add tmp1q, 32*2 ++ add tmp2q, 32*2 ++ W_MASK 0, 4, 1 ++ add dstq, strideq ++ add maskq, 32 ++.w32: ++ vpermq m0, m0, q3120 ++ mova [dstq], m0 ++ mova 
[maskq], m4 ++ dec hd ++ jg .w32_loop ++ RET ++.w64_loop: ++ add tmp1q, 32*4 ++ add tmp2q, 32*4 ++ W_MASK 0, 4, 1 ++ add dstq, strideq ++ add maskq, 32*2 ++.w64: ++ vpermq m0, m0, q3120 ++ mova [dstq+32*0], m0 ++ mova [maskq+32*0], m4 ++ W_MASK 2, 4, 1 ++ vpermq m0, m0, q3120 ++ mova [dstq+32*1], m0 ++ mova [maskq+32*1], m4 ++ dec hd ++ jg .w64_loop ++ RET ++.w128_loop: ++ add tmp1q, 32*8 ++ add tmp2q, 32*8 ++ W_MASK 0, 4, 1 ++ add dstq, strideq ++ add maskq, 32*4 ++.w128: ++ vpermq m0, m0, q3120 ++ mova [dstq+32*0], m0 ++ mova [maskq+32*0], m4 ++ W_MASK 2, 4, 1 ++ vpermq m0, m0, q3120 ++ mova [dstq+32*1], m0 ++ mova [maskq+32*1], m4 ++ W_MASK 4, 4, 1 ++ vpermq m0, m0, q3120 ++ mova [dstq+32*2], m0 ++ mova [maskq+32*2], m4 ++ W_MASK 6, 4, 1 ++ vpermq m0, m0, q3120 ++ mova [dstq+32*3], m0 ++ mova [maskq+32*3], m4 ++ dec hd ++ jg .w128_loop ++ RET ++ + cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask + %define base r6-blend_avx2_table + lea r6, [blend_avx2_table] +diff --git third_party/dav1d/src/x86/mc_init_tmpl.c third_party/dav1d/src/x86/mc_init_tmpl.c +index 608e0c6da537..0e33cd4960b4 100644 +--- third_party/dav1d/src/x86/mc_init_tmpl.c ++++ third_party/dav1d/src/x86/mc_init_tmpl.c +@@ -29,27 +29,46 @@ + #include "src/mc.h" + + decl_mc_fn(dav1d_put_8tap_regular_avx2); ++decl_mc_fn(dav1d_put_8tap_regular_ssse3); + decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2); ++decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3); + decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2); ++decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3); + decl_mc_fn(dav1d_put_8tap_smooth_avx2); ++decl_mc_fn(dav1d_put_8tap_smooth_ssse3); + decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2); ++decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3); + decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2); ++decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3); + decl_mc_fn(dav1d_put_8tap_sharp_avx2); ++decl_mc_fn(dav1d_put_8tap_sharp_ssse3); + decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2); ++decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3); + decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2); ++decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3); + decl_mc_fn(dav1d_put_bilin_avx2); + decl_mc_fn(dav1d_put_bilin_ssse3); + + decl_mct_fn(dav1d_prep_8tap_regular_avx2); ++decl_mct_fn(dav1d_prep_8tap_regular_ssse3); + decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); ++decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); + decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); ++decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); + decl_mct_fn(dav1d_prep_8tap_smooth_avx2); ++decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); + decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); ++decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); + decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); ++decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); + decl_mct_fn(dav1d_prep_8tap_sharp_avx2); ++decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); + decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); ++decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); + decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); ++decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); + decl_mct_fn(dav1d_prep_bilin_avx2); ++decl_mct_fn(dav1d_prep_bilin_ssse3); + + decl_avg_fn(dav1d_avg_avx2); + decl_avg_fn(dav1d_avg_ssse3); +@@ -59,6 +78,8 @@ decl_mask_fn(dav1d_mask_avx2); + decl_mask_fn(dav1d_mask_ssse3); + decl_w_mask_fn(dav1d_w_mask_420_avx2); + decl_w_mask_fn(dav1d_w_mask_420_ssse3); ++decl_w_mask_fn(dav1d_w_mask_422_avx2); ++decl_w_mask_fn(dav1d_w_mask_444_avx2); + decl_blend_fn(dav1d_blend_avx2); + decl_blend_fn(dav1d_blend_ssse3); + 
decl_blend_dir_fn(dav1d_blend_v_avx2); +@@ -85,6 +106,26 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + + #if BITDEPTH == 8 + init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); ++ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); ++ ++ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); ++ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + + c->avg = dav1d_avg_ssse3; + c->w_avg = dav1d_w_avg_ssse3; +@@ -125,6 +166,8 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { + c->avg = dav1d_avg_avx2; + c->w_avg = dav1d_w_avg_avx2; + c->mask = dav1d_mask_avx2; ++ c->w_mask[0] = dav1d_w_mask_444_avx2; ++ c->w_mask[1] = dav1d_w_mask_422_avx2; + c->w_mask[2] = dav1d_w_mask_420_avx2; + c->blend = dav1d_blend_avx2; + c->blend_v = dav1d_blend_v_avx2; +diff --git third_party/dav1d/src/x86/mc_ssse3.asm third_party/dav1d/src/x86/mc_ssse3.asm +index e9eafc56d40c..abca6cf6379e 100644 +--- third_party/dav1d/src/x86/mc_ssse3.asm ++++ third_party/dav1d/src/x86/mc_ssse3.asm +@@ -46,6 +46,11 @@ obmc_masks: db 0, 0, 0, 0 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + ++subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 ++ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 ++subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ++subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ++subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 + bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +@@ -53,10 +58,19 @@ blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 + pb_64: times 16 db 64 + pw_8: times 8 dw 8 + pw_26: times 8 dw 26 +-pw_258: times 8 dw 258 ++pw_34: times 8 dw 34 + pw_512: times 8 dw 512 + pw_1024: times 8 dw 1024 + pw_2048: times 8 dw 2048 ++pw_6903: times 8 dw 6903 ++pw_8192: times 8 dw 8192 ++pd_32: times 4 dd 32 ++pd_512: times 4 dd 512 ++ ++pw_258: times 2 dw 258 ++ ++cextern mc_subpel_filters ++%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + + %macro BIDIR_JMP_TABLE 1-* + ;evaluated at definition time (in loop below) +@@ -90,8 +104,10 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 + %endmacro + + %xdefine put_ssse3 
mangle(private_prefix %+ _put_bilin_ssse3.put) ++%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep) + + BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 ++BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + + %macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3) +@@ -125,7 +141,10 @@ BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 + %endif + %endmacro + ++HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 ++HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 + HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 ++HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 + + %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +@@ -134,13 +153,11 @@ SECTION .text + INIT_XMM ssse3 + + %if ARCH_X86_32 +-DECLARE_REG_TMP 1 +-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak +-%define base t0-put_ssse3 ++ DECLARE_REG_TMP 1 ++ %define base t0-put_ssse3 + %else +-DECLARE_REG_TMP 7 +-%define base 0 +-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy ++ DECLARE_REG_TMP 7 ++ %define base 0 + %endif + ; + %macro RESTORE_DSQ_32 1 +@@ -149,6 +166,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + %endif + %endmacro + ; ++cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak + movifnidn mxyd, r6m ; mx + LEA t0, put_ssse3 + tzcnt wd, wm +@@ -266,7 +284,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + imul mxyd, 0xff01 + mova m4, [base+bilin_h_shuf8] + mova m0, [base+bilin_h_shuf4] +- WIN64_SPILL_XMM 7 + add mxyd, 16 << 8 + movd m5, mxyd + mov mxyd, r7m ; my +@@ -275,7 +292,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + test mxyd, mxyd + jnz .hv + movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] +- mova m6, [base+pw_2048] ++ mova m3, [base+pw_2048] + add wq, t0 + RESTORE_DSQ_32 t0 + jmp wq +@@ -288,7 +305,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + punpckldq m0, m1 + pshufb m0, m4 + pmaddubsw m0, m5 +- pmulhrsw m0, m6 ++ pmulhrsw m0, m3 + packuswb m0, m0 + movd r6d, m0 + mov [dstq+dsq*0], r6w +@@ -304,10 +321,10 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + lea srcq, [srcq+ssq*2] + pshufb m4, m0 + pmaddubsw m4, m5 +- pmulhrsw m4, m6 ++ pmulhrsw m4, m3 + packuswb m4, m4 + movd [dstq+dsq*0], m4 +- pshufd m4, m4, q0101 ++ psrlq m4, 32 + movd [dstq+dsq*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 +@@ -321,8 +338,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 +@@ -338,8 +355,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq +@@ -349,25 +366,25 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + .h_w32: + movu m0, [srcq+mmsize*0+8*0] + movu m1, [srcq+mmsize*0+8*1] +- movu m2, [srcq+mmsize*1+8*0] +- movu m3, [srcq+mmsize*1+8*1] +- add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 +- pshufb m2, m4 +- pshufb m3, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmaddubsw m2, m5 +- pmaddubsw m3, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 +- pmulhrsw m2, m6 +- pmulhrsw m3, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 +- packuswb m2, m3 ++ movu m1, [srcq+mmsize*1+8*0] ++ movu m2, [srcq+mmsize*1+8*1] ++ add 
srcq, ssq ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ pmulhrsw m1, m3 ++ pmulhrsw m2, m3 ++ packuswb m1, m2 + mova [dstq+16*0], m0 +- mova [dstq+16*1], m2 ++ mova [dstq+16*1], m1 + add dstq, dsq + dec hd + jg .h_w32 +@@ -381,8 +398,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*3], m0 + add r6, 16 +@@ -401,8 +418,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- pmulhrsw m0, m6 +- pmulhrsw m1, m6 ++ pmulhrsw m0, m3 ++ pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*7], m0 + add r6, 16 +@@ -414,15 +431,13 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + RET + .v: + movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] +- %assign stack_offset stack_offset - stack_size_padded +- WIN64_SPILL_XMM 8 + imul mxyd, 0xff01 +- mova m7, [base+pw_2048] ++ mova m5, [base+pw_2048] + add mxyd, 16 << 8 + add wq, t0 +- movd m6, mxyd +- pshuflw m6, m6, q0000 +- punpcklqdq m6, m6 ++ movd m4, mxyd ++ pshuflw m4, m4, q0000 ++ punpcklqdq m4, m4 + RESTORE_DSQ_32 t0 + jmp wq + .v_w2: +@@ -433,8 +448,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshuflw m2, m0, q2301 + pinsrw m0, [srcq+ssq*0], 0 ; 2 1 + punpcklbw m1, m0, m2 +- pmaddubsw m1, m6 +- pmulhrsw m1, m7 ++ pmaddubsw m1, m4 ++ pmulhrsw m1, m5 + packuswb m1, m1 + movd r6d, m1 + mov [dstq+dsq*1], r6w +@@ -453,8 +468,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movd m0, [srcq+ssq*0] + punpckldq m1, m0 ; 1 2 + punpcklbw m1, m2 +- pmaddubsw m1, m6 +- pmulhrsw m1, m7 ++ pmaddubsw m1, m4 ++ pmulhrsw m1, m5 + packuswb m1, m1 + movd [dstq+dsq*0], m1 + psrlq m1, 32 +@@ -467,20 +482,18 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + .v_w8: + movq m0, [srcq+ssq*0] + .v_w8_loop: +- movddup m2, [srcq+ssq*1] ++ movq m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +- punpcklqdq m3, m0, m2 ; 0 1 m2qh:m0ql +- movddup m0, [srcq+ssq*0] +- punpcklqdq m4, m2, m0 ; 1 2 m0qh:m2ql +- punpcklbw m1, m4, m3 +- punpckhbw m4, m3 +- pmaddubsw m1, m6 +- pmaddubsw m4, m6 +- pmulhrsw m1, m7 +- pmulhrsw m4, m7 +- packuswb m1, m4 +- movq [dstq+dsq*0], m1 +- movhps [dstq+dsq*1], m1 ++ punpcklbw m1, m3, m0 ++ movq m0, [srcq+ssq*0] ++ punpcklbw m2, m0, m3 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 ++ packuswb m1, m2 ++ movq [dstq+dsq*0], m1 ++ movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +@@ -489,25 +502,25 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + %macro PUT_BILIN_V_W16 0 + movu m0, [srcq+ssq*0] + %%loop: +- movu m4, [srcq+ssq*1] ++ movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +- punpcklbw m1, m4, m0 +- punpckhbw m3, m4, m0 ++ punpcklbw m1, m3, m0 ++ punpckhbw m2, m3, m0 + movu m0, [srcq+ssq*0] +- punpcklbw m2, m0, m4 +- pmaddubsw m1, m6 +- pmaddubsw m3, m6 +- pmulhrsw m1, m7 +- pmulhrsw m3, m7 +- packuswb m1, m3 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 ++ packuswb m1, m2 + mova [dstq+dsq*0], m1 +- punpckhbw m3, m0, m4 +- pmaddubsw m2, m6 +- pmaddubsw m3, m6 +- pmulhrsw m2, m7 +- pmulhrsw m3, m7 +- packuswb m2, m3 +- mova [dstq+dsq*1], m2 ++ punpcklbw m1, m0, m3 ++ punpckhbw m2, m0, m3 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmulhrsw m1, m5 ++ pmulhrsw m2, m5 ++ packuswb m1, m2 ++ mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub 
hd, 2 + jg %%loop +@@ -549,7 +562,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] +- %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + mova m7, [base+pw_2048] +@@ -579,10 +591,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x]) + pmulhrsw m1, m7 + packuswb m1, m1 ++%if ARCH_X86_64 ++ movq r6, m1 ++%else + pshuflw m1, m1, q2020 + movd r6d, m1 ++%endif + mov [dstq+dsq*0], r6w +- shr r6d, 16 ++ shr r6, gprsize*4 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 +@@ -595,9 +611,9 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pshufb m0, m4 + pmaddubsw m0, m5 + .hv_w4_loop: +- movq m1, [srcq+ssq*1] +- lea srcq, [srcq+ssq*2] +- movhps m1, [srcq+ssq*0] ++ movq m1, [srcq+ssq*1] ++ lea srcq, [srcq+ssq*2] ++ movhps m1, [srcq+ssq*0] + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + shufps m2, m0, m1, q1032 ; 0 1 +@@ -617,21 +633,21 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + RET + .hv_w8: + RESTORE_DSQ_32 t0 +- movu m0, [srcq+ssq*0+8*0] ++ movu m0, [srcq+ssq*0+8*0] + pshufb m0, m4 + pmaddubsw m0, m5 + .hv_w8_loop: +- movu m2, [srcq+ssq*1+8*0] +- lea srcq, [srcq+ssq*2] +- movu m3, [srcq+ssq*0+8*0] ++ movu m2, [srcq+ssq*1+8*0] ++ lea srcq, [srcq+ssq*2] + pshufb m2, m4 +- pshufb m3, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m0 +- pmaddubsw m0, m3, m5 ++ movu m0, [srcq+ssq*0+8*0] ++ pshufb m0, m4 ++ pmaddubsw m0, m5 + psubw m3, m0, m2 + paddw m3, m3 + pmulhw m3, m6 +@@ -639,79 +655,69 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + pmulhrsw m1, m7 + pmulhrsw m3, m7 + packuswb m1, m3 +- movq [dstq+dsq*0], m1 +- movhps [dstq+dsq*1], m1 ++ movq [dstq+dsq*0], m1 ++ movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +- ; +- ; 32bit has ssq, dsq free +-%macro PUT_BILIN_HV_W16 0 ++.hv_w16: ++ xor t0d, t0d ++.hv_w16gt: ++ mov r4, dstq ++ mov r6, srcq ++ %if WIN64 ++ movaps r4m, xmm8 ++ %endif ++.hv_w16_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +- %if WIN64 +- movaps r4m, xmm8 +- %endif +-%%loop: ++.hv_w16_loop: + %if ARCH_X86_32 +- %define m3back [dstq] +- %define dsqval dsm ++ %define m0tmp [dstq] + %else +- %define m3back m8 +- %define dsqval dsq ++ %define m0tmp m8 + %endif + add srcq, ssq +- movu m2, [srcq+8*1] ++ movu m2, [srcq+8*0] ++ movu m3, [srcq+8*1] + pshufb m2, m4 ++ pshufb m3, m4 + pmaddubsw m2, m5 +- psubw m3, m2, m1 ++ pmaddubsw m3, m5 ++ mova m0tmp, m2 ++ psubw m2, m0 ++ paddw m2, m2 ++ pmulhw m2, m6 ++ paddw m2, m0 ++ mova m0, m3 ++ psubw m3, m1 + paddw m3, m3 + pmulhw m3, m6 + paddw m3, m1 +- mova m1, m2 +- pmulhrsw m3, m7 +- mova m3back, m3 +- movu m2, [srcq+8*0] +- pshufb m2, m4 +- pmaddubsw m2, m5 +- psubw m3, m2, m0 +- paddw m3, m3 +- pmulhw m3, m6 +- paddw m3, m0 +- mova m0, m2 ++ mova m1, m0 ++ mova m0, m0tmp ++ pmulhrsw m2, m7 + pmulhrsw m3, m7 +- packuswb m3, m3back +- mova [dstq], m3 +- add dstq, dsqval ++ packuswb m2, m3 ++ mova [dstq], m2 ++ add dstq, dsmp + dec hd +- jg %%loop +- %if WIN64 +- movaps xmm8, r4m +- %endif +- %undef m3back +- %undef dsqval +-%endmacro +- ; +-.hv_w16: +- PUT_BILIN_HV_W16 +- RET +-.hv_w16gt: +- mov 
r4, dstq +- mov r6, srcq +-.hv_w16gt_loop: +- PUT_BILIN_HV_W16 +- mov hw, t0w ++ jg .hv_w16_loop ++ movzx hd, t0w + add r4, mmsize + add r6, mmsize + mov dstq, r4 + mov srcq, r6 + sub t0d, 1<<16 +- jg .hv_w16gt_loop ++ jg .hv_w16_loop0 ++ %if WIN64 ++ movaps xmm8, r4m ++ %endif + RET + .hv_w32: + lea t0d, [hq+(1<<16)] +@@ -723,112 +729,2694 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy + lea t0d, [hq+(7<<16)] + jmp .hv_w16gt + +-%if WIN64 +-DECLARE_REG_TMP 6, 4 ++DECLARE_REG_TMP 3, 5, 6 ++%if ARCH_X86_32 ++ %define base t2-prep_ssse3 + %else +-DECLARE_REG_TMP 6, 7 ++ %define base 0 + %endif +- +-%macro BIDIR_FN 1 ; op +- %1 0 ++cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 ++ movifnidn mxyd, r5m ; mx ++ LEA t2, prep_ssse3 ++ tzcnt wd, wm ++ movifnidn hd, hm ++ test mxyd, mxyd ++ jnz .h ++ mov mxyd, r6m ; my ++ test mxyd, mxyd ++ jnz .v ++.prep: ++ movzx wd, word [t2+wq*2+table_offset(prep,)] ++ add wq, t2 + lea stride3q, [strideq*3] + jmp wq +-.w4_loop: +- %1_INC_PTR 2 +- %1 0 +- lea dstq, [dstq+strideq*4] +-.w4: ; tile 4x +- movd [dstq ], m0 ; copy dw[0] +- pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] +- movd [dstq+strideq*1], m1 ; copy dw[1] +- punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] +- movd [dstq+strideq*2], m0 ; dw[2] +- psrlq m0, 32 ; shift right in dw[3] +- movd [dstq+stride3q ], m0 ; copy ++.prep_w4: ++ movd m0, [srcq+strideq*0] ++ movd m1, [srcq+strideq*1] ++ movd m2, [srcq+strideq*2] ++ movd m3, [srcq+stride3q ] ++ punpckldq m0, m1 ++ punpckldq m2, m3 ++ lea srcq, [srcq+strideq*4] ++ pxor m1, m1 ++ punpcklbw m0, m1 ++ punpcklbw m2, m1 ++ psllw m0, 4 ++ psllw m2, 4 ++ mova [tmpq+mmsize*0], m0 ++ mova [tmpq+mmsize*1], m2 ++ add tmpq, 32 + sub hd, 4 +- jg .w4_loop ++ jg .prep_w4 + RET +-.w8_loop: +- %1_INC_PTR 2 +- %1 0 +- lea dstq, [dstq+strideq*2] +-.w8: +- movq [dstq ], m0 +- movhps [dstq+strideq*1], m0 ++.prep_w8: ++ movq m0, [srcq+strideq*0] ++ movq m1, [srcq+strideq*1] ++ movq m2, [srcq+strideq*2] ++ movq m3, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ pxor m4, m4 ++ punpcklbw m0, m4 ++ punpcklbw m1, m4 ++ punpcklbw m2, m4 ++ punpcklbw m3, m4 ++ psllw m0, 4 ++ psllw m1, 4 ++ psllw m2, 4 ++ psllw m3, 4 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 ++ sub hd, 4 ++ jg .prep_w8 ++ RET ++.prep_w16: ++ movq m0, [srcq+strideq*0+8*0] ++ movq m1, [srcq+strideq*0+8*1] ++ movq m2, [srcq+strideq*1+8*0] ++ movq m3, [srcq+strideq*1+8*1] ++ lea srcq, [srcq+strideq*2] ++ pxor m4, m4 ++ punpcklbw m0, m4 ++ punpcklbw m1, m4 ++ punpcklbw m2, m4 ++ punpcklbw m3, m4 ++ psllw m0, 4 ++ psllw m1, 4 ++ psllw m2, 4 ++ psllw m3, 4 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 + sub hd, 2 +- jg .w8_loop ++ jg .prep_w16 + RET +-.w16_loop: +- %1_INC_PTR 2 +- %1 0 +- lea dstq, [dstq+strideq] +-.w16: +- mova [dstq ], m0 +- dec hd +- jg .w16_loop ++.prep_w16gt: ++ mov t1q, srcq ++ mov r3q, t2q ++.prep_w16gt_hloop: ++ movq m0, [t1q+8*0] ++ movq m1, [t1q+8*1] ++ movq m2, [t1q+8*2] ++ movq m3, [t1q+8*3] ++ pxor m4, m4 ++ punpcklbw m0, m4 ++ punpcklbw m1, m4 ++ punpcklbw m2, m4 ++ punpcklbw m3, m4 ++ psllw m0, 4 ++ psllw m1, 4 ++ psllw m2, 4 ++ psllw m3, 4 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 ++ add t1q, 32 ++ sub r3q, 1 ++ jg .prep_w16gt_hloop ++ lea srcq, [srcq+strideq] ++ sub hd, 1 ++ jg .prep_w16gt + RET +-.w32_loop: +- %1_INC_PTR 4 +- %1 0 +- lea dstq, 
[dstq+strideq] +-.w32: +- mova [dstq ], m0 +- %1 2 +- mova [dstq + 16 ], m0 +- dec hd +- jg .w32_loop ++.prep_w32: ++ mov t2q, 1 ++ jmp .prep_w16gt ++.prep_w64: ++ mov t2q, 2 ++ jmp .prep_w16gt ++.prep_w128: ++ mov t2q, 4 ++ jmp .prep_w16gt ++.h: ++ ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ++ ; = (16 - mx) * src[x] + mx * src[x + 1] ++ imul mxyd, 0xff01 ++ mova m4, [base+bilin_h_shuf8] ++ add mxyd, 16 << 8 ++ movd xm5, mxyd ++ mov mxyd, r6m ; my ++ pshuflw m5, m5, q0000 ++ punpcklqdq m5, m5 ++ test mxyd, mxyd ++ jnz .hv ++%if ARCH_X86_32 ++ mov t1, t2 ; save base reg for w4 ++%endif ++ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] ++ add wq, t2 ++ lea stride3q, [strideq*3] ++ jmp wq ++.h_w4: ++%if ARCH_X86_32 ++ mova m4, [t1-prep_ssse3+bilin_h_shuf4] ++%else ++ mova m4, [bilin_h_shuf4] ++%endif ++.h_w4_loop: ++ movq m0, [srcq+strideq*0] ++ movhps m0, [srcq+strideq*1] ++ movq m1, [srcq+strideq*2] ++ movhps m1, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ pshufb m0, m4 ++ pmaddubsw m0, m5 ++ pshufb m1, m4 ++ pmaddubsw m1, m5 ++ mova [tmpq+0 ], m0 ++ mova [tmpq+16], m1 ++ add tmpq, 32 ++ sub hd, 4 ++ jg .h_w4_loop + RET +-.w64_loop: +- %1_INC_PTR 8 +- %1 0 +- add dstq, strideq +-.w64: +- %assign i 0 +- %rep 4 +- mova [dstq + i*16 ], m0 +- %assign i i+1 +- %if i < 4 +- %1 2*i +- %endif +- %endrep +- dec hd +- jg .w64_loop ++.h_w8: ++ movu m0, [srcq+strideq*0] ++ movu m1, [srcq+strideq*1] ++ movu m2, [srcq+strideq*2] ++ movu m3, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ pshufb m0, m4 ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pshufb m3, m4 ++ pmaddubsw m0, m5 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ pmaddubsw m3, m5 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 ++ sub hd, 4 ++ jg .h_w8 + RET +-.w128_loop: +- %1_INC_PTR 16 +- %1 0 +- add dstq, strideq +-.w128: +- %assign i 0 +- %rep 8 +- mova [dstq + i*16 ], m0 +- %assign i i+1 +- %if i < 8 +- %1 2*i +- %endif +- %endrep +- dec hd +- jg .w128_loop ++.h_w16: ++ movu m0, [srcq+strideq*0+8*0] ++ movu m1, [srcq+strideq*0+8*1] ++ movu m2, [srcq+strideq*1+8*0] ++ movu m3, [srcq+strideq*1+8*1] ++ lea srcq, [srcq+strideq*2] ++ pshufb m0, m4 ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pshufb m3, m4 ++ pmaddubsw m0, m5 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ pmaddubsw m3, m5 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 ++ sub hd, 2 ++ jg .h_w16 + RET +-%endmacro +- +-%macro AVG 1 ; src_offset +- ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel +- mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 +- paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 +- mova m1, [tmp1q+(%1+1)*mmsize] +- paddw m1, [tmp2q+(%1+1)*mmsize] +- pmulhrsw m0, m2 +- pmulhrsw m1, m2 +- packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit +-%endmacro +- +-%macro AVG_INC_PTR 1 +- add tmp1q, %1*mmsize +- add tmp2q, %1*mmsize +-%endmacro +- +-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 ++.h_w16gt: ++ mov t1q, srcq ++ mov r3q, t2q ++.h_w16gt_hloop: ++ movu m0, [t1q+8*0] ++ movu m1, [t1q+8*1] ++ movu m2, [t1q+8*2] ++ movu m3, [t1q+8*3] ++ pshufb m0, m4 ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pshufb m3, m4 ++ pmaddubsw m0, m5 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ pmaddubsw m3, m5 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ mova [tmpq+16*2], m2 ++ mova [tmpq+16*3], m3 ++ add tmpq, 16*4 ++ add t1q, 32 ++ sub r3q, 1 ++ jg .h_w16gt_hloop ++ lea srcq, [srcq+strideq] ++ sub hd, 1 ++ 
jg .h_w16gt ++ RET ++.h_w32: ++ mov t2q, 1 ++ jmp .h_w16gt ++.h_w64: ++ mov t2q, 2 ++ jmp .h_w16gt ++.h_w128: ++ mov t2q, 4 ++ jmp .h_w16gt ++.v: ++ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] ++ imul mxyd, 0xff01 ++ add mxyd, 16 << 8 ++ add wq, t2 ++ lea stride3q, [strideq*3] ++ movd m5, mxyd ++ pshuflw m5, m5, q0000 ++ punpcklqdq m5, m5 ++ jmp wq ++.v_w4: ++ movd m0, [srcq+strideq*0] ++.v_w4_loop: ++ movd m1, [srcq+strideq*1] ++ movd m2, [srcq+strideq*2] ++ movd m3, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ punpcklwd m0, m1 ; 0 1 _ _ ++ punpcklwd m1, m2 ; 1 2 _ _ ++ punpcklbw m1, m0 ++ pmaddubsw m1, m5 ++ pshufd m1, m1, q3120 ++ mova [tmpq+16*0], m1 ++ movd m0, [srcq+strideq*0] ++ punpcklwd m2, m3 ; 2 3 _ _ ++ punpcklwd m3, m0 ; 3 4 _ _ ++ punpcklbw m3, m2 ++ pmaddubsw m3, m5 ++ pshufd m3, m3, q3120 ++ mova [tmpq+16*1], m3 ++ add tmpq, 32 ++ sub hd, 4 ++ jg .v_w4_loop ++ RET ++.v_w8: ++ movq m0, [srcq+strideq*0] ++.v_w8_loop: ++ movq m1, [srcq+strideq*2] ++ movq m2, [srcq+strideq*1] ++ movq m3, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ shufpd m4, m0, m1, 0x0c ; 0 2 ++ movq m0, [srcq+strideq*0] ++ shufpd m2, m3, 0x0c ; 1 3 ++ shufpd m1, m0, 0x0c ; 2 4 ++ punpcklbw m3, m2, m4 ++ pmaddubsw m3, m5 ++ mova [tmpq+16*0], m3 ++ punpckhbw m3, m2, m4 ++ pmaddubsw m3, m5 ++ mova [tmpq+16*2], m3 ++ punpcklbw m3, m1, m2 ++ punpckhbw m1, m2 ++ pmaddubsw m3, m5 ++ pmaddubsw m1, m5 ++ mova [tmpq+16*1], m3 ++ mova [tmpq+16*3], m1 ++ add tmpq, 16*4 ++ sub hd, 4 ++ jg .v_w8_loop ++ RET ++.v_w16: ++ movu m0, [srcq+strideq*0] ++.v_w16_loop: ++ movu m1, [srcq+strideq*1] ++ movu m2, [srcq+strideq*2] ++ punpcklbw m3, m1, m0 ++ punpckhbw m4, m1, m0 ++ pmaddubsw m3, m5 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*0], m3 ++ mova [tmpq+16*1], m4 ++ punpcklbw m3, m2, m1 ++ punpckhbw m4, m2, m1 ++ pmaddubsw m3, m5 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*2], m3 ++ mova [tmpq+16*3], m4 ++ movu m3, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ movu m0, [srcq+strideq*0] ++ add tmpq, 16*8 ++ punpcklbw m1, m3, m2 ++ punpckhbw m4, m3, m2 ++ pmaddubsw m1, m5 ++ pmaddubsw m4, m5 ++ mova [tmpq-16*4], m1 ++ mova [tmpq-16*3], m4 ++ punpcklbw m1, m0, m3 ++ punpckhbw m2, m0, m3 ++ pmaddubsw m1, m5 ++ pmaddubsw m2, m5 ++ mova [tmpq-16*2], m1 ++ mova [tmpq-16*1], m2 ++ sub hd, 4 ++ jg .v_w16_loop ++ RET ++.v_w32: ++ lea t2d, [hq+(0<<16)] ++ mov t0d, 64 ++.v_w32_start: ++%if ARCH_X86_64 ++ %if WIN64 ++ PUSH r7 ++ %endif ++ mov r7, tmpq ++%endif ++ mov t1, srcq ++.v_w32_loop_h: ++ movu m0, [srcq+strideq*0+16*0] ; 0L ++ movu m1, [srcq+strideq*0+16*1] ; 0U ++.v_w32_loop_v: ++ movu m2, [srcq+strideq*1+16*0] ; 1L ++ movu m3, [srcq+strideq*1+16*1] ; 1U ++ lea srcq, [srcq+strideq*2] ++ punpcklbw m4, m2, m0 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*0], m4 ++ punpckhbw m4, m2, m0 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*1], m4 ++ punpcklbw m4, m3, m1 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*2], m4 ++ punpckhbw m4, m3, m1 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*3], m4 ++ add tmpq, t0q ++ movu m0, [srcq+strideq*0+16*0] ; 2L ++ movu m1, [srcq+strideq*0+16*1] ; 2U ++ punpcklbw m4, m0, m2 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*0], m4 ++ punpckhbw m4, m0, m2 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*1], m4 ++ punpcklbw m4, m1, m3 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*2], m4 ++ punpckhbw m4, m1, m3 ++ pmaddubsw m4, m5 ++ mova [tmpq+16*3], m4 ++ add tmpq, t0q ++ sub hd, 2 ++ jg .v_w32_loop_v ++ movzx hd, t2w ++ add t1, 32 ++ mov srcq, t1 ++%if ARCH_X86_64 ++ add r7, 2*16*2 ++ mov tmpq, r7 ++%else ++ mov tmpq, tmpmp ++ add tmpq, 2*16*2 ++ mov tmpmp, tmpq 
++%endif ++ sub t2d, 1<<16 ++ jg .v_w32_loop_h ++%if WIN64 ++ POP r7 ++%endif ++ RET ++.v_w64: ++ lea t2d, [hq+(1<<16)] ++ mov t0d, 128 ++ jmp .v_w32_start ++.v_w128: ++ lea t2d, [hq+(3<<16)] ++ mov t0d, 256 ++ jmp .v_w32_start ++.hv: ++ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ++ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) ++ %assign stack_offset stack_offset - stack_size_padded ++ WIN64_SPILL_XMM 8 ++ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] ++ shl mxyd, 11 ++ movd xm6, mxyd ++ add wq, t2 ++ pshuflw m6, m6, q0000 ++ punpcklqdq m6, m6 ++%if ARCH_X86_32 ++ mov t1, t2 ; save base reg for w4 ++%endif ++ lea stride3q, [strideq*3] ++ jmp wq ++.hv_w4: ++%if ARCH_X86_32 ++ mova m4, [t1-prep_ssse3+bilin_h_shuf4] ++%else ++ mova m4, [bilin_h_shuf4] ++%endif ++ movq m0, [srcq+strideq*0] ; 0 _ ++ punpcklqdq m0, m0 ++ pshufb m0, m4 ++ pmaddubsw m0, m5 ++.hv_w4_loop: ++ movq m1, [srcq+strideq*1] ++ movhps m1, [srcq+strideq*2] ; 1 _ 2 _ ++ movq m2, [srcq+stride3q ] ++ lea srcq, [srcq+strideq*4] ++ movhps m2, [srcq+strideq*0] ; 3 _ 4 _ ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pmaddubsw m1, m5 ; 1 + 2 + ++ shufpd m3, m0, m1, 0x01 ; 0 + 1 + ++ pmaddubsw m0, m2, m5 ; 3 + 4 + ++ shufpd m2, m1, m0, 0x01 ; 2 + 3 + ++ psubw m1, m3 ++ pmulhrsw m1, m6 ++ paddw m1, m3 ++ psubw m3, m0, m2 ++ pmulhrsw m3, m6 ++ paddw m3, m2 ++ mova [tmpq+16*0], m1 ++ mova [tmpq+16*1], m3 ++ add tmpq, 32 ++ sub hd, 4 ++ jg .hv_w4_loop ++ RET ++.hv_w8: ++ movu m0, [srcq+strideq*0] ++ pshufb m0, m4 ++ pmaddubsw m0, m5 ; 0 + ++.hv_w8_loop: ++ movu m1, [srcq+strideq*1] ; 1 ++ movu m2, [srcq+strideq*2] ; 2 ++ pshufb m1, m4 ++ pshufb m2, m4 ++ pmaddubsw m1, m5 ; 1 + ++ pmaddubsw m2, m5 ; 2 + ++ psubw m3, m1, m0 ; 1-0 ++ pmulhrsw m3, m6 ++ paddw m3, m0 ++ psubw m7, m2, m1 ; 2-1 ++ pmulhrsw m7, m6 ++ paddw m7, m1 ++ mova [tmpq+16*0], m3 ++ mova [tmpq+16*1], m7 ++ movu m1, [srcq+stride3q ] ; 3 ++ lea srcq, [srcq+strideq*4] ++ movu m0, [srcq+strideq*0] ; 4 ++ pshufb m1, m4 ++ pshufb m0, m4 ++ pmaddubsw m1, m5 ; 3 + ++ pmaddubsw m0, m5 ; 4 + ++ psubw m3, m1, m2 ; 3-2 ++ pmulhrsw m3, m6 ++ paddw m3, m2 ++ psubw m7, m0, m1 ; 4-3 ++ pmulhrsw m7, m6 ++ paddw m7, m1 ++ mova [tmpq+16*2], m3 ++ mova [tmpq+16*3], m7 ++ add tmpq, 16*4 ++ sub hd, 4 ++ jg .hv_w8_loop ++ RET ++.hv_w16: ++ lea t2d, [hq+(0<<16)] ++ mov t0d, 32 ++.hv_w16_start: ++%if ARCH_X86_64 ++ %if WIN64 ++ PUSH r7 ++ %endif ++ mov r7, tmpq ++%endif ++ mov t1, srcq ++.hv_w16_loop_h: ++ movu m0, [srcq+strideq*0+8*0] ; 0L ++ movu m1, [srcq+strideq*0+8*1] ; 0U ++ pshufb m0, m4 ++ pshufb m1, m4 ++ pmaddubsw m0, m5 ; 0L + ++ pmaddubsw m1, m5 ; 0U + ++.hv_w16_loop_v: ++ movu m2, [srcq+strideq*1+8*0] ; 1L ++ pshufb m2, m4 ++ pmaddubsw m2, m5 ; 1L + ++ psubw m3, m2, m0 ; 1L-0L ++ pmulhrsw m3, m6 ++ paddw m3, m0 ++ mova [tmpq+16*0], m3 ++ movu m3, [srcq+strideq*1+8*1] ; 1U ++ lea srcq, [srcq+strideq*2] ++ pshufb m3, m4 ++ pmaddubsw m3, m5 ; 1U + ++ psubw m0, m3, m1 ; 1U-0U ++ pmulhrsw m0, m6 ++ paddw m0, m1 ++ mova [tmpq+16*1], m0 ++ add tmpq, t0q ++ movu m0, [srcq+strideq*0+8*0] ; 2L ++ pshufb m0, m4 ++ pmaddubsw m0, m5 ; 2L + ++ psubw m1, m0, m2 ; 2L-1L ++ pmulhrsw m1, m6 ++ paddw m1, m2 ++ mova [tmpq+16*0], m1 ++ movu m1, [srcq+strideq*0+8*1] ; 2U ++ pshufb m1, m4 ++ pmaddubsw m1, m5 ; 2U + ++ psubw m2, m1, m3 ; 2U-1U ++ pmulhrsw m2, m6 ++ paddw m2, m3 ++ mova [tmpq+16*1], m2 ++ add tmpq, t0q ++ sub hd, 2 ++ jg .hv_w16_loop_v ++ movzx hd, t2w ++ add t1, 16 ++ mov srcq, t1 ++%if ARCH_X86_64 ++ add r7, 2*16 ++ mov tmpq, r7 ++%else ++ mov 
tmpq, tmpmp ++ add tmpq, 2*16 ++ mov tmpmp, tmpq ++%endif ++ sub t2d, 1<<16 ++ jg .hv_w16_loop_h ++%if WIN64 ++ POP r7 ++%endif ++ RET ++.hv_w32: ++ lea t2d, [hq+(1<<16)] ++ mov t0d, 64 ++ jmp .hv_w16_start ++.hv_w64: ++ lea t2d, [hq+(3<<16)] ++ mov t0d, 128 ++ jmp .hv_w16_start ++.hv_w128: ++ lea t2d, [hq+(7<<16)] ++ mov t0d, 256 ++ jmp .hv_w16_start ++ ++; int8_t subpel_filters[5][15][8] ++%assign FILTER_REGULAR (0*15 << 16) | 3*15 ++%assign FILTER_SMOOTH (1*15 << 16) | 4*15 ++%assign FILTER_SHARP (2*15 << 16) | 3*15 ++ ++%if ARCH_X86_32 ++DECLARE_REG_TMP 1, 2 ++%elif WIN64 ++DECLARE_REG_TMP 4, 5 ++%else ++DECLARE_REG_TMP 7, 8 ++%endif ++ ++%macro PUT_8TAP_FN 3 ; type, type_h, type_v ++cglobal put_8tap_%1 ++ mov t0d, FILTER_%2 ++ mov t1d, FILTER_%3 ++%ifnidn %1, sharp_smooth ; skip the jump in the last filter ++ jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) ++%endif ++%endmacro ++ ++PUT_8TAP_FN regular, REGULAR, REGULAR ++PUT_8TAP_FN regular_sharp, REGULAR, SHARP ++PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH ++PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR ++PUT_8TAP_FN smooth, SMOOTH, SMOOTH ++PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP ++PUT_8TAP_FN sharp_regular, SHARP, REGULAR ++PUT_8TAP_FN sharp, SHARP, SHARP ++PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH ++ ++%if ARCH_X86_32 ++ %define base_reg r1 ++ %define base base_reg-put_ssse3 ++ %define W32_RESTORE_DSQ mov dsq, dsm ++ %define W32_RESTORE_SSQ mov ssq, ssm ++%else ++ %define base_reg r8 ++ %define base 0 ++ %define W32_RESTORE_DSQ ++ %define W32_RESTORE_SSQ ++%endif ++ ++cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 ++%assign org_stack_offset stack_offset ++ imul mxd, mxm, 0x010101 ++ add mxd, t0d ; 8tap_h, mx, 4tap_h ++%if ARCH_X86_64 ++ imul myd, mym, 0x010101 ++ add myd, t1d ; 8tap_v, my, 4tap_v ++%else ++ imul ssd, mym, 0x010101 ++ add ssd, t1d ; 8tap_v, my, 4tap_v ++ mov srcq, srcm ++%endif ++ mov wd, wm ++ movifnidn hd, hm ++ LEA base_reg, put_ssse3 ++ test mxd, 0xf00 ++ jnz .h ++%if ARCH_X86_32 ++ test ssd, 0xf00 ++%else ++ test myd, 0xf00 ++%endif ++ jnz .v ++ tzcnt wd, wd ++ movzx wd, word [base_reg+wq*2+table_offset(put,)] ++ add wq, base_reg ++; put_bilin mangling jump ++%assign stack_offset org_stack_offset ++%if ARCH_X86_32 ++ mov dsq, dsm ++ mov ssq, ssm ++%elif WIN64 ++ pop r8 ++%endif ++ lea r6, [ssq*3] ++ jmp wq ++.h: ++%if ARCH_X86_32 ++ test ssd, 0xf00 ++%else ++ test myd, 0xf00 ++%endif ++ jnz .hv ++ W32_RESTORE_SSQ ++ WIN64_SPILL_XMM 12 ++ cmp wd, 4 ++ jl .h_w2 ++ je .h_w4 ++ tzcnt wd, wd ++%if ARCH_X86_64 ++ mova m10, [base+subpel_h_shufA] ++ mova m11, [base+subpel_h_shufB] ++ mova m9, [base+subpel_h_shufC] ++%endif ++ shr mxd, 16 ++ sub srcq, 3 ++ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] ++ movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0] ++ pshufd m5, m5, q0000 ++ movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4] ++ pshufd m6, m6, q0000 ++ mova m7, [base+pw_34] ; 2 + (8 << 2) ++ add wq, base_reg ++ jmp wq ++.h_w2: ++%if ARCH_X86_32 ++ and mxd, 0xff ++%else ++ movzx mxd, mxb ++%endif ++ dec srcq ++ mova m4, [base+subpel_h_shuf4] ++ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ++ pshufd m3, m3, q0000 ++ mova m5, [base+pw_34] ; 2 + (8 << 2) ++ W32_RESTORE_DSQ ++.h_w2_loop: ++ movq m0, [srcq+ssq*0] ++ movhps m0, [srcq+ssq*1] ++ lea srcq, [srcq+ssq*2] ++ pshufb m0, m4 ++ pmaddubsw m0, m3 ++ phaddw m0, m0 ++ paddw m0, m5 ; pw34 ++ psraw m0, 6 ++ packuswb m0, m0 ++ movd r4d, m0 ++ mov [dstq+dsq*0], r4w ++ shr r4d, 16 ++ mov [dstq+dsq*1], r4w ++ lea dstq, [dstq+dsq*2] ++ 
sub hd, 2 ++ jg .h_w2_loop ++ RET ++.h_w4: ++%if ARCH_X86_32 ++ and mxd, 0xff ++%else ++ movzx mxd, mxb ++%endif ++ dec srcq ++ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ++ pshufd m3, m3, q0000 ++ mova m5, [base+pw_34] ; 2 + (8 << 2) ++ mova m6, [base+subpel_h_shufA] ++ W32_RESTORE_DSQ ++.h_w4_loop: ++ movq m0, [srcq+ssq*0] ; 1 ++ movq m1, [srcq+ssq*1] ; 2 ++ lea srcq, [srcq+ssq*2] ++ pshufb m0, m6 ; subpel_h_shufA ++ pshufb m1, m6 ; subpel_h_shufA ++ pmaddubsw m0, m3 ; subpel_filters ++ pmaddubsw m1, m3 ; subpel_filters ++ phaddw m0, m1 ++ paddw m0, m5 ; pw34 ++ psraw m0, 6 ++ packuswb m0, m0 ++ movd [dstq+dsq*0], m0 ++ psrlq m0, 32 ++ movd [dstq+dsq*1], m0 ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ jg .h_w4_loop ++ RET ++ ; ++%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] ++ %if ARCH_X86_32 ++ pshufb %2, %1, [base+subpel_h_shufB] ++ pshufb %3, %1, [base+subpel_h_shufC] ++ pshufb %1, [base+subpel_h_shufA] ++ %else ++ pshufb %2, %1, m11; subpel_h_shufB ++ pshufb %3, %1, m9 ; subpel_h_shufC ++ pshufb %1, m10 ; subpel_h_shufA ++ %endif ++ pmaddubsw %4, %2, m5 ; subpel +0 B0 ++ pmaddubsw %2, m6 ; subpel +4 B4 ++ pmaddubsw %3, m6 ; C4 ++ pmaddubsw %1, m5 ; A0 ++ paddw %3, %4 ; C4+B0 ++ paddw %1, %2 ; A0+B4 ++ phaddw %1, %3 ++ paddw %1, m7 ; pw34 ++ psraw %1, 6 ++%endmacro ++ ; ++.h_w8: ++ movu m0, [srcq+ssq*0] ++ movu m1, [srcq+ssq*1] ++ PUT_8TAP_H m0, m2, m3, m4 ++ lea srcq, [srcq+ssq*2] ++ PUT_8TAP_H m1, m2, m3, m4 ++ packuswb m0, m1 ++%if ARCH_X86_32 ++ movq [dstq ], m0 ++ add dstq, dsm ++ movhps [dstq ], m0 ++ add dstq, dsm ++%else ++ movq [dstq+dsq*0], m0 ++ movhps [dstq+dsq*1], m0 ++ lea dstq, [dstq+dsq*2] ++%endif ++ sub hd, 2 ++ jg .h_w8 ++ RET ++.h_w16: ++ xor r6d, r6d ++ jmp .h_start ++.h_w32: ++ mov r6, -16*1 ++ jmp .h_start ++.h_w64: ++ mov r6, -16*3 ++ jmp .h_start ++.h_w128: ++ mov r6, -16*7 ++.h_start: ++ sub srcq, r6 ++ sub dstq, r6 ++ mov r4, r6 ++.h_loop: ++ movu m0, [srcq+r6+8*0] ++ movu m1, [srcq+r6+8*1] ++ PUT_8TAP_H m0, m2, m3, m4 ++ PUT_8TAP_H m1, m2, m3, m4 ++ packuswb m0, m1 ++ mova [dstq+r6], m0 ++ add r6, mmsize ++ jle .h_loop ++ add srcq, ssq ++%if ARCH_X86_32 ++ add dstq, dsm ++%else ++ add dstq, dsq ++%endif ++ mov r6, r4 ++ dec hd ++ jg .h_loop ++ RET ++.v: ++%if ARCH_X86_32 ++ movzx mxd, ssb ++ shr ssd, 16 ++ cmp hd, 4 ++ cmovle ssd, mxd ++ lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3] ++%else ++ %assign stack_offset org_stack_offset ++ WIN64_SPILL_XMM 16 ++ movzx mxd, myb ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ lea myq, [base_reg+myq*8+subpel_filters-put_ssse3] ++%endif ++ tzcnt r6d, wd ++ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] ++ mova m7, [base+pw_512] ++ psrlw m2, m7, 1 ; 0x0100 ++ add r6, base_reg ++%if ARCH_X86_32 ++ %define subpel0 [rsp+mmsize*0] ++ %define subpel1 [rsp+mmsize*1] ++ %define subpel2 [rsp+mmsize*2] ++ %define subpel3 [rsp+mmsize*3] ++%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed ++ ALLOC_STACK -mmsize*4 ++%assign regs_used 7 ++ movd m0, [ssq+0] ++ pshufb m0, m2 ++ mova subpel0, m0 ++ movd m0, [ssq+2] ++ pshufb m0, m2 ++ mova subpel1, m0 ++ movd m0, [ssq+4] ++ pshufb m0, m2 ++ mova subpel2, m0 ++ movd m0, [ssq+6] ++ pshufb m0, m2 ++ mova subpel3, m0 ++ mov ssq, [rstk+stack_offset+gprsize*4] ++ lea ssq, [ssq*3] ++ sub srcq, ssq ++ mov ssq, [rstk+stack_offset+gprsize*4] ++ mov dsq, [rstk+stack_offset+gprsize*2] ++%else ++ %define subpel0 m8 ++ %define subpel1 m9 ++ %define subpel2 m10 ++ %define subpel3 m11 ++ movd subpel0, [myq+0] ++ pshufb subpel0, m2 ++ movd subpel1, [myq+2] ++ 
pshufb subpel1, m2 ++ movd subpel2, [myq+4] ++ pshufb subpel2, m2 ++ movd subpel3, [myq+6] ++ pshufb subpel3, m2 ++ lea ss3q, [ssq*3] ++ sub srcq, ss3q ++%endif ++ jmp r6 ++.v_w2: ++ movd m2, [srcq+ssq*0] ; 0 ++ pinsrw m2, [srcq+ssq*1], 2 ; 0 1 ++ pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2 ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++ pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3 ++ add srcq, ssq ++%else ++ pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3 ++ lea srcq, [srcq+ssq*4] ++%endif ++ movd m3, [srcq+ssq*0] ; 4 ++ movd m1, [srcq+ssq*1] ; 5 ++ movd m0, [srcq+ssq*2] ; 6 ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++%else ++ add srcq, ss3q ++%endif ++ punpckldq m3, m1 ; 4 5 _ _ ++ punpckldq m1, m0 ; 5 6 _ _ ++ palignr m4, m3, m2, 4 ; 1 2 3 4 ++ punpcklbw m3, m1 ; 45 56 ++ punpcklbw m1, m2, m4 ; 01 12 ++ punpckhbw m2, m4 ; 23 34 ++.v_w2_loop: ++ pmaddubsw m5, m1, subpel0 ; a0 b0 ++ mova m1, m2 ++ pmaddubsw m2, subpel1 ; a1 b1 ++ paddw m5, m2 ++ mova m2, m3 ++ pmaddubsw m3, subpel2 ; a2 b2 ++ paddw m5, m3 ++ movd m4, [srcq+ssq*0] ; 7 ++ punpckldq m3, m0, m4 ; 6 7 _ _ ++ movd m0, [srcq+ssq*1] ++ lea srcq, [srcq+ssq*2] ++ punpckldq m4, m0 ; 7 8 _ _ ++ punpcklbw m3, m4 ; 67 78 ++ pmaddubsw m4, m3, subpel3 ; a3 b3 ++ paddw m5, m4 ++ pmulhrsw m5, m7 ++ packuswb m5, m5 ++ pshuflw m5, m5, q2020 ++ movd r6d, m5 ++ mov [dstq+dsq*0], r6w ++ shr r6d, 16 ++ mov [dstq+dsq*1], r6w ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ jg .v_w2_loop ++ RET ++.v_w4: ++%if ARCH_X86_32 ++.v_w8: ++.v_w16: ++.v_w32: ++.v_w64: ++.v_w128: ++%endif ; ARCH_X86_32 ++ lea r6d, [wq - 4] ; horizontal loop ++ mov r4, dstq ++%if ARCH_X86_32 ++%if STACK_ALIGNMENT < mmsize ++ %define srcm [rsp+mmsize*4+gprsize] ++%endif ++ mov srcm, srcq ++%else ++ mov r7, srcq ++%endif ++ shl r6d, (16 - 2) ; (wq / 4) << 16 ++ mov r6w, hw ++.v_w4_loop0: ++ movd m2, [srcq+ssq*0] ; 0 ++ movhps m2, [srcq+ssq*2] ; 0 _ 2 ++ movd m3, [srcq+ssq*1] ; 1 ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++ movhps m3, [srcq+ssq*0] ; 1 _ 3 ++ lea srcq, [srcq+ssq*1] ++%else ++ movhps m3, [srcq+ss3q ] ; 1 _ 3 ++ lea srcq, [srcq+ssq*4] ++%endif ++ pshufd m2, m2, q2020 ; 0 2 0 2 ++ pshufd m3, m3, q2020 ; 1 3 1 3 ++ punpckldq m2, m3 ; 0 1 2 3 ++ movd m3, [srcq+ssq*0] ; 4 ++ movd m1, [srcq+ssq*1] ; 5 ++ movd m0, [srcq+ssq*2] ; 6 ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++%else ++ add srcq, ss3q ++%endif ++ punpckldq m3, m1 ; 4 5 _ _ ++ punpckldq m1, m0 ; 5 6 _ _ ++ palignr m4, m3, m2, 4 ; 1 2 3 4 ++ punpcklbw m3, m1 ; 45 56 ++ punpcklbw m1, m2, m4 ; 01 12 ++ punpckhbw m2, m4 ; 23 34 ++.v_w4_loop: ++ pmaddubsw m5, m1, subpel0 ; a0 b0 ++ mova m1, m2 ++ pmaddubsw m2, subpel1 ; a1 b1 ++ paddw m5, m2 ++ mova m2, m3 ++ pmaddubsw m3, subpel2 ; a2 b2 ++ paddw m5, m3 ++ movd m4, [srcq+ssq*0] ++ punpckldq m3, m0, m4 ; 6 7 _ _ ++ movd m0, [srcq+ssq*1] ++ lea srcq, [srcq+ssq*2] ++ punpckldq m4, m0 ; 7 8 _ _ ++ punpcklbw m3, m4 ; 67 78 ++ pmaddubsw m4, m3, subpel3 ; a3 b3 ++ paddw m5, m4 ++ pmulhrsw m5, m7 ++ packuswb m5, m5 ++ movd [dstq+dsq*0], m5 ++ pshufd m5, m5, q0101 ++ movd [dstq+dsq*1], m5 ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ jg .v_w4_loop ++ mov hw, r6w ; reset vertical loop ++ add r4, 4 ++ mov dstq, r4 ++%if ARCH_X86_32 ++ mov srcq, srcm ++ add srcq, 4 ++ mov srcm, srcq ++%else ++ add r7, 4 ++ mov srcq, r7 ++%endif ++ sub r6d, 1<<16 ; horizontal-- ++ jg .v_w4_loop0 ++ RET ++%if ARCH_X86_64 ++.v_w8: ++.v_w16: ++.v_w32: ++.v_w64: ++.v_w128: ++ lea r6d, [wq - 8] ; horizontal loop ++ mov r4, dstq ++ mov r7, srcq ++ shl r6d, 8 - 3; 
(wq / 8) << 8 ++ mov r6b, hb ++.v_w8_loop0: ++ movq m4, [srcq+ssq*0] ; 0 ++ movq m5, [srcq+ssq*1] ; 1 ++ lea srcq, [srcq+ssq*2] ++ movq m6, [srcq+ssq*0] ; 2 ++ movq m0, [srcq+ssq*1] ; 3 ++ lea srcq, [srcq+ssq*2] ++ movq m1, [srcq+ssq*0] ; 4 ++ movq m2, [srcq+ssq*1] ; 5 ++ lea srcq, [srcq+ssq*2] ; ++ movq m3, [srcq+ssq*0] ; 6 ++ shufpd m4, m0, 0x0c ++ shufpd m5, m1, 0x0c ++ punpcklbw m1, m4, m5 ; 01 ++ punpckhbw m4, m5 ; 34 ++ shufpd m6, m2, 0x0c ++ punpcklbw m2, m5, m6 ; 12 ++ punpckhbw m5, m6 ; 45 ++ shufpd m0, m3, 0x0c ++ punpcklbw m3, m6, m0 ; 23 ++ punpckhbw m6, m0 ; 56 ++.v_w8_loop: ++ movq m12, [srcq+ssq*1] ; 8 ++ lea srcq, [srcq+ssq*2] ++ movq m13, [srcq+ssq*0] ; 9 ++ pmaddubsw m14, m1, subpel0 ; a0 ++ pmaddubsw m15, m2, subpel0 ; b0 ++ mova m1, m3 ++ mova m2, m4 ++ pmaddubsw m3, subpel1 ; a1 ++ pmaddubsw m4, subpel1 ; b1 ++ paddw m14, m3 ++ paddw m15, m4 ++ mova m3, m5 ++ mova m4, m6 ++ pmaddubsw m5, subpel2 ; a2 ++ pmaddubsw m6, subpel2 ; b2 ++ paddw m14, m5 ++ paddw m15, m6 ++ shufpd m6, m0, m12, 0x0d ++ shufpd m0, m12, m13, 0x0c ++ punpcklbw m5, m6, m0 ; 67 ++ punpckhbw m6, m0 ; 78 ++ pmaddubsw m12, m5, subpel3 ; a3 ++ pmaddubsw m13, m6, subpel3 ; b3 ++ paddw m14, m12 ++ paddw m15, m13 ++ pmulhrsw m14, m7 ++ pmulhrsw m15, m7 ++ packuswb m14, m15 ++ movq [dstq+dsq*0], xm14 ++ movhps [dstq+dsq*1], xm14 ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ jg .v_w8_loop ++ movzx hd, r6b ; reset vertical loop ++ add r4, 8 ++ add r7, 8 ++ mov dstq, r4 ++ mov srcq, r7 ++ sub r6d, 1<<8 ; horizontal-- ++ jg .v_w8_loop0 ++ RET ++%endif ;ARCH_X86_64 ++%undef subpel0 ++%undef subpel1 ++%undef subpel2 ++%undef subpel3 ++.hv: ++ %assign stack_offset org_stack_offset ++ cmp wd, 4 ++ jg .hv_w8 ++ and mxd, 0xff ++ dec srcq ++ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ++%if ARCH_X86_32 ++ movzx mxd, ssb ++ shr ssd, 16 ++ cmp hd, 4 ++ cmovle ssd, mxd ++ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] ++ W32_RESTORE_SSQ ++ lea r6, [ssq*3] ++ sub srcq, r6 ++ %define base_reg r6 ++ mov r6, r1; use as new base ++ %assign regs_used 2 ++ ALLOC_STACK -mmsize*14 ++ %assign regs_used 7 ++ mov dsq, [rstk+stack_offset+gprsize*2] ++ %define subpelv0 [rsp+mmsize*0] ++ %define subpelv1 [rsp+mmsize*1] ++ %define subpelv2 [rsp+mmsize*2] ++ %define subpelv3 [rsp+mmsize*3] ++ punpcklqdq m0, m0 ++ punpcklbw m0, m0 ++ psraw m0, 8 ; sign-extend ++ pshufd m6, m0, q0000 ++ mova subpelv0, m6 ++ pshufd m6, m0, q1111 ++ mova subpelv1, m6 ++ pshufd m6, m0, q2222 ++ mova subpelv2, m6 ++ pshufd m6, m0, q3333 ++ mova subpelv3, m6 ++%else ++ movzx mxd, myb ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] ++ ALLOC_STACK mmsize*14, 14 ++ lea ss3q, [ssq*3] ++ sub srcq, ss3q ++ %define subpelv0 m10 ++ %define subpelv1 m11 ++ %define subpelv2 m12 ++ %define subpelv3 m13 ++ punpcklqdq m0, m0 ++ punpcklbw m0, m0 ++ psraw m0, 8 ; sign-extend ++ mova m8, [base+pw_8192] ++ mova m9, [base+pd_512] ++ pshufd m10, m0, q0000 ++ pshufd m11, m0, q1111 ++ pshufd m12, m0, q2222 ++ pshufd m13, m0, q3333 ++%endif ++ pshufd m7, m1, q0000 ++ cmp wd, 4 ++ je .hv_w4 ++.hv_w2: ++ mova m6, [base+subpel_h_shuf4] ++ ; ++ movq m2, [srcq+ssq*0] ; 0 ++ movhps m2, [srcq+ssq*1] ; 0 _ 1 ++ movq m0, [srcq+ssq*2] ; 2 ++%if ARCH_X86_32 ++ %define w8192reg [base+pw_8192] ++ %define d512reg [base+pd_512] ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++ movhps m0, [srcq+ssq*0] ; 2 _ 3 ++ lea srcq, [srcq+ssq*1] ++%else ++ %define w8192reg m8 ++ %define d512reg m9 ++ movhps m0, [srcq+ss3q ] ; 2 _ 3 ++ lea srcq, 
[srcq+ssq*4] ++%endif ++ pshufb m2, m6 ; 0 ~ 1 ~ ++ pshufb m0, m6 ; 2 ~ 3 ~ ++ pmaddubsw m2, m7 ; subpel_filters ++ pmaddubsw m0, m7 ; subpel_filters ++ phaddw m2, m0 ; 0 1 2 3 ++ pmulhrsw m2, w8192reg ++ ; ++ movq m3, [srcq+ssq*0] ; 4 ++ movhps m3, [srcq+ssq*1] ; 4 _ 5 ++ movq m0, [srcq+ssq*2] ; 6 ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++%else ++ add srcq, ss3q ++%endif ++ pshufb m3, m6 ; 4 ~ 5 ~ ++ pshufb m0, m6 ; 6 ~ ++ pmaddubsw m3, m7 ; subpel_filters ++ pmaddubsw m0, m7 ; subpel_filters ++ phaddw m3, m0 ; 4 5 6 _ ++ pmulhrsw m3, w8192reg ++ ; ++ palignr m4, m3, m2, 4; V 1 2 3 4 ++ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 ++ punpckhwd m2, m4 ; V 23 34 2 3 3 4 ++ pshufd m0, m3, q2121; V 5 6 5 6 ++ punpcklwd m3, m0 ; V 45 56 4 5 5 6 ++.hv_w2_loop: ++ pmaddwd m5, m1, subpelv0; V a0 b0 ++ mova m1, m2 ; V ++ pmaddwd m2, subpelv1 ; V a1 b1 ++ paddd m5, m2 ; V ++ mova m2, m3 ; V ++ pmaddwd m3, subpelv2 ; a2 b2 ++ paddd m5, m3 ; V ++ movq m4, [srcq+ssq*0] ; V 7 ++ movhps m4, [srcq+ssq*1] ; V 7 8 ++ lea srcq, [srcq+ssq*2] ; V ++ pshufb m4, m6 ++ pmaddubsw m4, m7 ++ phaddw m4, m4 ++ pmulhrsw m4, w8192reg ++ palignr m3, m4, m0, 12 ++ mova m0, m4 ++ punpcklwd m3, m0 ; V 67 78 ++ pmaddwd m4, m3, subpelv3 ; V a3 b3 ++ paddd m5, d512reg ++ paddd m5, m4 ++ psrad m5, 10 ++ packssdw m5, m5 ++ packuswb m5, m5 ++ movd r4d, m5 ++ mov [dstq+dsq*0], r4w ++ shr r4d, 16 ++ mov [dstq+dsq*1], r4w ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ jg .hv_w2_loop ++ RET ++%undef w8192reg ++%undef d512reg ++ ; ++.hv_w4: ++%define hv4_line_0_0 4 ++%define hv4_line_0_1 5 ++%define hv4_line_0_2 6 ++%define hv4_line_0_3 7 ++%define hv4_line_0_4 8 ++%define hv4_line_0_5 9 ++%define hv4_line_1_0 10 ++%define hv4_line_1_1 11 ++%define hv4_line_1_2 12 ++%define hv4_line_1_3 13 ++ ; ++%macro SAVELINE_W4 3 ++ mova [rsp+mmsize*hv4_line_%3_%2], %1 ++%endmacro ++%macro RESTORELINE_W4 3 ++ mova %1, [rsp+mmsize*hv4_line_%3_%2] ++%endmacro ++ ; ++%if ARCH_X86_32 ++ %define w8192reg [base+pw_8192] ++ %define d512reg [base+pd_512] ++%else ++ %define w8192reg m8 ++ %define d512reg m9 ++%endif ++ ; lower shuffle 0 1 2 3 4 ++ mova m6, [base+subpel_h_shuf4] ++ movq m5, [srcq+ssq*0] ; 0 _ _ _ ++ movhps m5, [srcq+ssq*1] ; 0 _ 1 _ ++ movq m4, [srcq+ssq*2] ; 2 _ _ _ ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++ movhps m4, [srcq+ssq*0] ; 2 _ 3 _ ++ add srcq, ssq ++%else ++ movhps m4, [srcq+ss3q ] ; 2 _ 3 _ ++ lea srcq, [srcq+ssq*4] ++%endif ++ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ ++ pmaddubsw m2, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m2, m0 ;H 0 1 2 3 ++ pmulhrsw m2, w8192reg ;H pw_8192 ++ SAVELINE_W4 m2, 2, 0 ++ ; upper shuffle 2 3 4 5 6 ++ mova m6, [base+subpel_h_shuf4+16] ++ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ ++ pmaddubsw m2, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m2, m0 ;H 0 1 2 3 ++ pmulhrsw m2, w8192reg ;H pw_8192 ++ ; ++ ; lower shuffle ++ mova m6, [base+subpel_h_shuf4] ++ movq m5, [srcq+ssq*0] ; 4 _ _ _ ++ movhps m5, [srcq+ssq*1] ; 4 _ 5 _ ++ movq m4, [srcq+ssq*2] ; 6 _ _ _ ++ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ ++ pmaddubsw m3, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m3, m0 ;H 4 5 6 7 ++ pmulhrsw m3, w8192reg ;H pw_8192 ++ SAVELINE_W4 m3, 3, 0 ++ ; upper shuffle ++ mova m6, [base+subpel_h_shuf4+16] ++ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ ++ 
pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ ++ pmaddubsw m3, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m3, m0 ;H 4 5 6 7 ++ pmulhrsw m3, w8192reg ;H pw_8192 ++ ; ++%if ARCH_X86_32 ++ lea srcq, [srcq+ssq*2] ++ add srcq, ssq ++%else ++ add srcq, ss3q ++%endif ++ ;process high ++ palignr m4, m3, m2, 4;V 1 2 3 4 ++ punpcklwd m1, m2, m4 ; V 01 12 ++ punpckhwd m2, m4 ; V 23 34 ++ pshufd m0, m3, q2121;V 5 6 5 6 ++ punpcklwd m3, m0 ; V 45 56 ++ SAVELINE_W4 m0, 0, 1 ++ SAVELINE_W4 m1, 1, 1 ++ SAVELINE_W4 m2, 2, 1 ++ SAVELINE_W4 m3, 3, 1 ++ ;process low ++ RESTORELINE_W4 m2, 2, 0 ++ RESTORELINE_W4 m3, 3, 0 ++ palignr m4, m3, m2, 4;V 1 2 3 4 ++ punpcklwd m1, m2, m4 ; V 01 12 ++ punpckhwd m2, m4 ; V 23 34 ++ pshufd m0, m3, q2121;V 5 6 5 6 ++ punpcklwd m3, m0 ; V 45 56 ++.hv_w4_loop: ++ ;process low ++ pmaddwd m5, m1, subpelv0 ; V a0 b0 ++ mova m1, m2 ++ pmaddwd m2, subpelv1; V a1 b1 ++ paddd m5, m2 ++ mova m2, m3 ++ pmaddwd m3, subpelv2; V a2 b2 ++ paddd m5, m3 ++ ; ++ mova m6, [base+subpel_h_shuf4] ++ movq m4, [srcq+ssq*0] ; 7 ++ movhps m4, [srcq+ssq*1] ; 7 _ 8 _ ++ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ ++ pmaddubsw m4, m7 ;H subpel_filters ++ phaddw m4, m4 ;H 7 8 7 8 ++ pmulhrsw m4, w8192reg ;H pw_8192 ++ palignr m3, m4, m0, 12 ; 6 7 8 7 ++ mova m0, m4 ++ punpcklwd m3, m4 ; 67 78 ++ pmaddwd m4, m3, subpelv3; a3 b3 ++ paddd m5, d512reg ; pd_512 ++ paddd m5, m4 ++ psrad m5, 10 ++ SAVELINE_W4 m0, 0, 0 ++ SAVELINE_W4 m1, 1, 0 ++ SAVELINE_W4 m2, 2, 0 ++ SAVELINE_W4 m3, 3, 0 ++ SAVELINE_W4 m5, 5, 0 ++ ;process high ++ RESTORELINE_W4 m0, 0, 1 ++ RESTORELINE_W4 m1, 1, 1 ++ RESTORELINE_W4 m2, 2, 1 ++ RESTORELINE_W4 m3, 3, 1 ++ pmaddwd m5, m1, subpelv0; V a0 b0 ++ mova m1, m2 ++ pmaddwd m2, subpelv1; V a1 b1 ++ paddd m5, m2 ++ mova m2, m3 ++ pmaddwd m3, subpelv2; V a2 b2 ++ paddd m5, m3 ++ ; ++ mova m6, [base+subpel_h_shuf4+16] ++ movq m4, [srcq+ssq*0] ; 7 ++ movhps m4, [srcq+ssq*1] ; 7 _ 8 _ ++ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ ++ pmaddubsw m4, m7 ;H subpel_filters ++ phaddw m4, m4 ;H 7 8 7 8 ++ pmulhrsw m4, w8192reg ;H pw_8192 ++ palignr m3, m4, m0, 12 ; 6 7 8 7 ++ mova m0, m4 ++ punpcklwd m3, m4 ; 67 78 ++ pmaddwd m4, m3, subpelv3; a3 b3 ++ paddd m5, d512reg ; pd_512 ++ paddd m5, m4 ++ psrad m4, m5, 10 ++ ; ++ RESTORELINE_W4 m5, 5, 0 ++ packssdw m5, m4 ; d -> w ++ packuswb m5, m5 ; w -> b ++ pshuflw m5, m5, q3120 ++ lea srcq, [srcq+ssq*2] ++ movd [dstq+dsq*0], m5 ++ psrlq m5, 32 ++ movd [dstq+dsq*1], m5 ++ lea dstq, [dstq+dsq*2] ++ sub hd, 2 ++ SAVELINE_W4 m0, 0, 1 ++ SAVELINE_W4 m1, 1, 1 ++ SAVELINE_W4 m2, 2, 1 ++ SAVELINE_W4 m3, 3, 1 ++ RESTORELINE_W4 m0, 0, 0 ++ RESTORELINE_W4 m1, 1, 0 ++ RESTORELINE_W4 m2, 2, 0 ++ RESTORELINE_W4 m3, 3, 0 ++ jg .hv_w4_loop ++ RET ++%undef subpelv0 ++%undef subpelv1 ++%undef subpelv2 ++%undef subpelv3 ++ ; ++.hv_w8: ++ %assign stack_offset org_stack_offset ++%define hv8_line_1 0 ++%define hv8_line_2 1 ++%define hv8_line_3 2 ++%define hv8_line_4 3 ++%define hv8_line_6 4 ++%macro SAVELINE_W8 2 ++ mova [rsp+hv8_line_%1*mmsize], %2 ++%endmacro ++%macro RESTORELINE_W8 2 ++ mova %2, [rsp+hv8_line_%1*mmsize] ++%endmacro ++ shr mxd, 16 ++ sub srcq, 3 ++%if ARCH_X86_32 ++ %define base_reg r1 ++ %define subpelh0 [rsp+mmsize*5] ++ %define subpelh1 [rsp+mmsize*6] ++ %define subpelv0 [rsp+mmsize*7] ++ %define subpelv1 [rsp+mmsize*8] ++ %define subpelv2 [rsp+mmsize*9] ++ %define subpelv3 [rsp+mmsize*10] ++ %define accuv0 [rsp+mmsize*11] ++ %define accuv1 [rsp+mmsize*12] ++ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] ++ movzx mxd, ssb ++ shr 
ssd, 16 ++ cmp hd, 4 ++ cmovle ssd, mxd ++ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] ++ mov ssq, ssmp ++ ALLOC_STACK -mmsize*13 ++%if STACK_ALIGNMENT < 16 ++ %define srcm [rsp+mmsize*13+gprsize*1] ++ %define dsm [rsp+mmsize*13+gprsize*2] ++ mov r6, [rstk+stack_offset+gprsize*2] ++ mov dsm, r6 ++%endif ++ pshufd m0, m1, q0000 ++ pshufd m1, m1, q1111 ++ punpcklbw m5, m5 ++ psraw m5, 8 ; sign-extend ++ pshufd m2, m5, q0000 ++ pshufd m3, m5, q1111 ++ pshufd m4, m5, q2222 ++ pshufd m5, m5, q3333 ++ mova subpelh0, m0 ++ mova subpelh1, m1 ++ mova subpelv0, m2 ++ mova subpelv1, m3 ++ mova subpelv2, m4 ++ mova subpelv3, m5 ++ lea r6, [ssq*3] ++ sub srcq, r6 ++ mov srcm, srcq ++%else ++ ALLOC_STACK mmsize*5, 16 ++ %define subpelh0 m10 ++ %define subpelh1 m11 ++ %define subpelv0 m12 ++ %define subpelv1 m13 ++ %define subpelv2 m14 ++ %define subpelv3 m15 ++ %define accuv0 m8 ++ %define accuv1 m9 ++ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] ++ movzx mxd, myb ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] ++ pshufd subpelh0, m0, q0000 ++ pshufd subpelh1, m0, q1111 ++ punpcklqdq m1, m1 ++ punpcklbw m1, m1 ++ psraw m1, 8 ; sign-extend ++ pshufd subpelv0, m1, q0000 ++ pshufd subpelv1, m1, q1111 ++ pshufd subpelv2, m1, q2222 ++ pshufd subpelv3, m1, q3333 ++ lea ss3q, [ssq*3] ++ sub srcq, ss3q ++ mov r7, srcq ++%endif ++ lea r6d, [wq-4] ++ mov r4, dstq ++ shl r6d, (16 - 2) ++ mov r6w, hw ++.hv_w8_loop0: ++ movu m4, [srcq+ssq*0] ; 0 = _ _ ++ movu m5, [srcq+ssq*1] ; 1 = _ _ ++ lea srcq, [srcq+ssq*2] ++ ; ++%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] ++ %if ARCH_X86_32 ++ pshufb %3, %1, [base+subpel_h_shufB] ++ pshufb %4, %1, [base+subpel_h_shufC] ++ pshufb %1, [base+subpel_h_shufA] ++ %else ++ pshufb %3, %1, %6 ; subpel_h_shufB ++ pshufb %4, %1, %7 ; subpel_h_shufC ++ pshufb %1, %5 ; subpel_h_shufA ++ %endif ++ pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 ++ pmaddubsw %4, subpelh1; subpel +4 B4 ++ pmaddubsw %3, subpelh1; C4 ++ pmaddubsw %1, subpelh0; A0 ++ paddw %2, %4 ; C0+B4 ++ paddw %1, %3 ; A0+C4 ++ phaddw %1, %2 ++%endmacro ++ ; ++%if ARCH_X86_64 ++ mova m7, [base+subpel_h_shufA] ++ mova m8, [base+subpel_h_shufB] ++ mova m9, [base+subpel_h_shufC] ++%endif ++ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ ++ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ ++ movu m6, [srcq+ssq*0] ; 2 = _ _ ++ movu m0, [srcq+ssq*1] ; 3 = _ _ ++ lea srcq, [srcq+ssq*2] ++ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ ++ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ ++ ; ++ mova m7, [base+pw_8192] ++ pmulhrsw m4, m7 ; H pw_8192 ++ pmulhrsw m5, m7 ; H pw_8192 ++ pmulhrsw m6, m7 ; H pw_8192 ++ pmulhrsw m0, m7 ; H pw_8192 ++ punpcklwd m1, m4, m5 ; 0 1 ~ ++ punpcklwd m2, m5, m6 ; 1 2 ~ ++ punpcklwd m3, m6, m0 ; 2 3 ~ ++ SAVELINE_W8 1, m1 ++ SAVELINE_W8 2, m2 ++ SAVELINE_W8 3, m3 ++ ; ++ mova m7, [base+subpel_h_shufA] ++ movu m4, [srcq+ssq*0] ; 4 = _ _ ++ movu m5, [srcq+ssq*1] ; 5 = _ _ ++ lea srcq, [srcq+ssq*2] ++ movu m6, [srcq+ssq*0] ; 6 = _ _ ++ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ ++ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ ++ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ ++ mova m7, [base+pw_8192] ++ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ ++ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ ++ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ ++ punpcklwd m4, m0, m1 ; 3 4 ~ ++ punpcklwd m5, m1, m2 ; 4 5 ~ ++ punpcklwd m6, m2, m3 ; 5 6 ~ ++ ; ++ SAVELINE_W8 6, m3 ++ RESTORELINE_W8 1, m1 ++ RESTORELINE_W8 2, m2 ++ RESTORELINE_W8 3, m3 ++.hv_w8_loop: ++ ; m8 accu for V a ++ ; m9 
accu for V b ++ SAVELINE_W8 1, m3 ++ SAVELINE_W8 2, m4 ++ SAVELINE_W8 3, m5 ++ SAVELINE_W8 4, m6 ++%if ARCH_X86_32 ++ pmaddwd m0, m1, subpelv0 ; a0 ++ pmaddwd m7, m2, subpelv0 ; b0 ++ pmaddwd m3, subpelv1 ; a1 ++ pmaddwd m4, subpelv1 ; b1 ++ paddd m0, m3 ++ paddd m7, m4 ++ pmaddwd m5, subpelv2 ; a2 ++ pmaddwd m6, subpelv2 ; b2 ++ paddd m0, m5 ++ paddd m7, m6 ++ mova m5, [base+pd_512] ++ paddd m0, m5 ; pd_512 ++ paddd m7, m5 ; pd_512 ++ mova accuv0, m0 ++ mova accuv1, m7 ++%else ++ pmaddwd m8, m1, subpelv0 ; a0 ++ pmaddwd m9, m2, subpelv0 ; b0 ++ pmaddwd m3, subpelv1 ; a1 ++ pmaddwd m4, subpelv1 ; b1 ++ paddd m8, m3 ++ paddd m9, m4 ++ pmaddwd m5, subpelv2 ; a2 ++ pmaddwd m6, subpelv2 ; b2 ++ paddd m8, m5 ++ paddd m9, m6 ++ mova m7, [base+pd_512] ++ paddd m8, m7 ; pd_512 ++ paddd m9, m7 ; pd_512 ++ mova m7, [base+subpel_h_shufB] ++ mova m6, [base+subpel_h_shufC] ++ mova m5, [base+subpel_h_shufA] ++%endif ++ movu m0, [srcq+ssq*1] ; 7 ++ movu m4, [srcq+ssq*2] ; 8 ++ lea srcq, [srcq+ssq*2] ++ HV_H_W8 m0, m1, m2, m3, m5, m7, m6 ++ HV_H_W8 m4, m1, m2, m3, m5, m7, m6 ++ mova m5, [base+pw_8192] ++ pmulhrsw m0, m5 ; H pw_8192 ++ pmulhrsw m4, m5 ; H pw_8192 ++ RESTORELINE_W8 6, m6 ++ punpcklwd m5, m6, m0 ; 6 7 ~ ++ punpcklwd m6, m0, m4 ; 7 8 ~ ++ pmaddwd m1, m5, subpelv3 ; a3 ++ paddd m2, m1, accuv0 ++ pmaddwd m1, m6, subpelv3 ; b3 ++ paddd m1, m1, accuv1 ; H + V ++ psrad m2, 10 ++ psrad m1, 10 ++ packssdw m2, m1 ; d -> w ++ packuswb m2, m1 ; w -> b ++ movd [dstq+dsq*0], m2 ++ psrlq m2, 32 ++%if ARCH_X86_32 ++ add dstq, dsm ++ movd [dstq+dsq*0], m2 ++ add dstq, dsm ++%else ++ movd [dstq+dsq*1], m2 ++ lea dstq, [dstq+dsq*2] ++%endif ++ sub hd, 2 ++ jle .hv_w8_outer ++ SAVELINE_W8 6, m4 ++ RESTORELINE_W8 1, m1 ++ RESTORELINE_W8 2, m2 ++ RESTORELINE_W8 3, m3 ++ RESTORELINE_W8 4, m4 ++ jmp .hv_w8_loop ++.hv_w8_outer: ++ movzx hd, r6w ++ add r4, 4 ++ mov dstq, r4 ++%if ARCH_X86_32 ++ mov srcq, srcm ++ add srcq, 4 ++ mov srcm, srcq ++%else ++ add r7, 4 ++ mov srcq, r7 ++%endif ++ sub r6d, 1<<16 ++ jg .hv_w8_loop0 ++ RET ++ ++%if ARCH_X86_32 ++DECLARE_REG_TMP 1, 2 ++%elif WIN64 ++DECLARE_REG_TMP 6, 4 ++%else ++DECLARE_REG_TMP 6, 7 ++%endif ++%macro PREP_8TAP_FN 3 ; type, type_h, type_v ++cglobal prep_8tap_%1 ++ mov t0d, FILTER_%2 ++ mov t1d, FILTER_%3 ++%ifnidn %1, sharp_smooth ; skip the jump in the last filter ++ jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) ++%endif ++%endmacro ++ ++PREP_8TAP_FN regular, REGULAR, REGULAR ++PREP_8TAP_FN regular_sharp, REGULAR, SHARP ++PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH ++PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR ++PREP_8TAP_FN smooth, SMOOTH, SMOOTH ++PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP ++PREP_8TAP_FN sharp_regular, SHARP, REGULAR ++PREP_8TAP_FN sharp, SHARP, SHARP ++PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH ++ ++%if ARCH_X86_32 ++ %define base_reg r2 ++ %define base base_reg-prep_ssse3 ++ %define W32_RESTORE_SSQ mov strideq, stridem ++%else ++ %define base_reg r7 ++ %define base 0 ++ %define W32_RESTORE_SSQ ++%endif ++ ++cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 ++%assign org_stack_offset stack_offset ++ imul mxd, mxm, 0x010101 ++ add mxd, t0d ; 8tap_h, mx, 4tap_h ++ imul myd, mym, 0x010101 ++ add myd, t1d ; 8tap_v, my, 4tap_v ++ movsxd wq, wm ++ movifnidn srcd, srcm ++ movifnidn hd, hm ++ LEA base_reg, prep_ssse3 ++ test mxd, 0xf00 ++ jnz .h ++ test myd, 0xf00 ++ jnz .v ++ tzcnt wd, wd ++ movzx wd, word [base_reg+wq*2+table_offset(prep,)] ++ add wq, base_reg ++ movifnidn strided, stridem ++ lea r6, [strideq*3] ++ %assign 
stack_offset org_stack_offset ++%if WIN64 ++ pop r8 ++ pop r7 ++%endif ++ jmp wq ++.h: ++ test myd, 0xf00 ++ jnz .hv ++ WIN64_SPILL_XMM 12 ++ cmp wd, 4 ++ je .h_w4 ++ tzcnt wd, wd ++%if ARCH_X86_64 ++ mova m10, [base+subpel_h_shufA] ++ mova m11, [base+subpel_h_shufB] ++ mova m9, [base+subpel_h_shufC] ++%endif ++ shr mxd, 16 ++ sub srcq, 3 ++ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] ++ movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0] ++ pshufd m5, m5, q0000 ++ movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4] ++ pshufd m6, m6, q0000 ++ mova m7, [base+pw_8192] ++ add wq, base_reg ++ jmp wq ++.h_w4: ++%if ARCH_X86_32 ++ and mxd, 0xff ++%else ++ movzx mxd, mxb ++%endif ++ dec srcq ++ movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] ++ pshufd m4, m4, q0000 ++ mova m6, [base+pw_8192] ++ mova m5, [base+subpel_h_shufA] ++ W32_RESTORE_SSQ ++%if ARCH_X86_64 ++ lea stride3q, [strideq*3] ++%endif ++.h_w4_loop: ++ movq m0, [srcq+strideq*0] ; 0 ++ movq m1, [srcq+strideq*1] ; 1 ++%if ARCH_X86_32 ++ lea srcq, [srcq+strideq*2] ++ movq m2, [srcq+strideq*0] ; 2 ++ movq m3, [srcq+strideq*1] ; 3 ++ lea srcq, [srcq+strideq*2] ++%else ++ movq m2, [srcq+strideq*2] ; 2 ++ movq m3, [srcq+stride3q ] ; 3 ++ lea srcq, [srcq+strideq*4] ++%endif ++ pshufb m0, m5 ; subpel_h_shufA ++ pshufb m1, m5 ++ pshufb m2, m5 ++ pshufb m3, m5 ++ pmaddubsw m0, m4 ; subpel_filters + 2 ++ pmaddubsw m1, m4 ++ pmaddubsw m2, m4 ++ pmaddubsw m3, m4 ++ phaddw m0, m1 ++ phaddw m2, m3 ++ pmulhrsw m0, m6 ; pw_8192 ++ pmulhrsw m2, m6 ; pw_8192 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m2 ++ add tmpq, 32 ++ sub hd, 4 ++ jg .h_w4_loop ++ RET ++ ; ++%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3] ++%if ARCH_X86_32 ++ pshufb %2, %1, [base+subpel_h_shufB] ++ pshufb %3, %1, [base+subpel_h_shufC] ++ pshufb %1, [base+subpel_h_shufA] ++%else ++ pshufb %2, %1, m11; subpel_h_shufB ++ pshufb %3, %1, m9 ; subpel_h_shufC ++ pshufb %1, m10 ; subpel_h_shufA ++%endif ++ pmaddubsw %4, %2, m5 ; subpel +0 B0 ++ pmaddubsw %2, m6 ; subpel +4 B4 ++ pmaddubsw %3, m6 ; subpel +4 C4 ++ pmaddubsw %1, m5 ; subpel +0 A0 ++ paddw %3, %4 ++ paddw %1, %2 ++ phaddw %1, %3 ++ pmulhrsw %1, m7 ; 8192 ++%endmacro ++ ; ++.h_w8: ++%if ARCH_X86_32 ++ mov r3, r2 ++ %define base_reg r3 ++ W32_RESTORE_SSQ ++%endif ++.h_w8_loop: ++ movu m0, [srcq+strideq*0] ++ movu m1, [srcq+strideq*1] ++ lea srcq, [srcq+strideq*2] ++ PREP_8TAP_H m0, m2, m3, m4 ++ PREP_8TAP_H m1, m2, m3, m4 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ add tmpq, 32 ++ sub hd, 2 ++ jg .h_w8_loop ++ RET ++.h_w16: ++ xor r6d, r6d ++ jmp .h_start ++.h_w32: ++ mov r6, -16*1 ++ jmp .h_start ++.h_w64: ++ mov r6, -16*3 ++ jmp .h_start ++.h_w128: ++ mov r6, -16*7 ++.h_start: ++%if ARCH_X86_32 ++ mov r3, r2 ++ %define base_reg r3 ++%endif ++ sub srcq, r6 ++ mov r5, r6 ++ W32_RESTORE_SSQ ++.h_loop: ++ movu m0, [srcq+r6+8*0] ++ movu m1, [srcq+r6+8*1] ++ PREP_8TAP_H m0, m2, m3, m4 ++ PREP_8TAP_H m1, m2, m3, m4 ++ mova [tmpq+16*0], m0 ++ mova [tmpq+16*1], m1 ++ add tmpq, 32 ++ add r6, 16 ++ jle .h_loop ++ add srcq, strideq ++ mov r6, r5 ++ dec hd ++ jg .h_loop ++ RET ++%if ARCH_X86_32 ++ %define base_reg r2 ++%endif ++ ++.v: ++%if ARCH_X86_32 ++ mov mxd, myd ++ and mxd, 0xff ++%else ++ %assign stack_offset org_stack_offset ++ WIN64_SPILL_XMM 16 ++ movzx mxd, myb ++%endif ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3] ++ mova m2, [base+pw_512] ++ psrlw m2, m2, 1 ; 0x0100 ++ mova m7, [base+pw_8192] ++%if ARCH_X86_32 ++ %define subpel0 [rsp+mmsize*0] 
++ %define subpel1 [rsp+mmsize*1] ++ %define subpel2 [rsp+mmsize*2] ++ %define subpel3 [rsp+mmsize*3] ++%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed ++ ALLOC_STACK -mmsize*4 ++%assign regs_used 7 ++ movd m0, [myq+0] ++ pshufb m0, m2 ++ mova subpel0, m0 ++ movd m0, [myq+2] ++ pshufb m0, m2 ++ mova subpel1, m0 ++ movd m0, [myq+4] ++ pshufb m0, m2 ++ mova subpel2, m0 ++ movd m0, [myq+6] ++ pshufb m0, m2 ++ mova subpel3, m0 ++ mov strideq, [rstk+stack_offset+gprsize*3] ++ lea strideq, [strideq*3] ++ sub [rstk+stack_offset+gprsize*2], strideq ++ mov strideq, [rstk+stack_offset+gprsize*3] ++ mov srcq, [rstk+stack_offset+gprsize*2] ++%else ++ %define subpel0 m8 ++ %define subpel1 m9 ++ %define subpel2 m10 ++ %define subpel3 m11 ++ movd subpel0, [myq+0] ++ pshufb subpel0, m2 ++ movd subpel1, [myq+2] ++ pshufb subpel1, m2 ++ movd subpel2, [myq+4] ++ pshufb subpel2, m2 ++ movd subpel3, [myq+6] ++ pshufb subpel3, m2 ++ lea stride3q, [strideq*3] ++ sub srcq, stride3q ++ cmp wd, 8 ++ jg .v_w16 ++ je .v_w8 ++%endif ++.v_w4: ++%if ARCH_X86_32 ++%if STACK_ALIGNMENT < mmsize ++ %define srcm [rsp+mmsize*4+gprsize*1] ++ %define tmpm [rsp+mmsize*4+gprsize*2] ++%endif ++ mov tmpm, tmpq ++ mov srcm, srcq ++ lea r5d, [wq - 4] ; horizontal loop ++ shl r5d, (16 - 2) ; (wq / 4) << 16 ++ mov r5w, hw ++.v_w4_loop0: ++%endif ++ movd m2, [srcq+strideq*0] ; 0 ++ movhps m2, [srcq+strideq*2] ; 0 _ 2 ++ movd m3, [srcq+strideq*1] ; 1 ++%if ARCH_X86_32 ++ lea srcq, [srcq+strideq*2] ++ movhps m3, [srcq+strideq*1] ; 1 _ 3 ++ lea srcq, [srcq+strideq*2] ++%else ++ movhps m3, [srcq+stride3q ] ; 1 _ 3 ++ lea srcq, [srcq+strideq*4] ++%endif ++ pshufd m2, m2, q2020 ; 0 2 0 2 ++ pshufd m3, m3, q2020 ; 1 3 1 3 ++ punpckldq m2, m3 ; 0 1 2 3 ++ movd m3, [srcq+strideq*0] ; 4 ++ movd m1, [srcq+strideq*1] ; 5 ++ movd m0, [srcq+strideq*2] ; 6 ++%if ARCH_X86_32 ++ lea srcq, [srcq+strideq*2] ++ add srcq, strideq ++%else ++ add srcq, stride3q ++%endif ++ punpckldq m3, m1 ; 4 5 _ _ ++ punpckldq m1, m0 ; 5 6 _ _ ++ palignr m4, m3, m2, 4 ; 1 2 3 4 ++ punpcklbw m3, m1 ; 45 56 ++ punpcklbw m1, m2, m4 ; 01 12 ++ punpckhbw m2, m4 ; 23 34 ++.v_w4_loop: ++ pmaddubsw m5, m1, subpel0 ; a0 b0 ++ mova m1, m2 ++ pmaddubsw m2, subpel1 ; a1 b1 ++ paddw m5, m2 ++ mova m2, m3 ++ pmaddubsw m3, subpel2 ; a2 b2 ++ paddw m5, m3 ++ movd m4, [srcq+strideq*0] ++ punpckldq m3, m0, m4 ; 6 7 _ _ ++ movd m0, [srcq+strideq*1] ++ lea srcq, [srcq+strideq*2] ++ punpckldq m4, m0 ; 7 8 _ _ ++ punpcklbw m3, m4 ; 67 78 ++ pmaddubsw m4, m3, subpel3 ; a3 b3 ++ paddw m5, m4 ++ pmulhrsw m5, m7 ++ movq [tmpq+wq*0], m5 ++ movhps [tmpq+wq*2], m5 ++ lea tmpq, [tmpq+wq*4] ++ sub hd, 2 ++ jg .v_w4_loop ++%if ARCH_X86_32 ++ mov hw, r5w ; reset vertical loop ++ mov tmpq, tmpm ++ mov srcq, srcm ++ add tmpq, 8 ++ add srcq, 4 ++ mov tmpm, tmpq ++ mov srcm, srcq ++ sub r5d, 1<<16 ; horizontal-- ++ jg .v_w4_loop0 ++%endif ++ RET ++ ++%if ARCH_X86_64 ++.v_w8: ++.v_w16: ++ lea r5d, [wq - 8] ; horizontal loop ++ mov r8, tmpq ++ mov r6, srcq ++ shl r5d, 8 - 3; (wq / 8) << 8 ++ mov r5b, hb ++.v_w8_loop0: ++ movq m4, [srcq+strideq*0] ; 0 ++ movq m5, [srcq+strideq*1] ; 1 ++ lea srcq, [srcq+strideq*2] ++ movq m6, [srcq+strideq*0] ; 2 ++ movq m0, [srcq+strideq*1] ; 3 ++ lea srcq, [srcq+strideq*2] ++ movq m1, [srcq+strideq*0] ; 4 ++ movq m2, [srcq+strideq*1] ; 5 ++ lea srcq, [srcq+strideq*2] ; ++ movq m3, [srcq+strideq*0] ; 6 ++ shufpd m4, m0, 0x0c ++ shufpd m5, m1, 0x0c ++ punpcklbw m1, m4, m5 ; 01 ++ punpckhbw m4, m5 ; 34 ++ shufpd m6, m2, 0x0c ++ punpcklbw m2, m5, m6 ; 12 ++ 
punpckhbw m5, m6 ; 45 ++ shufpd m0, m3, 0x0c ++ punpcklbw m3, m6, m0 ; 23 ++ punpckhbw m6, m0 ; 56 ++.v_w8_loop: ++ movq m12, [srcq+strideq*1] ; 8 ++ lea srcq, [srcq+strideq*2] ++ movq m13, [srcq+strideq*0] ; 9 ++ pmaddubsw m14, m1, subpel0 ; a0 ++ pmaddubsw m15, m2, subpel0 ; b0 ++ mova m1, m3 ++ mova m2, m4 ++ pmaddubsw m3, subpel1 ; a1 ++ pmaddubsw m4, subpel1 ; b1 ++ paddw m14, m3 ++ paddw m15, m4 ++ mova m3, m5 ++ mova m4, m6 ++ pmaddubsw m5, subpel2 ; a2 ++ pmaddubsw m6, subpel2 ; b2 ++ paddw m14, m5 ++ paddw m15, m6 ++ shufpd m6, m0, m12, 0x0d ++ shufpd m0, m12, m13, 0x0c ++ punpcklbw m5, m6, m0 ; 67 ++ punpckhbw m6, m0 ; 78 ++ pmaddubsw m12, m5, subpel3 ; a3 ++ pmaddubsw m13, m6, subpel3 ; b3 ++ paddw m14, m12 ++ paddw m15, m13 ++ pmulhrsw m14, m7 ++ pmulhrsw m15, m7 ++ movu [tmpq+wq*0], xm14 ++ movu [tmpq+wq*2], xm15 ++ lea tmpq, [tmpq+wq*4] ++ sub hd, 2 ++ jg .v_w8_loop ++ movzx hd, r5b ; reset vertical loop ++ add r8, 16 ++ add r6, 8 ++ mov tmpq, r8 ++ mov srcq, r6 ++ sub r5d, 1<<8 ; horizontal-- ++ jg .v_w8_loop0 ++ RET ++%endif ;ARCH_X86_64 ++%undef subpel0 ++%undef subpel1 ++%undef subpel2 ++%undef subpel3 ++ ++.hv: ++ %assign stack_offset org_stack_offset ++ cmp wd, 4 ++ jg .hv_w8 ++ and mxd, 0xff ++ movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] ++%if ARCH_X86_32 ++ mov mxd, myd ++ and mxd, 0xff ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] ++ mov r5, r2; use as new base ++ %define base_reg r5 ++ %assign regs_used 2 ++ ALLOC_STACK -mmsize*14 ++ %assign regs_used 7 ++ mov strideq, [rstk+stack_offset+gprsize*3] ++ lea strideq, [strideq*3 + 1] ++ sub [rstk+stack_offset+gprsize*2], strideq ++ mov strideq, [rstk+stack_offset+gprsize*3] ++ mov srcq, [rstk+stack_offset+gprsize*2] ++ %define subpelv0 [rsp+mmsize*0] ++ %define subpelv1 [rsp+mmsize*1] ++ %define subpelv2 [rsp+mmsize*2] ++ %define subpelv3 [rsp+mmsize*3] ++ punpcklbw m0, m0 ++ psraw m0, 8 ; sign-extend ++ pshufd m6, m0, q0000 ++ mova subpelv0, m6 ++ pshufd m6, m0, q1111 ++ mova subpelv1, m6 ++ pshufd m6, m0, q2222 ++ mova subpelv2, m6 ++ pshufd m6, m0, q3333 ++ mova subpelv3, m6 ++%else ++ movzx mxd, myb ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] ++ ALLOC_STACK mmsize*14, 14 ++ lea stride3q, [strideq*3] ++ sub srcq, stride3q ++ dec srcq ++ %define subpelv0 m10 ++ %define subpelv1 m11 ++ %define subpelv2 m12 ++ %define subpelv3 m13 ++ punpcklbw m0, m0 ++ psraw m0, 8 ; sign-extend ++ mova m8, [base+pw_8192] ++ mova m9, [base+pd_32] ++ pshufd m10, m0, q0000 ++ pshufd m11, m0, q1111 ++ pshufd m12, m0, q2222 ++ pshufd m13, m0, q3333 ++%endif ++ pshufd m7, m1, q0000 ++.hv_w4: ++%define hv4_line_0_0 4 ++%define hv4_line_0_1 5 ++%define hv4_line_0_2 6 ++%define hv4_line_0_3 7 ++%define hv4_line_0_4 8 ++%define hv4_line_0_5 9 ++%define hv4_line_1_0 10 ++%define hv4_line_1_1 11 ++%define hv4_line_1_2 12 ++%define hv4_line_1_3 13 ++ ; ++ ; ++%if ARCH_X86_32 ++ %define w8192reg [base+pw_8192] ++ %define d32reg [base+pd_32] ++%else ++ %define w8192reg m8 ++ %define d32reg m9 ++%endif ++ ; lower shuffle 0 1 2 3 4 ++ mova m6, [base+subpel_h_shuf4] ++ movq m5, [srcq+strideq*0] ; 0 _ _ _ ++ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ ++ movq m4, [srcq+strideq*2] ; 2 _ _ _ ++%if ARCH_X86_32 ++ lea srcq, [srcq+strideq*2] ++ add srcq, strideq ++ movhps m4, [srcq+strideq*0] ; 2 _ 3 _ ++ add srcq, strideq ++%else ++ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ ++ lea srcq, [srcq+strideq*4] ++%endif ++ pshufb m2, m5, m6 ;H 
subpel_h_shuf4 0 ~ 1 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ ++ pmaddubsw m2, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m2, m0 ;H 0 1 2 3 ++ pmulhrsw m2, w8192reg ;H pw_8192 ++ SAVELINE_W4 m2, 2, 0 ++ ; upper shuffle 2 3 4 5 6 ++ mova m6, [base+subpel_h_shuf4+16] ++ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ ++ pmaddubsw m2, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m2, m0 ;H 0 1 2 3 ++ pmulhrsw m2, w8192reg ;H pw_8192 ++ ; ++ ; lower shuffle ++ mova m6, [base+subpel_h_shuf4] ++ movq m5, [srcq+strideq*0] ; 4 _ _ _ ++ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ ++ movq m4, [srcq+strideq*2] ; 6 _ _ _ ++ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ ++ pmaddubsw m3, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m3, m0 ;H 4 5 6 7 ++ pmulhrsw m3, w8192reg ;H pw_8192 ++ SAVELINE_W4 m3, 3, 0 ++ ; upper shuffle ++ mova m6, [base+subpel_h_shuf4+16] ++ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ ++ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ ++ pmaddubsw m3, m7 ;H subpel_filters ++ pmaddubsw m0, m7 ;H subpel_filters ++ phaddw m3, m0 ;H 4 5 6 7 ++ pmulhrsw m3, w8192reg ;H pw_8192 ++ ; ++%if ARCH_X86_32 ++ lea srcq, [srcq+strideq*2] ++ add srcq, strideq ++%else ++ add srcq, stride3q ++%endif ++ ;process high ++ palignr m4, m3, m2, 4;V 1 2 3 4 ++ punpcklwd m1, m2, m4 ; V 01 12 ++ punpckhwd m2, m4 ; V 23 34 ++ pshufd m0, m3, q2121;V 5 6 5 6 ++ punpcklwd m3, m0 ; V 45 56 ++ SAVELINE_W4 m0, 0, 1 ++ SAVELINE_W4 m1, 1, 1 ++ SAVELINE_W4 m2, 2, 1 ++ SAVELINE_W4 m3, 3, 1 ++ ;process low ++ RESTORELINE_W4 m2, 2, 0 ++ RESTORELINE_W4 m3, 3, 0 ++ palignr m4, m3, m2, 4;V 1 2 3 4 ++ punpcklwd m1, m2, m4 ; V 01 12 ++ punpckhwd m2, m4 ; V 23 34 ++ pshufd m0, m3, q2121;V 5 6 5 6 ++ punpcklwd m3, m0 ; V 45 56 ++.hv_w4_loop: ++ ;process low ++ pmaddwd m5, m1, subpelv0 ; V a0 b0 ++ mova m1, m2 ++ pmaddwd m2, subpelv1; V a1 b1 ++ paddd m5, m2 ++ mova m2, m3 ++ pmaddwd m3, subpelv2; V a2 b2 ++ paddd m5, m3 ++ ; ++ mova m6, [base+subpel_h_shuf4] ++ movq m4, [srcq+strideq*0] ; 7 ++ movhps m4, [srcq+strideq*1] ; 7 _ 8 _ ++ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ ++ pmaddubsw m4, m7 ;H subpel_filters ++ phaddw m4, m4 ;H 7 8 7 8 ++ pmulhrsw m4, w8192reg ;H pw_8192 ++ palignr m3, m4, m0, 12 ; 6 7 8 7 ++ mova m0, m4 ++ punpcklwd m3, m4 ; 67 78 ++ pmaddwd m4, m3, subpelv3; a3 b3 ++ paddd m5, d32reg ; pd_32 ++ paddd m5, m4 ++ psrad m5, 6 ++ SAVELINE_W4 m0, 0, 0 ++ SAVELINE_W4 m1, 1, 0 ++ SAVELINE_W4 m2, 2, 0 ++ SAVELINE_W4 m3, 3, 0 ++ SAVELINE_W4 m5, 5, 0 ++ ;process high ++ RESTORELINE_W4 m0, 0, 1 ++ RESTORELINE_W4 m1, 1, 1 ++ RESTORELINE_W4 m2, 2, 1 ++ RESTORELINE_W4 m3, 3, 1 ++ pmaddwd m5, m1, subpelv0; V a0 b0 ++ mova m1, m2 ++ pmaddwd m2, subpelv1; V a1 b1 ++ paddd m5, m2 ++ mova m2, m3 ++ pmaddwd m3, subpelv2; V a2 b2 ++ paddd m5, m3 ++ ; ++ mova m6, [base+subpel_h_shuf4+16] ++ movq m4, [srcq+strideq*0] ; 7 ++ movhps m4, [srcq+strideq*1] ; 7 _ 8 _ ++ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ ++ pmaddubsw m4, m7 ;H subpel_filters ++ phaddw m4, m4 ;H 7 8 7 8 ++ pmulhrsw m4, w8192reg ;H pw_8192 ++ palignr m3, m4, m0, 12 ; 6 7 8 7 ++ mova m0, m4 ++ punpcklwd m3, m4 ; 67 78 ++ pmaddwd m4, m3, subpelv3; a3 b3 ++ paddd m5, d32reg ; pd_32 ++ paddd m5, m4 ++ psrad m4, m5, 6 ++ ; ++ RESTORELINE_W4 m5, 5, 0 ++ packssdw m5, m4 ++ pshufd m5, m5, q3120 ++ movu [tmpq], m5 ++ lea srcq, [srcq+strideq*2] ++ add tmpq, 16 ++ sub hd, 2 ++ SAVELINE_W4 m0, 0, 1 ++ SAVELINE_W4 m1, 
1, 1 ++ SAVELINE_W4 m2, 2, 1 ++ SAVELINE_W4 m3, 3, 1 ++ RESTORELINE_W4 m0, 0, 0 ++ RESTORELINE_W4 m1, 1, 0 ++ RESTORELINE_W4 m2, 2, 0 ++ RESTORELINE_W4 m3, 3, 0 ++ jg .hv_w4_loop ++ RET ++%undef subpelv0 ++%undef subpelv1 ++%undef subpelv2 ++%undef subpelv3 ++ ; ++ ++ ++.hv_w8: ++ %assign stack_offset org_stack_offset ++%define hv8_line_1 0 ++%define hv8_line_2 1 ++%define hv8_line_3 2 ++%define hv8_line_4 3 ++%define hv8_line_6 4 ++ shr mxd, 16 ++%if ARCH_X86_32 ++ %define base_reg r2 ++ %define subpelh0 [rsp+mmsize*5] ++ %define subpelh1 [rsp+mmsize*6] ++ %define subpelv0 [rsp+mmsize*7] ++ %define subpelv1 [rsp+mmsize*8] ++ %define subpelv2 [rsp+mmsize*9] ++ %define subpelv3 [rsp+mmsize*10] ++ %define accuv0 [rsp+mmsize*11] ++ %define accuv1 [rsp+mmsize*12] ++ movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] ++ movzx mxd, myw ++ and mxd, 0xff ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] ++ ALLOC_STACK -mmsize*13 ++%if STACK_ALIGNMENT < mmsize ++ mov rstk, r2m ++ %define tmpm [rsp+mmsize*13+gprsize*1] ++ %define srcm [rsp+mmsize*13+gprsize*2] ++ %define stridem [rsp+mmsize*13+gprsize*3] ++ mov stridem, rstk ++%endif ++ mov r6, r2 ++%define base_reg r6 ++ pshufd m0, m1, q0000 ++ pshufd m1, m1, q1111 ++ punpcklbw m5, m5 ++ psraw m5, 8 ; sign-extend ++ pshufd m2, m5, q0000 ++ pshufd m3, m5, q1111 ++ pshufd m4, m5, q2222 ++ pshufd m5, m5, q3333 ++ mova subpelh0, m0 ++ mova subpelh1, m1 ++ mova subpelv0, m2 ++ mova subpelv1, m3 ++ mova subpelv2, m4 ++ mova subpelv3, m5 ++ W32_RESTORE_SSQ ++ lea strided, [strided*3] ++ sub srcd, strided ++ sub srcd, 3 ++ mov srcm, srcd ++ W32_RESTORE_SSQ ++%else ++ ALLOC_STACK mmsize*5, 16 ++ %define subpelh0 m10 ++ %define subpelh1 m11 ++ %define subpelv0 m12 ++ %define subpelv1 m13 ++ %define subpelv2 m14 ++ %define subpelv3 m15 ++ %define accuv0 m8 ++ %define accuv1 m9 ++ movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] ++ movzx mxd, myb ++ shr myd, 16 ++ cmp hd, 4 ++ cmovle myd, mxd ++ movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3] ++ pshufd subpelh0, m0, q0000 ++ pshufd subpelh1, m0, q1111 ++ punpcklbw m1, m1 ++ psraw m1, 8 ; sign-extend ++ pshufd subpelv0, m1, q0000 ++ pshufd subpelv1, m1, q1111 ++ pshufd subpelv2, m1, q2222 ++ pshufd subpelv3, m1, q3333 ++ lea stride3q, [strideq*3] ++ sub srcq, 3 ++ sub srcq, stride3q ++ mov r6, srcq ++%endif ++ lea r5d, [wq-4] ++%if ARCH_X86_64 ++ mov r8, tmpq ++%else ++ mov tmpm, tmpq ++%endif ++ shl r5d, (16 - 2) ++ mov r5w, hw ++.hv_w8_loop0: ++ movu m4, [srcq+strideq*0] ; 0 = _ _ ++ movu m5, [srcq+strideq*1] ; 1 = _ _ ++ lea srcq, [srcq+strideq*2] ++%if ARCH_X86_64 ++ mova m7, [base+subpel_h_shufA] ++ mova m8, [base+subpel_h_shufB] ++ mova m9, [base+subpel_h_shufC] ++%endif ++ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ ++ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ ++ movu m6, [srcq+strideq*0] ; 2 = _ _ ++ movu m0, [srcq+strideq*1] ; 3 = _ _ ++ lea srcq, [srcq+strideq*2] ++ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ ++ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ ++ ; ++ mova m7, [base+pw_8192] ++ pmulhrsw m4, m7 ; H pw_8192 ++ pmulhrsw m5, m7 ; H pw_8192 ++ pmulhrsw m6, m7 ; H pw_8192 ++ pmulhrsw m0, m7 ; H pw_8192 ++ punpcklwd m1, m4, m5 ; 0 1 ~ ++ punpcklwd m2, m5, m6 ; 1 2 ~ ++ punpcklwd m3, m6, m0 ; 2 3 ~ ++ SAVELINE_W8 1, m1 ++ SAVELINE_W8 2, m2 ++ SAVELINE_W8 3, m3 ++ ; ++ mova m7, [base+subpel_h_shufA] ++ movu m4, [srcq+strideq*0] ; 4 = _ _ ++ movu m5, [srcq+strideq*1] ; 5 = _ _ ++ lea srcq, [srcq+strideq*2] ++ movu m6, [srcq+strideq*0] 
; 6 = _ _ ++ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ ++ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ ++ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ ++ mova m7, [base+pw_8192] ++ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ ++ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ ++ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ ++ punpcklwd m4, m0, m1 ; 3 4 ~ ++ punpcklwd m5, m1, m2 ; 4 5 ~ ++ punpcklwd m6, m2, m3 ; 5 6 ~ ++ ; ++ SAVELINE_W8 6, m3 ++ RESTORELINE_W8 1, m1 ++ RESTORELINE_W8 2, m2 ++ RESTORELINE_W8 3, m3 ++.hv_w8_loop: ++ ; m8 accu for V a ++ ; m9 accu for V b ++ SAVELINE_W8 1, m3 ++ SAVELINE_W8 2, m4 ++ SAVELINE_W8 3, m5 ++ SAVELINE_W8 4, m6 ++%if ARCH_X86_32 ++ pmaddwd m0, m1, subpelv0 ; a0 ++ pmaddwd m7, m2, subpelv0 ; b0 ++ pmaddwd m3, subpelv1 ; a1 ++ pmaddwd m4, subpelv1 ; b1 ++ paddd m0, m3 ++ paddd m7, m4 ++ pmaddwd m5, subpelv2 ; a2 ++ pmaddwd m6, subpelv2 ; b2 ++ paddd m0, m5 ++ paddd m7, m6 ++ mova m5, [base+pd_32] ++ paddd m0, m5 ; pd_512 ++ paddd m7, m5 ; pd_512 ++ mova accuv0, m0 ++ mova accuv1, m7 ++%else ++ pmaddwd m8, m1, subpelv0 ; a0 ++ pmaddwd m9, m2, subpelv0 ; b0 ++ pmaddwd m3, subpelv1 ; a1 ++ pmaddwd m4, subpelv1 ; b1 ++ paddd m8, m3 ++ paddd m9, m4 ++ pmaddwd m5, subpelv2 ; a2 ++ pmaddwd m6, subpelv2 ; b2 ++ paddd m8, m5 ++ paddd m9, m6 ++ mova m7, [base+pd_32] ++ paddd m8, m7 ; pd_512 ++ paddd m9, m7 ; pd_512 ++ mova m7, [base+subpel_h_shufB] ++ mova m6, [base+subpel_h_shufC] ++ mova m5, [base+subpel_h_shufA] ++%endif ++ movu m0, [srcq+strideq*1] ; 7 ++ movu m4, [srcq+strideq*2] ; 8 ++ lea srcq, [srcq+strideq*2] ++ HV_H_W8 m0, m1, m2, m3, m5, m7, m6 ++ HV_H_W8 m4, m1, m2, m3, m5, m7, m6 ++ mova m5, [base+pw_8192] ++ pmulhrsw m0, m5 ; H pw_8192 ++ pmulhrsw m4, m5 ; H pw_8192 ++ RESTORELINE_W8 6, m6 ++ punpcklwd m5, m6, m0 ; 6 7 ~ ++ punpcklwd m6, m0, m4 ; 7 8 ~ ++ pmaddwd m1, m5, subpelv3 ; a3 ++ paddd m2, m1, accuv0 ++ pmaddwd m1, m6, subpelv3 ; b3 ++ paddd m1, m1, accuv1 ; H + V ++ psrad m2, 6 ++ psrad m1, 6 ++ packssdw m2, m1 ; d -> w ++ movq [tmpq+wq*0], m2 ++ movhps [tmpq+wq*2], m2 ++ lea tmpq, [tmpq+wq*4] ++ sub hd, 2 ++ jle .hv_w8_outer ++ SAVELINE_W8 6, m4 ++ RESTORELINE_W8 1, m1 ++ RESTORELINE_W8 2, m2 ++ RESTORELINE_W8 3, m3 ++ RESTORELINE_W8 4, m4 ++ jmp .hv_w8_loop ++.hv_w8_outer: ++ movzx hd, r5w ++%if ARCH_X86_32 ++ add dword tmpm, 8 ++ mov tmpq, tmpm ++ mov srcq, srcm ++ add srcq, 4 ++ mov srcm, srcq ++%else ++ add r8, 8 ++ mov tmpq, r8 ++ add r6, 4 ++ mov srcq, r6 ++%endif ++ sub r5d, 1<<16 ++ jg .hv_w8_loop0 ++ RET ++ ++%if WIN64 ++DECLARE_REG_TMP 6, 4 ++%else ++DECLARE_REG_TMP 6, 7 ++%endif ++ ++%macro BIDIR_FN 1 ; op ++ %1 0 ++ lea stride3q, [strideq*3] ++ jmp wq ++.w4_loop: ++ %1_INC_PTR 2 ++ %1 0 ++ lea dstq, [dstq+strideq*4] ++.w4: ; tile 4x ++ movd [dstq ], m0 ; copy dw[0] ++ pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] ++ movd [dstq+strideq*1], m1 ; copy dw[1] ++ punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] ++ movd [dstq+strideq*2], m0 ; dw[2] ++ psrlq m0, 32 ; shift right in dw[3] ++ movd [dstq+stride3q ], m0 ; copy ++ sub hd, 4 ++ jg .w4_loop ++ RET ++.w8_loop: ++ %1_INC_PTR 2 ++ %1 0 ++ lea dstq, [dstq+strideq*2] ++.w8: ++ movq [dstq ], m0 ++ movhps [dstq+strideq*1], m0 ++ sub hd, 2 ++ jg .w8_loop ++ RET ++.w16_loop: ++ %1_INC_PTR 2 ++ %1 0 ++ lea dstq, [dstq+strideq] ++.w16: ++ mova [dstq ], m0 ++ dec hd ++ jg .w16_loop ++ RET ++.w32_loop: ++ %1_INC_PTR 4 ++ %1 0 ++ lea dstq, [dstq+strideq] ++.w32: ++ mova [dstq ], m0 ++ %1 2 ++ mova [dstq + 16 ], m0 ++ dec hd ++ jg .w32_loop ++ RET ++.w64_loop: ++ %1_INC_PTR 8 ++ %1 0 ++ add dstq, strideq ++.w64: ++ 
%assign i 0 ++ %rep 4 ++ mova [dstq + i*16 ], m0 ++ %assign i i+1 ++ %if i < 4 ++ %1 2*i ++ %endif ++ %endrep ++ dec hd ++ jg .w64_loop ++ RET ++.w128_loop: ++ %1_INC_PTR 16 ++ %1 0 ++ add dstq, strideq ++.w128: ++ %assign i 0 ++ %rep 8 ++ mova [dstq + i*16 ], m0 ++ %assign i i+1 ++ %if i < 8 ++ %1 2*i ++ %endif ++ %endrep ++ dec hd ++ jg .w128_loop ++ RET ++%endmacro ++ ++%macro AVG 1 ; src_offset ++ ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel ++ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 ++ paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 ++ mova m1, [tmp1q+(%1+1)*mmsize] ++ paddw m1, [tmp2q+(%1+1)*mmsize] ++ pmulhrsw m0, m2 ++ pmulhrsw m1, m2 ++ packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit ++%endmacro ++ ++%macro AVG_INC_PTR 1 ++ add tmp1q, %1*mmsize ++ add tmp2q, %1*mmsize ++%endmacro ++ ++cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, avg_ssse3_table + tzcnt wd, wm ; leading zeros + movifnidn hd, hm ; move h(stack) to h(register) if not already that register +@@ -840,16 +3428,17 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + %macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 +- ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 +- mova m0, [tmp2q+(%1+0)*mmsize] +- psubw m2, m0, [tmp1q+(%1+0)*mmsize] +- mova m1, [tmp2q+(%1+1)*mmsize] +- psubw m3, m1, [tmp1q+(%1+1)*mmsize] +- paddw m2, m2 ; compensate for the weight only being half +- paddw m3, m3 ; of what it should be +- pmulhw m2, m4 ; (b-a) * (-weight << 12) +- pmulhw m3, m4 ; (b-a) * (-weight << 12) +- paddw m0, m2 ; ((b-a) * -weight) + b ++ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ++ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 ++ mova m2, [tmp1q+(%1+0)*mmsize] ++ mova m0, m2 ++ psubw m2, [tmp2q+(%1+0)*mmsize] ++ mova m3, [tmp1q+(%1+1)*mmsize] ++ mova m1, m3 ++ psubw m3, [tmp2q+(%1+1)*mmsize] ++ pmulhw m2, m4 ++ pmulhw m3, m4 ++ paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 +@@ -861,16 +3450,22 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, w_avg_ssse3_table + tzcnt wd, wm ++ movd m4, r6m + movifnidn hd, hm +- movd m0, r6m +- pshuflw m0, m0, q0000 +- punpcklqdq m0, m0 ++ pxor m0, m0 + movsxd wq, dword [r6+wq*4] +- pxor m4, m4 +- psllw m0, 11 ; can't shift by 12, sign bit must be preserved +- psubw m4, m0 + mova m5, [pw_2048+r6-w_avg_ssse3_table] ++ pshufb m4, m0 ++ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 ++ cmp dword r6m, 7 ++ jg .weight_gt7 ++ mov r6, tmp1q ++ psubw m0, m4 ++ mov tmp1q, tmp2q ++ mova m4, m0 ; -weight ++ mov tmp2q, r6 ++.weight_gt7: + BIDIR_FN W_AVG + + %macro MASK 1 ; src_offset +@@ -923,41 +3518,34 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 + BIDIR_FN MASK + %undef hd + +-%if ARCH_X86_64 +- %define reg_pw_8 m8 +- %define reg_pw_27 m9 +- %define reg_pw_2048 m10 +-%else +- %define reg_pw_8 [base+pw_8] +- %define reg_pw_27 [base+pw_26] ; 64 - 38 +- %define reg_pw_2048 [base+pw_2048] +-%endif +- + %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out + ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] **** + mova m0, [tmp1q+(%1)] + mova m1, [tmp2q+(%1)] +- psubw m1, m0 ; tmp1 - tmp2 +- pabsw m3, m1 ; abs(tmp1 - tmp2) +- paddw m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8 +- psrlw m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8 +- psubusw m%2, reg_pw_27, m3 ; 64 - min(m, 
64) +- psllw m2, m%2, 10 ++ mova m2, reg_pw_6903 ++ psubw m1, m0 ++ pabsw m%2, m1 ; abs(tmp1 - tmp2) ++ mova m3, m2 ++ psubusw m2, m%2 ++ psrlw m2, 8 ; 64 - m ++ mova m%2, m2 ++ psllw m2, 10 + pmulhw m1, m2 ; tmp2 * () + paddw m0, m1 ; tmp1 + () + ;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] **** + mova m1, [tmp1q+(%1)+mmsize] + mova m2, [tmp2q+(%1)+mmsize] +- psubw m2, m1 ; tmp1 - tmp2 ++ psubw m2, m1 + pabsw m7, m2 ; abs(tmp1 - tmp2) +- paddw m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8 +- psrlw m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8 +- psubusw m3, reg_pw_27, m7 ; 64 - min(m, 64) ++ psubusw m3, m7 ++ psrlw m3, 8 ; 64 - m + phaddw m%2, m3 ; pack both u16.m[8..0]runs as u8.m [15..0] + psllw m3, 10 + pmulhw m2, m3 ++%if ARCH_X86_32 ++ mova reg_pw_2048, [base+pw_2048] ++%endif + paddw m1, m2 +- ;******** + pmulhrsw m0, reg_pw_2048 ; round/scale 2048 + pmulhrsw m1, reg_pw_2048 ; round/scale 2048 + packuswb m0, m1 ; concat m0 = u8.dst[15..0] +@@ -969,38 +3557,41 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 + + %define base r6-w_mask_420_ssse3_table + %if ARCH_X86_64 ++%define reg_pw_6903 m8 ++%define reg_pw_2048 m9 + ; args: dst, stride, tmp1, tmp2, w, h, mask, sign +-cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask ++cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask + lea r6, [w_mask_420_ssse3_table] + mov wd, wm + tzcnt r7d, wd ++ movd m0, r7m ; sign + movifnidn hd, hm +- movd m0, r7m +- pshuflw m0, m0, q0000 ; sign +- punpcklqdq m0, m0 + movsxd r7, [r6+r7*4] +- mova reg_pw_8, [base+pw_8] +- mova reg_pw_27, [base+pw_26] ; 64 - 38 ++ mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + mova reg_pw_2048, [base+pw_2048] +- mova m6, [base+pw_258] ; 64 * 4 + 2 ++ movd m6, [base+pw_258] ; 64 * 4 + 2 + add r7, r6 + mov maskq, maskmp + psubw m6, m0 ++ pshuflw m6, m6, q0000 ++ punpcklqdq m6, m6 + W_MASK_420 0, 4 + jmp r7 + %define loop_w r7d + %else ++%define reg_pw_6903 [base+pw_6903] ++%define reg_pw_2048 m3 + cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + tzcnt wd, wm + LEA r6, w_mask_420_ssse3_table +- mov wd, [r6+wq*4] ++ movd m0, r7m ; sign + mov maskq, r6mp +- movd m0, r7m +- pshuflw m0, m0, q0000 ; sign +- punpcklqdq m0, m0 +- mova m6, [base+pw_258] ; 64 * 4 + 2 ++ mov wd, [r6+wq*4] ++ movd m6, [base+pw_258] + add wq, r6 + psubw m6, m0 ++ pshuflw m6, m6, q0000 ++ punpcklqdq m6, m6 + W_MASK_420 0, 4 + jmp wd + %define loop_w dword r0m +@@ -1021,12 +3612,12 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + movd [dstq+strideq*0], m0 ; copy m0[2] + psrlq m0, 32 + movd [dstq+strideq*1], m0 ; copy m0[3] +- pshufd m5, m4, q3131; DBDB even lines repeated +- pshufd m4, m4, q2020; CACA odd lines repeated +- psubw m1, m6, m4 ; m9 == 64 * 4 + 2 +- psubw m1, m5 ; C-D A-B C-D A-B +- psrlw m1, 2 ; >> 2 ++ psubw m1, m6, m4 ; a _ c _ ++ psrlq m4, 32 ; b _ d _ ++ psubw m1, m4 ++ psrlw m1, 2 + packuswb m1, m1 ++ pshuflw m1, m1, q2020 + movd [maskq], m1 + sub hd, 4 + jg .w4_loop +@@ -1040,9 +3631,9 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + .w8: + movq [dstq ], m0 + movhps [dstq+strideq*1], m0 +- pshufd m1, m4, q3232 + psubw m0, m6, m4 +- psubw m0, m1 ++ punpckhqdq m4, m4 ++ psubw m0, m4 + psrlw m0, 2 + packuswb m0, m0 + movd [maskq], m0 +@@ -1082,8 +3673,7 @@ cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask + jg .w16ge_loop + RET + +-%undef reg_pw_8 +-%undef reg_pw_27 ++%undef reg_pw_6903 + %undef reg_pw_2048 + %undef dst_bak + %undef loop_w +diff --git 
third_party/dav1d/tests/checkasm/cdef.c third_party/dav1d/tests/checkasm/cdef.c +index 85f9b71ea9b1..4d444c54b807 100644 +--- third_party/dav1d/tests/checkasm/cdef.c ++++ third_party/dav1d/tests/checkasm/cdef.c +@@ -53,9 +53,6 @@ static void check_cdef_filter(const cdef_fn fn, const int w, const int h, + if (check_func(fn, "%s_%dbpc", name, BITDEPTH)) { + for (int dir = 0; dir < 8; dir++) { + for (enum CdefEdgeFlags edges = 0; edges <= 0xf; edges++) { +- memcpy(a_src, src, (10 * 16 + 8) * sizeof(pixel)); +- memcpy(c_src, src, (10 * 16 + 8) * sizeof(pixel)); +- + #if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; + #else +@@ -66,8 +63,11 @@ static void check_cdef_filter(const cdef_fn fn, const int w, const int h, + init_tmp(top, 16 * 2 + 8, bitdepth_max); + init_tmp((pixel *) left,8 * 2, bitdepth_max); + ++ memcpy(a_src, src, (10 * 16 + 8) * sizeof(pixel)); ++ memcpy(c_src, src, (10 * 16 + 8) * sizeof(pixel)); ++ + const int lvl = 1 + (rnd() % 62); +- const int damping = 3 + (rnd() & 3) + bitdepth_min_8; ++ const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1)); + const int pri_strength = (lvl >> 2) << bitdepth_min_8; + int sec_strength = lvl & 3; + sec_strength += sec_strength == 3; +diff --git third_party/dav1d/tests/checkasm/checkasm.c third_party/dav1d/tests/checkasm/checkasm.c +index 98c0e4505605..c9908b81793d 100644 +--- third_party/dav1d/tests/checkasm/checkasm.c ++++ third_party/dav1d/tests/checkasm/checkasm.c +@@ -53,7 +53,7 @@ static unsigned get_seed(void) { + static unsigned get_seed(void) { + struct timeval tv; + gettimeofday(&tv, NULL); +- return tv.tv_usec + tv.tv_sec * 1000000; ++ return (unsigned) (tv.tv_usec + tv.tv_sec * 1000000); + } + #endif + +@@ -127,7 +127,7 @@ static struct { + CheckasmFuncVersion *current_func_ver; + const char *current_test_name; + const char *bench_pattern; +- int bench_pattern_len; ++ size_t bench_pattern_len; + int num_checked; + int num_failed; + int nop_time; +@@ -325,7 +325,7 @@ static int measure_nop_time(void) { + + for (i = 0; i < 10000; i++) { + uint64_t t = readtime(); +- nops[i] = readtime() - t; ++ nops[i] = (uint16_t) (readtime() - t); + } + + qsort(nops, 10000, sizeof(uint16_t), cmp_nop); +@@ -345,8 +345,8 @@ static void print_benchs(const CheckasmFunc *const f) { + const CheckasmFuncVersion *v = &f->versions; + do { + if (v->iterations) { +- int decicycles = (10*v->cycles/v->iterations - +- state.nop_time) / 4; ++ int decicycles = (int) (10*v->cycles/v->iterations - ++ state.nop_time) / 4; + printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), + decicycles/10, decicycles%10); + } +@@ -420,7 +420,7 @@ static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) { + } + } else { + /* Allocate and insert a new node into the tree */ +- const int name_length = strlen(name); ++ const size_t name_length = strlen(name); + f = *root = checkasm_malloc(sizeof(CheckasmFunc) + name_length); + memcpy(f->name, name, name_length + 1); + } +@@ -521,7 +521,7 @@ int main(int argc, char *argv[]) { + fprintf(stderr, "]\n"); + return 0; + } else { +- state.seed = strtoul(argv[1], NULL, 10); ++ state.seed = (unsigned int) strtoul(argv[1], NULL, 10); + } + + argc--; +@@ -636,10 +636,11 @@ void checkasm_update_bench(const int iterations, const uint64_t cycles) { + /* Print the outcome of all tests performed since + * the last time this function was called */ + void checkasm_report(const char *const name, ...) 
{ +- static int prev_checked, prev_failed, max_length; ++ static int prev_checked, prev_failed; ++ static size_t max_length; + + if (state.num_checked > prev_checked) { +- int pad_length = max_length + 4; ++ int pad_length = (int) max_length + 4; + va_list arg; + + print_cpu_name(); +@@ -660,7 +661,7 @@ void checkasm_report(const char *const name, ...) { + } else if (!state.cpu_flag) { + /* Calculate the amount of padding required + * to make the output vertically aligned */ +- int length = strlen(state.current_test_name); ++ size_t length = strlen(state.current_test_name); + va_list arg; + + va_start(arg, name); +diff --git third_party/dav1d/tests/checkasm/checkasm.h third_party/dav1d/tests/checkasm/checkasm.h +index 018fc9e5c628..7adc40cf4bf6 100644 +--- third_party/dav1d/tests/checkasm/checkasm.h ++++ third_party/dav1d/tests/checkasm/checkasm.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_TESTS_CHECKASM_CHECKASM_H +-#define __DAV1D_TESTS_CHECKASM_CHECKASM_H ++#ifndef DAV1D_TESTS_CHECKASM_CHECKASM_H ++#define DAV1D_TESTS_CHECKASM_CHECKASM_H + + #include "config.h" + +@@ -262,4 +262,4 @@ void checkasm_stack_clobber(uint64_t clobber, ...); + #define bench_new(...) while (0) + #endif + +-#endif /* __DAV1D_TESTS_CHECKASM_CHECKASM_H */ ++#endif /* DAV1D_TESTS_CHECKASM_CHECKASM_H */ +diff --git third_party/dav1d/tests/checkasm/itx.c third_party/dav1d/tests/checkasm/itx.c +index a090aa69b611..9254491f2552 100644 +--- third_party/dav1d/tests/checkasm/itx.c ++++ third_party/dav1d/tests/checkasm/itx.c +@@ -215,7 +215,7 @@ static int ftx(coef *const buf, const enum RectTxfmSize tx, + + for (int y = 0; y < sh; y++) + for (int x = 0; x < sw; x++) +- buf[y * sw + x] = out[y * w + x] + 0.5; ++ buf[y * sw + x] = (coef) (out[y * w + x] + 0.5); + + return copy_subcoefs(buf, tx, txtp, sw, sh, subsh); + } +diff --git third_party/dav1d/tests/checkasm/mc.c third_party/dav1d/tests/checkasm/mc.c +index c84a64fd1359..fd26386ce897 100644 +--- third_party/dav1d/tests/checkasm/mc.c ++++ third_party/dav1d/tests/checkasm/mc.c +@@ -46,13 +46,15 @@ static void check_mc(Dav1dMCDSPContext *const c) { + ALIGN_STK_32(pixel, c_dst, 128 * 128,); + ALIGN_STK_32(pixel, a_dst, 128 * 128,); + const pixel *src = src_buf + 135 * 3 + 3; ++ const ptrdiff_t src_stride = 135 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, int w, int h, int mx, int my + HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) +- for (int w = 2; w <= 128; w <<= 1) ++ for (int w = 2; w <= 128; w <<= 1) { ++ const ptrdiff_t dst_stride = w * sizeof(pixel); + for (int mxy = 0; mxy < 4; mxy++) + if (check_func(c->mc[filter], "mc_%s_w%d_%s_%dbpc", + filter_names[filter], w, mxy_names[mxy], BITDEPTH)) +@@ -71,24 +73,42 @@ static void check_mc(Dav1dMCDSPContext *const c) { + for (int i = 0; i < 135 * 135; i++) + src_buf[i] = rnd() & bitdepth_max; + +- call_ref(c_dst, w, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); +- call_new(a_dst, w, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); ++ call_ref(c_dst, dst_stride, src, src_stride, w, h, ++ mx, my HIGHBD_TAIL_SUFFIX); ++ call_new(a_dst, dst_stride, src, src_stride, w, h, ++ mx, my HIGHBD_TAIL_SUFFIX); + if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst))) + fail(); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) +- bench_new(a_dst, w, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); ++ { ++ bench_new(a_dst, dst_stride, src, src_stride, w, h, ++ mx, my 
HIGHBD_TAIL_SUFFIX); ++ } + } + } ++ } + report("mc"); + } + ++/* Generate worst case input in the topleft corner, randomize the rest */ ++static void generate_mct_input(pixel *const buf, const int bitdepth_max) { ++ static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 }; ++ const int sign = -(rnd() & 1); ++ ++ for (int y = 0; y < 135; y++) ++ for (int x = 0; x < 135; x++) ++ buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign) ++ : rnd()) & bitdepth_max; ++} ++ + static void check_mct(Dav1dMCDSPContext *const c) { + ALIGN_STK_32(pixel, src_buf, 135 * 135,); +- ALIGN_STK_32(int16_t, c_tmp, 128 * 128,); +- ALIGN_STK_32(int16_t, a_tmp, 128 * 128,); ++ ALIGN_STK_32(int16_t, c_tmp, 128 * 128,); ++ ALIGN_STK_32(int16_t, a_tmp, 128 * 128,); + const pixel *src = src_buf + 135 * 3 + 3; ++ const ptrdiff_t src_stride = 135 * sizeof(pixel); + + declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + int w, int h, int mx, int my HIGHBD_DECL_SUFFIX); +@@ -107,18 +127,21 @@ static void check_mct(Dav1dMCDSPContext *const c) { + #else + const int bitdepth_max = 0xff; + #endif ++ generate_mct_input(src_buf, bitdepth_max); + +- for (int i = 0; i < 135 * 135; i++) +- src_buf[i] = rnd() & bitdepth_max; +- +- call_ref(c_tmp, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); +- call_new(a_tmp, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); ++ call_ref(c_tmp, src, src_stride, w, h, ++ mx, my HIGHBD_TAIL_SUFFIX); ++ call_new(a_tmp, src, src_stride, w, h, ++ mx, my HIGHBD_TAIL_SUFFIX); + if (memcmp(c_tmp, a_tmp, w * h * sizeof(*c_tmp))) + fail(); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) +- bench_new(a_tmp, src, w, w, h, mx, my HIGHBD_TAIL_SUFFIX); ++ { ++ bench_new(a_tmp, src, src_stride, w, h, ++ mx, my HIGHBD_TAIL_SUFFIX); ++ } + } + report("mct"); + } +@@ -127,12 +150,10 @@ static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf, + int16_t (*const tmp)[128 * 128], const int bitdepth_max) + { + for (int i = 0; i < 2; i++) { +- for (int j = 0; j < 135 * 135; j++) +- buf[j] = rnd() & bitdepth_max; +- c->mct[rnd() % N_2D_FILTERS](tmp[i], buf + 135 * 3 + 3, +- 128 * sizeof(pixel), 128, 128, +- rnd() & 15, rnd() & 15 +- HIGHBD_TAIL_SUFFIX); ++ generate_mct_input(buf, bitdepth_max); ++ c->mct[FILTER_2D_8TAP_SHARP](tmp[i], buf + 135 * 3 + 3, ++ 135 * sizeof(pixel), 128, 128, ++ 8, 8 HIGHBD_TAIL_SUFFIX); + } + } + +@@ -145,7 +166,8 @@ static void check_avg(Dav1dMCDSPContext *const c) { + const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) +- if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) ++ if (check_func(c->avg, "avg_w%d_%dbpc", w, BITDEPTH)) { ++ ptrdiff_t dst_stride = w * sizeof(pixel); + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + #if BITDEPTH == 16 +@@ -153,14 +175,16 @@ static void check_avg(Dav1dMCDSPContext *const c) { + #else + const int bitdepth_max = 0xff; + #endif ++ + init_tmp(c, c_dst, tmp, bitdepth_max); +- call_ref(c_dst, w, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); +- call_new(a_dst, w, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); ++ call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); ++ call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst))) + fail(); + +- bench_new(a_dst, w, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); ++ bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h HIGHBD_TAIL_SUFFIX); + } ++ } + report("avg"); + } + +@@ -173,7 +197,8 @@ static void check_w_avg(Dav1dMCDSPContext 
*const c) { + const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) +- if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) ++ if (check_func(c->w_avg, "w_avg_w%d_%dbpc", w, BITDEPTH)) { ++ ptrdiff_t dst_stride = w * sizeof(pixel); + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + int weight = rnd() % 15 + 1; +@@ -184,13 +209,14 @@ static void check_w_avg(Dav1dMCDSPContext *const c) { + #endif + init_tmp(c, c_dst, tmp, bitdepth_max); + +- call_ref(c_dst, w, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); +- call_new(a_dst, w, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); ++ call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); ++ call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst))) + fail(); + +- bench_new(a_dst, w, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); ++ bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, weight HIGHBD_TAIL_SUFFIX); + } ++ } + report("w_avg"); + } + +@@ -208,7 +234,8 @@ static void check_mask(Dav1dMCDSPContext *const c) { + HIGHBD_DECL_SUFFIX); + + for (int w = 4; w <= 128; w <<= 1) +- if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) ++ if (check_func(c->mask, "mask_w%d_%dbpc", w, BITDEPTH)) { ++ ptrdiff_t dst_stride = w * sizeof(pixel); + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + #if BITDEPTH == 16 +@@ -217,13 +244,14 @@ static void check_mask(Dav1dMCDSPContext *const c) { + const int bitdepth_max = 0xff; + #endif + init_tmp(c, c_dst, tmp, bitdepth_max); +- call_ref(c_dst, w, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); +- call_new(a_dst, w, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); ++ call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); ++ call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst))) + fail(); + +- bench_new(a_dst, w, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); ++ bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, mask HIGHBD_TAIL_SUFFIX); + } ++ } + report("mask"); + } + +@@ -244,6 +272,8 @@ static void check_w_mask(Dav1dMCDSPContext *const c) { + for (int w = 4; w <= 128; w <<= 1) + if (check_func(c->w_mask[i], "w_mask_%d_w%d_%dbpc", ss[i], w, + BITDEPTH)) ++ { ++ ptrdiff_t dst_stride = w * sizeof(pixel); + for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1) + { + int sign = rnd() & 1; +@@ -254,19 +284,20 @@ static void check_w_mask(Dav1dMCDSPContext *const c) { + #endif + init_tmp(c, c_dst, tmp, bitdepth_max); + +- call_ref(c_dst, w, tmp[0], tmp[1], w, h, c_mask, sign +- HIGHBD_TAIL_SUFFIX); +- call_new(a_dst, w, tmp[0], tmp[1], w, h, a_mask, sign +- HIGHBD_TAIL_SUFFIX); ++ call_ref(c_dst, dst_stride, tmp[0], tmp[1], w, h, ++ c_mask, sign HIGHBD_TAIL_SUFFIX); ++ call_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, ++ a_mask, sign HIGHBD_TAIL_SUFFIX); + if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) || + memcmp(c_mask, a_mask, (w * h * sizeof(*c_mask)) >> i)) + { + fail(); + } + +- bench_new(a_dst, w, tmp[0], tmp[1], w, h, a_mask, sign +- HIGHBD_TAIL_SUFFIX); ++ bench_new(a_dst, dst_stride, tmp[0], tmp[1], w, h, ++ a_mask, sign HIGHBD_TAIL_SUFFIX); + } ++ } + report("w_mask"); + } + +diff --git third_party/dav1d/tests/libfuzzer/alloc_fail.c third_party/dav1d/tests/libfuzzer/alloc_fail.c +index 50b2c4b86735..ddd1dd71abaa 100644 +--- third_party/dav1d/tests/libfuzzer/alloc_fail.c ++++ 
third_party/dav1d/tests/libfuzzer/alloc_fail.c
+@@ -30,6 +30,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include "alloc_fail.h"
+
+@@ -63,3 +64,39 @@ int __wrap_posix_memalign(void **memptr, size_t alignment, size_t size) {
+ #else
+ #error "HAVE_POSIX_MEMALIGN required"
+ #endif
++
++int __wrap_pthread_create(pthread_t *, const pthread_attr_t *,
++ void *(*) (void *), void *);
++
++int __wrap_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
++ void *(*start_routine) (void *), void *arg)
++{
++ if (rand() < (fail_probability + RAND_MAX/16))
++ return EAGAIN;
++
++ return pthread_create(thread, attr, start_routine, arg);
++}
++
++int __wrap_pthread_mutex_init(pthread_mutex_t *,
++ const pthread_mutexattr_t *);
++
++int __wrap_pthread_mutex_init(pthread_mutex_t *restrict mutex,
++ const pthread_mutexattr_t *restrict attr)
++{
++ if (rand() < (fail_probability + RAND_MAX/8))
++ return ENOMEM;
++
++ return pthread_mutex_init(mutex, attr);
++}
++
++int __wrap_pthread_cond_init(pthread_cond_t *,
++ const pthread_condattr_t *);
++
++int __wrap_pthread_cond_init(pthread_cond_t *restrict cond,
++ const pthread_condattr_t *restrict attr)
++{
++ if (rand() < (fail_probability + RAND_MAX/16))
++ return ENOMEM;
++
++ return pthread_cond_init(cond, attr);
++}
+diff --git third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h
+index 201d056ea576..5d9329973e9f 100644
+--- third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h
++++ third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h
+@@ -25,12 +25,12 @@
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+-#ifndef __DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
+-#define __DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
++#ifndef DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
++#define DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H
+
+ #include
+ #include
+
+ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
+
+-#endif /* __DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H*/
++#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
+diff --git third_party/dav1d/tests/libfuzzer/main.c third_party/dav1d/tests/libfuzzer/main.c
+index fa8a871df2a7..985ebba4cf31 100644
+--- third_party/dav1d/tests/libfuzzer/main.c
++++ third_party/dav1d/tests/libfuzzer/main.c
+@@ -25,7 +25,10 @@
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -40,7 +43,7 @@
+ int main(const int argc, char *const *const argv) {
+ int ret = -1;
+ FILE *f = NULL;
+- long fsize;
++ int64_t fsize;
+ const char *filename = NULL;
+ uint8_t *data = NULL;
+ size_t size = 0;
+@@ -56,22 +59,22 @@ int main(const int argc, char *const *const argv) {
+ goto error;
+ }
+
+- if (fseek(f, 0L, SEEK_END) == -1) {
++ if (fseeko(f, 0, SEEK_END) == -1) {
+ fprintf(stderr, "fseek(%s, 0, SEEK_END) failed: %s\n", filename,
+ strerror(errno));
+ goto error;
+ }
+- if ((fsize = ftell(f)) == -1) {
++ if ((fsize = ftello(f)) == -1) {
+ fprintf(stderr, "ftell(%s) failed: %s\n", filename, strerror(errno));
+ goto error;
+ }
+ rewind(f);
+
+ if (fsize < 0 || fsize > INT_MAX) {
+- fprintf(stderr, "%s is too large: %ld\n", filename, fsize);
++ fprintf(stderr, "%s is too large: %"PRId64"\n", filename, fsize);
+ goto error;
+ }
+- size = fsize;
++ size = (size_t)fsize;
+
+ if (!(data = malloc(size))) {
+ fprintf(stderr, "failed to allocate: %zu bytes\n", size);
+@@ -83,7 +86,7 @@ int main(const int argc, char *const *const argv) {
+ filename, strerror(errno));
+ goto error;
+ }
+-
++
+ ret = LLVMFuzzerTestOneInput(data, size);
+
+ error:
+diff --git third_party/dav1d/tests/meson.build third_party/dav1d/tests/meson.build
+index 16da420391f1..dc0cc10f726d 100644
+--- third_party/dav1d/tests/meson.build
++++ third_party/dav1d/tests/meson.build
+@@ -138,6 +138,9 @@ if (objcopy.found() and
+ command: [objcopy,
+ '--redefine-sym', 'malloc=__wrap_malloc',
+ '--redefine-sym', 'posix_memalign=__wrap_posix_memalign',
++ '--redefine-sym', 'pthread_create=__wrap_pthread_create',
++ '--redefine-sym', 'pthread_cond_init=__wrap_pthread_cond_init',
++ '--redefine-sym', 'pthread_mutex_init=__wrap_pthread_mutex_init',
+ '@INPUT@', '@OUTPUT@'])
+
+ dav1d_fuzzer_mem = executable('dav1d_fuzzer_mem',
+diff --git third_party/dav1d/tools/dav1d.c third_party/dav1d/tools/dav1d.c
+index 736ae3a33d5c..8edcb7568bd6 100644
+--- third_party/dav1d/tools/dav1d.c
++++ third_party/dav1d/tools/dav1d.c
+@@ -26,7 +26,7 @@
+ */
+
+ #include "config.h"
+-#include "version.h"
++#include "vcs_version.h"
+
+ #include
+ #include
+@@ -36,8 +36,11 @@
+ #ifdef HAVE_UNISTD_H
+ # include
+ #endif
++#ifdef HAVE_IO_H
++# include
++#endif
+
+-#include "dav1d/data.h"
++#include "dav1d/dav1d.h"
+
+ #include "input/input.h"
+
+@@ -97,7 +100,7 @@ int main(const int argc, char *const *const argv) {
+ }
+
+ if (!cli_settings.quiet)
+- fprintf(stderr, "dav1d %s - by VideoLAN\n", DAV1D_VERSION);
++ fprintf(stderr, "dav1d %s - by VideoLAN\n", dav1d_version());
+
+ // skip frames until a sequence header is found
+ if (cli_settings.skip) {
+diff --git third_party/dav1d/tools/dav1d_cli_parse.c third_party/dav1d/tools/dav1d_cli_parse.c
+index a51af8401aac..b364ca3758ca 100644
+--- third_party/dav1d/tools/dav1d_cli_parse.c
++++ third_party/dav1d/tools/dav1d_cli_parse.c
+@@ -104,7 +104,7 @@ static void usage(const char *const app, const char *const reason, ...)
{ + " --version/-v: print version and exit\n" + " --framethreads $num: number of frame threads (default: 1)\n" + " --tilethreads $num: number of tile threads (default: 1)\n" +- " --filmgrain enable film grain application (default: 1, except if muxer is md5)\n" ++ " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" + " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n" + " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" + " --verify $md5: verify decoded md5. implies --muxer md5, no output\n" +@@ -134,7 +134,7 @@ static void error(const char *const app, const char *const optarg, + + static unsigned parse_unsigned(char *optarg, const int option, const char *app) { + char *end; +- const unsigned res = strtoul(optarg, &end, 0); ++ const unsigned res = (unsigned) strtoul(optarg, &end, 0); + if (*end || end == optarg) error(app, optarg, option, "an integer"); + return res; + } +@@ -193,9 +193,9 @@ static unsigned parse_enum(char *optarg, const EnumParseTable *const tbl, + char *end; + unsigned res; + if (!strncmp(optarg, "0x", 2)) { +- res = strtoul(&optarg[2], &end, 16); ++ res = (unsigned) strtoul(&optarg[2], &end, 16); + } else { +- res = strtoul(optarg, &end, 0); ++ res = (unsigned) strtoul(optarg, &end, 0); + } + + if (*end || end == optarg) { +diff --git third_party/dav1d/tools/dav1d_cli_parse.h third_party/dav1d/tools/dav1d_cli_parse.h +index d7c78f9be134..899f207ce4ad 100644 +--- third_party/dav1d/tools/dav1d_cli_parse.h ++++ third_party/dav1d/tools/dav1d_cli_parse.h +@@ -25,10 +25,10 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_CLI_PARSE_H__ +-#define __DAV1D_CLI_PARSE_H__ ++#ifndef DAV1D_CLI_PARSE_H ++#define DAV1D_CLI_PARSE_H + +-#include "dav1d.h" ++#include "dav1d/dav1d.h" + + typedef struct { + const char *outputfile; +@@ -43,4 +43,4 @@ typedef struct { + void parse(const int argc, char *const *const argv, + CLISettings *const cli_settings, Dav1dSettings *const lib_settings); + +-#endif /* __DAV1D_CLI_PARSE_H__ */ ++#endif /* DAV1D_CLI_PARSE_H */ +diff --git third_party/dav1d/tools/input/annexb.c third_party/dav1d/tools/input/annexb.c +index 5b3a986864fc..11dd14b9127d 100644 +--- third_party/dav1d/tools/input/annexb.c ++++ third_party/dav1d/tools/input/annexb.c +@@ -77,9 +77,9 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file, + res = leb128(c, &len); + if (res < 0) + break; +- fseek(c->f, len, SEEK_CUR); ++ fseeko(c->f, len, SEEK_CUR); + } +- fseek(c->f, 0, SEEK_SET); ++ fseeko(c->f, 0, SEEK_SET); + + return 0; + } +@@ -103,7 +103,7 @@ static int annexb_read(AnnexbInputContext *const c, Dav1dData *const data) { + if (!ptr) return -1; + c->temporal_unit_size -= len + res; + c->frame_unit_size -= len + res; +- if ((res = fread(ptr, len, 1, c->f)) != 1) { ++ if (fread(ptr, len, 1, c->f) != 1) { + fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno)); + dav1d_data_unref(data); + return -1; +diff --git third_party/dav1d/tools/input/demuxer.h third_party/dav1d/tools/input/demuxer.h +index 8fdf6ed3b229..96456d765a8f 100644 +--- third_party/dav1d/tools/input/demuxer.h ++++ third_party/dav1d/tools/input/demuxer.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_INPUT_DEMUXER_H__ +-#define __DAV1D_INPUT_DEMUXER_H__ ++#ifndef DAV1D_INPUT_DEMUXER_H ++#define DAV1D_INPUT_DEMUXER_H + + #include "data.h" + +@@ -41,4 +41,4 @@ typedef struct Demuxer { + void (*close)(DemuxerPriv *ctx); + } Demuxer; + +-#endif /* __DAV1D_INPUT_DEMUXER_H__ */ ++#endif /* DAV1D_INPUT_DEMUXER_H */ +diff --git third_party/dav1d/tools/input/input.c third_party/dav1d/tools/input/input.c +index d4d8ad712dac..7837e46666f2 100644 +--- third_party/dav1d/tools/input/input.c ++++ third_party/dav1d/tools/input/input.c +@@ -57,7 +57,7 @@ void init_demuxers(void) { + } + + static const char *find_extension(const char *const f) { +- const int l = strlen(f); ++ const size_t l = strlen(f); + + if (l == 0) return NULL; + +diff --git third_party/dav1d/tools/input/input.h third_party/dav1d/tools/input/input.h +index 67ce5474ffb3..9bd2982a5a76 100644 +--- third_party/dav1d/tools/input/input.h ++++ third_party/dav1d/tools/input/input.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_INPUT_INPUT_H__ +-#define __DAV1D_INPUT_INPUT_H__ ++#ifndef DAV1D_INPUT_INPUT_H ++#define DAV1D_INPUT_INPUT_H + + #include "data.h" + +@@ -39,4 +39,4 @@ int input_open(DemuxerContext **const c_out, + int input_read(DemuxerContext *ctx, Dav1dData *data); + void input_close(DemuxerContext *ctx); + +-#endif /* __DAV1D_INPUT_INPUT_H__ */ ++#endif /* DAV1D_INPUT_INPUT_H */ +diff --git third_party/dav1d/tools/input/ivf.c third_party/dav1d/tools/input/ivf.c +index 9e9c415d5280..495429d92ef3 100644 +--- third_party/dav1d/tools/input/ivf.c ++++ third_party/dav1d/tools/input/ivf.c +@@ -36,10 +36,6 @@ + + #include "input/demuxer.h" + +-#ifdef _MSC_VER +-#define ftello _ftelli64 +-#endif +- + typedef struct DemuxerPriv { + FILE *f; + } IvfInputContext; +@@ -55,7 +51,7 @@ static int64_t rl64(const uint8_t *const p) { + static int ivf_open(IvfInputContext *const c, const char *const file, + unsigned fps[2], unsigned *const num_frames) + { +- int res; ++ size_t res; + uint8_t hdr[32]; + + memset(c, 0, sizeof(*c)); +@@ -85,11 +81,11 @@ static int ivf_open(IvfInputContext *const c, const char *const file, + for (*num_frames = 0;; (*num_frames)++) { + if ((res = fread(data, 4, 1, c->f)) != 1) + break; // EOF +- fseek(c->f, rl32(data) + 8, SEEK_CUR); ++ fseeko(c->f, rl32(data) + 8, SEEK_CUR); + } + fps[0] *= *num_frames; + fps[1] *= duration; +- fseek(c->f, 32, SEEK_SET); ++ fseeko(c->f, 32, SEEK_SET); + + return 0; + } +@@ -97,7 +93,7 @@ static int ivf_open(IvfInputContext *const c, const char *const file, + static int ivf_read(IvfInputContext *const c, Dav1dData *const buf) { + uint8_t data[8]; + uint8_t *ptr; +- int res; ++ size_t res; + + const int64_t off = ftello(c->f); + if ((res = fread(data, 4, 1, c->f)) != 1) +diff --git third_party/dav1d/tools/output/md5.c third_party/dav1d/tools/output/md5.c +index 9f81bd44e996..6555de83adae 100644 +--- third_party/dav1d/tools/output/md5.c ++++ third_party/dav1d/tools/output/md5.c +@@ -63,11 +63,36 @@ static const unsigned k[] = { + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, + }; + ++ ++#if ENDIANNESS_BIG ++#define NE2LE_32(x) (((x & 0x00ff) << 24) |\ ++ ((x & 0xff00) << 8) |\ ++ ((x >> 8) & 0xff00) |\ ++ ((x >> 24) & 0x00ff)) ++ ++#define NE2LE_64(x) (((x & 0x000000ff) << 56) |\ ++ ((x & 0x0000ff00) << 40) |\ ++ ((x & 0x00ff0000) << 24) |\ ++ ((x & 0xff000000) << 8) |\ ++ ((x >> 8) & 0xff000000) |\ ++ ((x >> 24) & 0x00ff0000) |\ ++ ((x >> 40) & 0x0000ff00) |\ ++ ((x >> 56) & 0x000000ff)) ++ ++#else 
++#define NE2LE_32(x) (x) ++#define NE2LE_64(x) (x) ++#endif ++ + typedef struct MuxerPriv { + unsigned abcd[4]; + uint8_t data[64]; + uint64_t len; + FILE *f; ++#if ENDIANNESS_BIG ++ uint8_t *bswap; ++ int bswap_w; ++#endif + } MD5Context; + + static int md5_open(MD5Context *const md5, const char *const file, +@@ -81,6 +106,11 @@ static int md5_open(MD5Context *const md5, const char *const file, + return -1; + } + ++#if ENDIANNESS_BIG ++ md5->bswap = NULL; ++ md5->bswap_w = 0; ++#endif ++ + md5->abcd[0] = 0x67452301; + md5->abcd[1] = 0xefcdab89; + md5->abcd[2] = 0x98badcfe; +@@ -123,7 +153,7 @@ static void md5_body(MD5Context *md5, const uint8_t *const _data) { + tmp = d; + d = c; + c = b; +- b += leftrotate(a + f + k[i] + data[g], s[i >> 4][i & 3]); ++ b += leftrotate(a + f + k[i] + NE2LE_32(data[g]), s[i >> 4][i & 3]); + a = tmp; + } + +@@ -166,7 +196,26 @@ static int md5_write(MD5Context *const md5, Dav1dPicture *const p) { + const int w = p->p.w, h = p->p.h; + uint8_t *yptr = p->data[0]; + ++#if ENDIANNESS_BIG ++ if (hbd && (!md5->bswap || md5->bswap_w < p->p.w)) { ++ free(md5->bswap); ++ md5->bswap_w = 0; ++ md5->bswap = malloc(p->p.w << 1); ++ if (!md5->bswap) return -1; ++ md5->bswap_w = p->p.w; ++ } ++#endif ++ + for (int y = 0; y < h; y++) { ++#if ENDIANNESS_BIG ++ if (hbd) { ++ for (int x = 0; x < w; x++) { ++ md5->bswap[2 * x + 1] = yptr[2 * x]; ++ md5->bswap[2 * x] = yptr[2 * x + 1]; ++ } ++ md5_update(md5, md5->bswap, w << hbd); ++ } else ++#endif + md5_update(md5, yptr, w << hbd); + yptr += p->stride[0]; + } +@@ -180,6 +229,15 @@ static int md5_write(MD5Context *const md5, Dav1dPicture *const p) { + uint8_t *uvptr = p->data[pl]; + + for (int y = 0; y < ch; y++) { ++#if ENDIANNESS_BIG ++ if (hbd) { ++ for (int x = 0; x < cw; x++){ ++ md5->bswap[2 * x + 1] = uvptr[2 * x]; ++ md5->bswap[2 * x] = uvptr[2 * x + 1]; ++ } ++ md5_update(md5, md5->bswap, cw << hbd); ++ } else ++#endif + md5_update(md5, uvptr, cw << hbd); + uvptr += p->stride[1]; + } +@@ -193,7 +251,7 @@ static int md5_write(MD5Context *const md5, Dav1dPicture *const p) { + + static void md5_finish(MD5Context *const md5) { + static const uint8_t bit[2] = { 0x80, 0x00 }; +- uint64_t len = md5->len << 3; ++ uint64_t len = NE2LE_64(md5->len << 3); + + md5_update(md5, &bit[0], 1); + while ((md5->len & 63) != 56) +@@ -211,6 +269,11 @@ static void md5_close(MD5Context *const md5) { + md5->abcd[i] >> 24); + fprintf(md5->f, "\n"); + ++#if ENDIANNESS_BIG ++ free(md5->bswap); ++ md5->bswap_w = 0; ++#endif ++ + if (md5->f != stdout) + fclose(md5->f); + } +@@ -230,11 +293,16 @@ static int md5_verify(MD5Context *const md5, const char *const md5_str) { + char *ignore; + memcpy(t, p, 2); + p += 2; +- val = strtoul(t, &ignore, 16); ++ val = (unsigned) strtoul(t, &ignore, 16); + abcd[i] |= val << (8 * j); + } + } + ++#if ENDIANNESS_BIG ++ free(md5->bswap); ++ md5->bswap_w = 0; ++#endif ++ + return !!memcmp(abcd, md5->abcd, sizeof(abcd)); + } + +diff --git third_party/dav1d/tools/output/muxer.h third_party/dav1d/tools/output/muxer.h +index 2bc340f2a3f0..54b3f6aa13fd 100644 +--- third_party/dav1d/tools/output/muxer.h ++++ third_party/dav1d/tools/output/muxer.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +-#ifndef __DAV1D_OUTPUT_MUXER_H__ +-#define __DAV1D_OUTPUT_MUXER_H__ ++#ifndef DAV1D_OUTPUT_MUXER_H ++#define DAV1D_OUTPUT_MUXER_H + + #include "picture.h" + +@@ -49,4 +49,4 @@ typedef struct Muxer { + int (*verify)(MuxerPriv *ctx, const char *hash_string); + } Muxer; + +-#endif /* __DAV1D_OUTPUT_MUXER_H__ */ ++#endif /* DAV1D_OUTPUT_MUXER_H */ +diff --git third_party/dav1d/tools/output/output.c third_party/dav1d/tools/output/output.c +index e48b3256205c..002d719788f4 100644 +--- third_party/dav1d/tools/output/output.c ++++ third_party/dav1d/tools/output/output.c +@@ -59,7 +59,7 @@ void init_muxers(void) { + } + + static const char *find_extension(const char *const f) { +- const int l = strlen(f); ++ const size_t l = strlen(f); + + if (l == 0) return NULL; + +diff --git third_party/dav1d/tools/output/output.h third_party/dav1d/tools/output/output.h +index 9eefaf82e259..c252d64a69dd 100644 +--- third_party/dav1d/tools/output/output.h ++++ third_party/dav1d/tools/output/output.h +@@ -25,8 +25,8 @@ + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +-#ifndef __DAV1D_OUTPUT_OUTPUT_H__ +-#define __DAV1D_OUTPUT_OUTPUT_H__ ++#ifndef DAV1D_OUTPUT_OUTPUT_H ++#define DAV1D_OUTPUT_OUTPUT_H + + #include "picture.h" + +@@ -46,4 +46,4 @@ void output_close(MuxerContext *ctx); + */ + int output_verify(MuxerContext *ctx, const char *hash_string); + +-#endif /* __DAV1D_OUTPUT_OUTPUT_H__ */ ++#endif /* DAV1D_OUTPUT_OUTPUT_H */ Index: www/firefox/files/patch-bug1536126 =================================================================== --- /dev/null +++ www/firefox/files/patch-bug1536126 @@ -0,0 +1,15 @@ +Disable RDD on Tier3 platforms due to lack of sandboxing and perf degradation. + +diff --git modules/libpref/init/StaticPrefList.h modules/libpref/init/StaticPrefList.h +index 7df0b518b267..3b309be1bfce 100644 +--- modules/libpref/init/StaticPrefList.h ++++ modules/libpref/init/StaticPrefList.h +@@ -1308,7 +1308,7 @@ VARCACHE_PREF( + # define PREF_VALUE true + #elif defined(XP_MACOSX) + # define PREF_VALUE true +-#elif defined(XP_UNIX) ++#elif defined(XP_LINUX) + # define PREF_VALUE true + #else + # define PREF_VALUE false Index: www/firefox/files/patch-bug1536538 =================================================================== --- /dev/null +++ www/firefox/files/patch-bug1536538 @@ -0,0 +1,92 @@ +commit 9a2aa30c6fad +Author: Alex Chronopoulos +Date: Wed Mar 20 14:24:23 2019 +0000 + + Bug 1536538 - Improve a check in config file and correct typos. r=TD-Linux + + Differential Revision: https://phabricator.services.mozilla.com/D24071 +--- + media/libdav1d/asm/moz.build | 4 +--- + media/libdav1d/config.h | 4 ++-- + media/libdav1d/moz.build | 4 ++-- + 3 files changed, 5 insertions(+), 7 deletions(-) + +diff --git media/libdav1d/asm/moz.build media/libdav1d/asm/moz.build +index 1cfa59cc17aa..e286c570e321 100644 +--- media/libdav1d/asm/moz.build ++++ media/libdav1d/asm/moz.build +@@ -25,8 +25,6 @@ CFLAGS += [ + '-I%s/dist/include/dav1d/' % TOPOBJDIR, + ] + +-# This code is only built on Windows and Linux for now. 
+- + # Attaching config.asm file + if CONFIG['CPU_ARCH'] == 'x86': + if CONFIG['OS_TARGET'] == 'WINNT': +@@ -45,7 +43,7 @@ if CONFIG['CPU_ARCH'] == 'x86_64': + error('Platform %s is not expected' % CONFIG['OS_TARGET']) + + if CONFIG['OS_TARGET'] in ('Darwin', 'WINNT'): +- # Change the default stack aligment (16) to 32 ++ # Change the default stack alignment (16) to 32 + if CONFIG['CC_TYPE'] == 'clang': + CFLAGS += ['-mstack-alignment=32'] + elif CONFIG['CC_TYPE'] == 'gcc': +diff --git media/libdav1d/config.h media/libdav1d/config.h +index 5ce57d302d62..33fd4bac8dac 100644 +--- media/libdav1d/config.h ++++ media/libdav1d/config.h +@@ -52,8 +52,8 @@ + # define PREFIX 1 + #endif + +-#if (ARCH_x86_32 == 1 || ARCH_X86_64 == 1) && defined(__linux__) && \ +- !defined(__ANDROID__) ++#if ARCH_x86_32 == 1 || \ ++ (ARCH_X86_64 == 1 && defined(__linux__) && !defined(__ANDROID__)) + # define STACK_ALIGNMENT 16 + #else + # define STACK_ALIGNMENT 32 +diff --git media/libdav1d/moz.build media/libdav1d/moz.build +index 45a5684c1d1d..8e6ea68d7565 100644 +--- media/libdav1d/moz.build ++++ media/libdav1d/moz.build +@@ -30,10 +30,10 @@ SOURCES += [f for f in entrypoint_source_files] + DEFINES['DAV1D_API'] = '' + + if CONFIG['MOZ_DAV1D_ASM']: +- # Default stack aligment is 16 bytes ++ # Default stack alignment is 16 bytes + DIRS += ['asm'] + if CONFIG['OS_TARGET'] in ('WINNT', 'Darwin') and CONFIG['CPU_ARCH'] == 'x86_64': +- # Update stack aligment to 32 bytes ++ # Update stack alignment to 32 bytes + if CONFIG['CC_TYPE'] == 'clang': + CFLAGS += ['-mstack-alignment=32'] + for ep in entrypoint_source_files: + +commit a9601261ed7d +Author: Alex Chronopoulos +Date: Wed Mar 20 11:34:39 2019 +0000 + + Bug 1536538 - Enable libdav1d on Linux. r=TD-Linux + + Differential Revision: https://phabricator.services.mozilla.com/D24072 +--- + modules/libpref/init/all.js | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git modules/libpref/init/all.js modules/libpref/init/all.js +index 9a6a18b25d4d..8c1180760965 100644 +--- modules/libpref/init/all.js ++++ modules/libpref/init/all.js +@@ -663,7 +663,7 @@ pref("media.av1.enabled", true); + pref("media.av1.use-dav1d", true); + #elif defined(XP_UNIX) + pref("media.av1.enabled", true); +-pref("media.av1.use-dav1d", false); ++pref("media.av1.use-dav1d", true); + #else + pref("media.av1.enabled", false); + pref("media.av1.use-dav1d", false); Index: www/firefox/files/patch-bug1536783 =================================================================== --- /dev/null +++ www/firefox/files/patch-bug1536783 @@ -0,0 +1,44 @@ +commit cbb81ed13612 +Author: Alex Chronopoulos +Date: Thu Mar 21 20:37:46 2019 +0000 + + Bug 1536783 - Use two tile threads in dav1d decoder for performance improvement. 
r=TD-Linux + + Differential Revision: https://phabricator.services.mozilla.com/D24331 + + --HG-- + extra : moz-landing-system : lando +--- + dom/media/platforms/agnostic/DAV1DDecoder.cpp | 5 +++++ + dom/media/platforms/agnostic/DAV1DDecoder.h | 2 +- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git dom/media/platforms/agnostic/DAV1DDecoder.cpp dom/media/platforms/agnostic/DAV1DDecoder.cpp +index 7b8d2437765b..b57d5e47c950 100644 +--- dom/media/platforms/agnostic/DAV1DDecoder.cpp ++++ dom/media/platforms/agnostic/DAV1DDecoder.cpp +@@ -31,6 +31,11 @@ RefPtr DAV1DDecoder::Init() { + } + settings.n_frame_threads = + static_cast(std::min(decoder_threads, GetNumberOfProcessors())); ++ // There is not much improvement with more than 2 tile threads at least with ++ // the content being currently served. The ideal number of tile thread would ++ // much the tile count of the content. Maybe dav1d can help to do that in the ++ // future. ++ settings.n_tile_threads = 2; + + int res = dav1d_open(&mContext, &settings); + if (res < 0) { +diff --git dom/media/platforms/agnostic/DAV1DDecoder.h dom/media/platforms/agnostic/DAV1DDecoder.h +index 749d3081876f..297d1fa1dd63 100644 +--- dom/media/platforms/agnostic/DAV1DDecoder.h ++++ dom/media/platforms/agnostic/DAV1DDecoder.h +@@ -38,7 +38,7 @@ class DAV1DDecoder : public MediaDataDecoder, + int GetPicture(DecodedData& aData, MediaResult& aResult); + already_AddRefed ConstructImage(const Dav1dPicture& aPicture); + +- Dav1dContext* mContext; ++ Dav1dContext* mContext = nullptr; + + const VideoInfo& mInfo; + const RefPtr mTaskQueue; Index: www/firefox/files/patch-z-bug1535631 =================================================================== --- /dev/null +++ www/firefox/files/patch-z-bug1535631 @@ -0,0 +1,118 @@ +commit 30586d8fe58f +Author: Alex Chronopoulos +Date: Thu Mar 21 19:43:15 2019 +0000 + + Bug 1535631 - Use 16 byte stack alignment on dav1d in OSX. r=TD-Linux + + Differential Revision: https://phabricator.services.mozilla.com/D24382 + + --HG-- + extra : moz-landing-system : lando +--- + media/libdav1d/asm/moz.build | 11 +++++++++-- + media/libdav1d/asm/x86_64/osx/config.asm | 2 +- + media/libdav1d/config.h | 7 ------- + media/libdav1d/moz.build | 26 +++++++++++++++----------- + 4 files changed, 25 insertions(+), 21 deletions(-) + +diff --git media/libdav1d/asm/moz.build media/libdav1d/asm/moz.build +index e286c570e321..fa0b1dfd06f9 100644 +--- media/libdav1d/asm/moz.build ++++ media/libdav1d/asm/moz.build +@@ -25,6 +25,9 @@ CFLAGS += [ + '-I%s/dist/include/dav1d/' % TOPOBJDIR, + ] + ++# Default stack aligment is 16 bytes. ++stack_alignment = 16 ++ + # Attaching config.asm file + if CONFIG['CPU_ARCH'] == 'x86': + if CONFIG['OS_TARGET'] == 'WINNT': +@@ -42,13 +45,17 @@ if CONFIG['CPU_ARCH'] == 'x86_64': + else: + error('Platform %s is not expected' % CONFIG['OS_TARGET']) + +- if CONFIG['OS_TARGET'] in ('Darwin', 'WINNT'): +- # Change the default stack alignment (16) to 32 ++ if CONFIG['OS_TARGET'] == 'WINNT': ++ # Change the default stack alignment (16) to 32 bytes. 
++ stack_alignment = 32 + if CONFIG['CC_TYPE'] == 'clang': + CFLAGS += ['-mstack-alignment=32'] + elif CONFIG['CC_TYPE'] == 'gcc': + CFLAGS += ['-mpreferred-stack-boundary=5'] + ++# Set the macro here instead of config.h ++DEFINES['STACK_ALIGNMENT'] = stack_alignment ++ + if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'): + SOURCES += [ + '../../../third_party/dav1d/src/x86/cpu.c', +diff --git media/libdav1d/asm/x86_64/osx/config.asm media/libdav1d/asm/x86_64/osx/config.asm +index 4cc58a254517..d1ad0b196ec9 100644 +--- media/libdav1d/asm/x86_64/osx/config.asm ++++ media/libdav1d/asm/x86_64/osx/config.asm +@@ -9,5 +9,5 @@ + + %define PREFIX 1 + +-%define STACK_ALIGNMENT 32 ++%define STACK_ALIGNMENT 16 + +diff --git media/libdav1d/config.h media/libdav1d/config.h +index 33fd4bac8dac..ceeae4a07b44 100644 +--- media/libdav1d/config.h ++++ media/libdav1d/config.h +@@ -52,13 +52,6 @@ + # define PREFIX 1 + #endif + +-#if ARCH_x86_32 == 1 || \ +- (ARCH_X86_64 == 1 && defined(__linux__) && !defined(__ANDROID__)) +-# define STACK_ALIGNMENT 16 +-#else +-# define STACK_ALIGNMENT 32 +-#endif +- + #if defined(_WIN32) || defined(__CYGWIN__) + // _WIN32_WINNT 0x0601 is set in global macros + # define UNICODE 1 +diff --git media/libdav1d/moz.build media/libdav1d/moz.build +index 8e6ea68d7565..7f9eb833c8a9 100644 +--- media/libdav1d/moz.build ++++ media/libdav1d/moz.build +@@ -30,18 +30,22 @@ SOURCES += [f for f in entrypoint_source_files] + DEFINES['DAV1D_API'] = '' + + if CONFIG['MOZ_DAV1D_ASM']: +- # Default stack alignment is 16 bytes ++ # Default stack alignment is 16 bytes. ++ stack_alignment = 16 + DIRS += ['asm'] +- if CONFIG['OS_TARGET'] in ('WINNT', 'Darwin') and CONFIG['CPU_ARCH'] == 'x86_64': +- # Update stack alignment to 32 bytes +- if CONFIG['CC_TYPE'] == 'clang': +- CFLAGS += ['-mstack-alignment=32'] +- for ep in entrypoint_source_files: +- SOURCES[ep].flags += ['-mstackrealign'] +- elif CONFIG['CC_TYPE'] == 'gcc': +- CFLAGS += ['-mpreferred-stack-boundary=5'] +- for ep in entrypoint_source_files: +- SOURCES[ep].flags += ['-mincoming-stack-boundary=4'] ++ if CONFIG['OS_TARGET'] == 'WINNT' and CONFIG['CPU_ARCH'] == 'x86_64': ++ stack_alignment = 32 ++ # Update stack alignment to 32 bytes. ++ if CONFIG['CC_TYPE'] == 'clang': ++ CFLAGS += ['-mstack-alignment=32'] ++ for ep in entrypoint_source_files: ++ SOURCES[ep].flags += ['-mstackrealign'] ++ elif CONFIG['CC_TYPE'] == 'gcc': ++ CFLAGS += ['-mpreferred-stack-boundary=5'] ++ for ep in entrypoint_source_files: ++ SOURCES[ep].flags += ['-mincoming-stack-boundary=4'] ++ # Set the macro here instead of config.h ++ DEFINES['STACK_ALIGNMENT'] = stack_alignment + + if CONFIG['OS_TARGET'] == 'Linux': + # For fuzzing, We only support building on Linux currently. Index: www/firefox/files/patch-z-bug1536070 =================================================================== --- /dev/null +++ www/firefox/files/patch-z-bug1536070 @@ -0,0 +1,107 @@ +commit fe1ca24c8b01 +Author: Alex Chronopoulos +Date: Fri Mar 22 14:18:27 2019 +0000 + + Bug 1536070 - Enable ASM in dav1d for Tier3 platforms. 
r=TD-Linux + + Differential Revision: https://phabricator.services.mozilla.com/D24361 + + --HG-- + extra : moz-landing-system : lando +--- + media/libdav1d/asm/moz.build | 16 ++++++++++++---- + media/libdav1d/moz.build | 21 +++++++++++++++++++-- + modules/libpref/init/all.js | 2 +- + toolkit/moz.configure | 6 +----- + 4 files changed, 33 insertions(+), 12 deletions(-) + +diff --git media/libdav1d/asm/moz.build media/libdav1d/asm/moz.build +index fa0b1dfd06f9..b12240139789 100644 +--- media/libdav1d/asm/moz.build ++++ media/libdav1d/asm/moz.build +@@ -34,16 +34,24 @@ if CONFIG['CPU_ARCH'] == 'x86': + ASFLAGS += ['-I%s/media/libdav1d/asm/x86_32/win/' % TOPSRCDIR] + else: + ASFLAGS += ['-I%s/media/libdav1d/asm/x86_32/' % TOPSRCDIR] ++ # Default stack aligment can be 4 bytes, change it to 16 bytes. ++ if CONFIG['CC_TYPE'] == 'clang': ++ CFLAGS += ['-mstack-alignment=16'] ++ elif CONFIG['CC_TYPE'] == 'gcc': ++ CFLAGS += ['-mpreferred-stack-boundary=4'] ++ + + if CONFIG['CPU_ARCH'] == 'x86_64': +- if CONFIG['OS_TARGET'] == 'Linux': +- ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/linux/' % TOPSRCDIR] +- elif CONFIG['OS_TARGET'] == 'Darwin': ++ if CONFIG['OS_TARGET'] == 'Darwin': + ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/osx/' % TOPSRCDIR] + elif CONFIG['OS_TARGET'] == 'WINNT': + ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/' % TOPSRCDIR] +- else: ++ elif CONFIG['OS_TARGET'] == 'Android': + error('Platform %s is not expected' % CONFIG['OS_TARGET']) ++ else: ++ # The rest of the platforms are all Linux flavors ++ # Linux,OpenBSD,NetBSD,FreeBSD,DragonFly,SunOS ++ ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/linux/' % TOPSRCDIR] + + if CONFIG['OS_TARGET'] == 'WINNT': + # Change the default stack alignment (16) to 32 bytes. +diff --git media/libdav1d/moz.build media/libdav1d/moz.build +index 7f9eb833c8a9..788c1a5ab971 100644 +--- media/libdav1d/moz.build ++++ media/libdav1d/moz.build +@@ -30,9 +30,25 @@ SOURCES += [f for f in entrypoint_source_files] + DEFINES['DAV1D_API'] = '' + + if CONFIG['MOZ_DAV1D_ASM']: +- # Default stack alignment is 16 bytes. +- stack_alignment = 16 + DIRS += ['asm'] ++ ++ # Default stack alignment can be 4 bytes on x86. ++ if CONFIG['CPU_ARCH'] == 'x86': ++ # Update stack alignment to 16 bytes. ++ if CONFIG['CC_TYPE'] == 'clang': ++ CFLAGS += ['-mstack-alignment=16'] ++ for ep in entrypoint_source_files: ++ SOURCES[ep].flags += ['-mstackrealign'] ++ elif CONFIG['CC_TYPE'] == 'gcc': ++ CFLAGS += ['-mpreferred-stack-boundary=4'] ++ for ep in entrypoint_source_files: ++ SOURCES[ep].flags += ['-mincoming-stack-boundary=2'] ++ ++ # Expect stack alignment of 16 bytes. ++ stack_alignment = 16 ++ ++ # The default stack alignment in x86_64 is 16 bytes. On all Linux flavors the ++ # default is used due to crashes with 32 bytes stack alignment. + if CONFIG['OS_TARGET'] == 'WINNT' and CONFIG['CPU_ARCH'] == 'x86_64': + stack_alignment = 32 + # Update stack alignment to 32 bytes. 
+@@ -44,6 +60,7 @@ if CONFIG['MOZ_DAV1D_ASM']: + CFLAGS += ['-mpreferred-stack-boundary=5'] + for ep in entrypoint_source_files: + SOURCES[ep].flags += ['-mincoming-stack-boundary=4'] ++ + # Set the macro here instead of config.h + DEFINES['STACK_ALIGNMENT'] = stack_alignment + +diff --git toolkit/moz.configure toolkit/moz.configure +index 3f810b358918..bd2f4d130e41 100644 +--- toolkit/moz.configure ++++ toolkit/moz.configure +@@ -443,11 +443,7 @@ def av1(value): + + @depends(target, nasm_version, when=av1 & compile_environment) + def dav1d_asm(target, nasm_version): +- if ( +- target.os == 'GNU' and target.kernel == 'Linux' and target.cpu in ('x86', 'x86_64') or +- target.os == 'WINNT' and target.cpu in ('x86', 'x86_64') or +- target.os == 'OSX' and target.kernel == 'Darwin' and target.cpu == 'x86_64' +- ): ++ if target.os != 'Android' and target.cpu in ('x86', 'x86_64'): + if nasm_version < '2.13': + die('nasm 2.13 or greater is required for AV1 support. ' + 'Either install nasm or add --disable-av1 to your configure options.')