fraunhoferhhi
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.cpp b/source/Lib/CommonLib/AdaptiveLoopFilter.cpp
Lines changed: 18 additions & 17 deletions
diff --git a/source/Lib/CommonLib/AdaptiveLoopFilter.h b/source/Lib/CommonLib/AdaptiveLoopFilter.h
Lines changed: 5 additions & 5 deletions
diff --git a/source/Lib/CommonLib/BitStream.cpp b/source/Lib/CommonLib/BitStream.cpp
Lines changed: 4 additions & 3 deletions
diff --git a/source/Lib/CommonLib/BitStream.h b/source/Lib/CommonLib/BitStream.h
Lines changed: 16 additions & 14 deletions
diff --git a/source/Lib/CommonLib/Buffer.h b/source/Lib/CommonLib/Buffer.h
Lines changed: 5 additions & 5 deletions
@@ -425,7 +425,7 @@ endforeach()
 get_directory_property( ALL_TESTS TESTS )
 set_tests_properties( ${ALL_TESTS} PROPERTIES
                       TIMEOUT 120
-                      FAIL_REGULAR_EXPRESSION "WARNING:" )
+                      FAIL_REGULAR_EXPRESSION "(WARNING:|runtime error)" )
 
 add_custom_target( test-ok  USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} -C $<CONFIG> -R "\"^(Test|MISSING)\"" )
 add_custom_target( test-all USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} -C $<CONFIG> -R "\"^(Test|MISSING|Faulty)\"" )
 
@@ -199,7 +199,7 @@ bool AdaptiveLoopFilter::isClipOrCrossedByVirtualBoundaries( const CodingStructu
     const CodingUnit* prevCtu = cs.getCU( prevCtuPos, CHANNEL_TYPE_LUMA );
     if( !CU::isAvailable( *currCtu,
                           *prevCtu,
-                          !loopFilterAcrossTilesEnabledFlag,
+                          !loopFilterAcrossSlicesEnabledFlag,
                           !loopFilterAcrossTilesEnabledFlag,
                           !loopFilterAcrossSubPicEnabledFlag ) )
     {
@@ -214,8 +214,8 @@ bool AdaptiveLoopFilter::isClipOrCrossedByVirtualBoundaries( const CodingStructu
     const CodingUnit* nextCtu = cs.getCU( nextCtuPos, CHANNEL_TYPE_LUMA );
     if( !CU::isAvailable( *currCtu,
                           *nextCtu,
-                          !pps->getLoopFilterAcrossSlicesEnabledFlag(),
-                          !pps->getLoopFilterAcrossTilesEnabledFlag(),
+                          !loopFilterAcrossSlicesEnabledFlag,
+                          !loopFilterAcrossTilesEnabledFlag,
                           !loopFilterAcrossSubPicEnabledFlag ) )
     {
       clipBottom = true;
@@ -229,8 +229,8 @@ bool AdaptiveLoopFilter::isClipOrCrossedByVirtualBoundaries( const CodingStructu
     const CodingUnit* prevCtu = cs.getCU( prevCtuPos, CHANNEL_TYPE_LUMA );
     if( !CU::isAvailable( *currCtu,
                           *prevCtu,
-                          !pps->getLoopFilterAcrossSlicesEnabledFlag(),
-                          !pps->getLoopFilterAcrossTilesEnabledFlag(),
+                          !loopFilterAcrossSlicesEnabledFlag,
+                          !loopFilterAcrossTilesEnabledFlag,
                           !loopFilterAcrossSubPicEnabledFlag ) )
     {
       clipLeft = true;
@@ -245,8 +245,8 @@ bool AdaptiveLoopFilter::isClipOrCrossedByVirtualBoundaries( const CodingStructu
 
     if( !CU::isAvailable( *currCtu,
                           *nextCtu,
-                          !pps->getLoopFilterAcrossSlicesEnabledFlag(),
-                          !pps->getLoopFilterAcrossTilesEnabledFlag(),
+                          !loopFilterAcrossSlicesEnabledFlag,
+                          !loopFilterAcrossTilesEnabledFlag,
                           !loopFilterAcrossSubPicEnabledFlag ) )
     {
       clipRight = true;
@@ -261,21 +261,21 @@ bool AdaptiveLoopFilter::isClipOrCrossedByVirtualBoundaries( const CodingStructu
     {
       const Position prevCtuPos( area.x - ctuSize, area.y - ctuSize );
       const CodingUnit *prevCtu = cs.getCU( prevCtuPos, CHANNEL_TYPE_LUMA );
-      if ( !pps->getLoopFilterAcrossSlicesEnabledFlag() && !CU::isSameSlice( *currCtu, *prevCtu ) )
+      if ( !loopFilterAcrossSlicesEnabledFlag && !CU::isSameSlice( *currCtu, *prevCtu ) )
       {
         rasterSliceAlfPad = 1;
       }
     }
   }
 
-  if ( !clipBottom && !clipRight && restrictSlices )
+  if( !clipBottom && !clipRight && restrictSlices )
   {
     //bottom-right CTU
     if ( area.x + ctuSize < cs.pcv->lumaWidth && area.y + ctuSize < cs.pcv->lumaHeight )
     {
       const Position nextCtuPos( area.x + ctuSize, area.y + ctuSize );
       const CodingUnit *nextCtu = cs.getCU( nextCtuPos, CHANNEL_TYPE_LUMA );
-      if ( !pps->getLoopFilterAcrossSlicesEnabledFlag() && !CU::isSameSlice( *currCtu, *nextCtu ) )
+      if ( !loopFilterAcrossSlicesEnabledFlag && !CU::isSameSlice( *currCtu, *nextCtu ) )
       {
         rasterSliceAlfPad += 2;
       }
@@ -460,16 +460,17 @@ void AdaptiveLoopFilter::prepareCTU( CodingStructure &cs, unsigned col, unsigned
 void AdaptiveLoopFilter::processCTU( CodingStructure & cs, unsigned col, unsigned line, int tid, const ChannelType chType )
 {
   PROFILER_SCOPE_AND_STAGE_EXT( 1, g_timeProfiler, P_ALF, cs, CH_L );
-  PelUnitBuf recYuv = cs.getRecoBuf();
 
   const UnitArea ctuArea( getCtuArea( cs, col, line, true ) );
+  CPelUnitBuf    recYuv = cs.getRecoBuf().subBuf( ctuArea );
+  PelUnitBuf     dstYuv = m_alfBuf.subBuf( ctuArea );
 
   const unsigned ctuIdx  = line * cs.pcv->widthInCtus + col;
   CtuAlfData currAlfData = cs.getCtuData( col, line ).alfParam;
   currAlfData.alfCtuEnableFlag[1] += currAlfData.ccAlfFilterControl[0] > 0 ? 2 : 0;
   currAlfData.alfCtuEnableFlag[2] += currAlfData.ccAlfFilterControl[1] > 0 ? 2 : 0;
 
-  filterCTU( recYuv.subBuf( ctuArea ), m_alfBuf.subBuf( ctuArea ), currAlfData, cs.picture->slices[0]->getClpRngs(), chType, cs, ctuIdx, ctuArea.lumaPos(), tid );
+  filterCTU( recYuv, dstYuv, currAlfData, cs.picture->slices[0]->getClpRngs(), chType, cs, ctuIdx, ctuArea.lumaPos(), tid );
 }
 
 bool AdaptiveLoopFilter::getAlfSkipPic( const CodingStructure & cs )
@@ -489,7 +490,7 @@ bool AdaptiveLoopFilter::getAlfSkipPic( const CodingStructure & cs )
 }
 
 void AdaptiveLoopFilter::filterAreaLuma( const CPelUnitBuf& srcBuf,
-                                         const PelUnitBuf&  dstBuf,
+                                               PelUnitBuf&  dstBuf,
                                          const Area&        blk,
                                          const Slice*       slice,
                                          const APS* const*  aps,
@@ -535,7 +536,7 @@ void AdaptiveLoopFilter::filterAreaLuma( const CPelUnitBuf& srcBuf,
 }
 
 void AdaptiveLoopFilter::filterAreaChroma( const CPelUnitBuf& srcBuf,
-                                           const PelUnitBuf&  dstBuf,
+                                                 PelUnitBuf&  dstBuf,
                                            const Area&        blkChroma,
                                            const ComponentID  compID,
                                            const Slice*       slice,
@@ -575,7 +576,7 @@ void AdaptiveLoopFilter::filterAreaChroma( const CPelUnitBuf& srcBuf,
 }
 
 void AdaptiveLoopFilter::filterAreaChromaCc( const CPelUnitBuf& srcBuf,
-                                             const PelUnitBuf&  dstBuf,
+                                                   PelUnitBuf&  dstBuf,
                                              const Area&        blkLuma,
                                              const Area&        blkChroma,
                                              const ComponentID  compID,
@@ -601,7 +602,7 @@ void AdaptiveLoopFilter::filterAreaChromaCc( const CPelUnitBuf& srcBuf,
 }
 
 void AdaptiveLoopFilter::filterAreaChromaBothCc( const CPelUnitBuf& srcBuf,
-                                                 const PelUnitBuf&  dstBuf,
+                                                       PelUnitBuf&  dstBuf,
                                                  const Area&        blkLuma,
                                                  const Area&        blkChroma,
                                                  const Slice*       slice,
@@ -653,7 +654,7 @@ void AdaptiveLoopFilter::filterAreaChromaBothCc( const CPelUnitBuf& srcBuf,
 }
 
 void AdaptiveLoopFilter::filterCTU( const CPelUnitBuf&     srcBuf,
-                                    const PelUnitBuf&      dstBuf,
+                                           PelUnitBuf&     dstBuf,
                                     const CtuAlfData&      ctuAlfData,
                                     const ClpRngs&         clpRngs,
                                     const ChannelType      chType,
 
@@ -121,11 +121,11 @@ class AdaptiveLoopFilter
   static void deriveClassificationBlk( AlfClassifier *classifier, const CPelBuf& srcLuma, const Area& blk, const int shift, int vbCTUHeight, int vbPos );
   void ( *m_deriveClassificationBlk )( AlfClassifier *classifier, const CPelBuf& srcLuma, const Area& blk, const int shift, int vbCTUHeight, int vbPos );
 
-  void filterCTU                     ( const CPelUnitBuf& srcBuf, const PelUnitBuf& dstBuf, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs, const ChannelType chType, const CodingStructure& cs, int ctuIdx, Position ctuPos, int tid );
-  void filterAreaLuma                ( const CPelUnitBuf& srcBuf, const PelUnitBuf& dstBuf, const Area& blk, const Slice* slice, const APS* const* aps, const short filterSetIndex, const ClpRngs& clpRngs, const int tId );
-  void filterAreaChroma              ( const CPelUnitBuf& srcBuf, const PelUnitBuf& dstBuf, const Area& blkChroma, const ComponentID compID, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
-  void filterAreaChromaCc            ( const CPelUnitBuf& srcBuf, const PelUnitBuf& dstBuf, const Area& blkLuma, const Area& blkChroma, const ComponentID compID, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
-  void filterAreaChromaBothCc        ( const CPelUnitBuf& srcBuf, const PelUnitBuf& dstBuf, const Area& blkLuma, const Area& blkChroma, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
+  void filterCTU                     ( const CPelUnitBuf& srcBuf, PelUnitBuf& dstBuf, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs, const ChannelType chType, const CodingStructure& cs, int ctuIdx, Position ctuPos, int tid );
+  void filterAreaLuma                ( const CPelUnitBuf& srcBuf, PelUnitBuf& dstBuf, const Area& blk, const Slice* slice, const APS* const* aps, const short filterSetIndex, const ClpRngs& clpRngs, const int tId );
+  void filterAreaChroma              ( const CPelUnitBuf& srcBuf, PelUnitBuf& dstBuf, const Area& blkChroma, const ComponentID compID, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
+  void filterAreaChromaCc            ( const CPelUnitBuf& srcBuf, PelUnitBuf& dstBuf, const Area& blkLuma, const Area& blkChroma, const ComponentID compID, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
+  void filterAreaChromaBothCc        ( const CPelUnitBuf& srcBuf, PelUnitBuf& dstBuf, const Area& blkLuma, const Area& blkChroma, const Slice* slice, const APS* const* aps, const CtuAlfData& ctuAlfData, const ClpRngs& clpRngs );
 
   template<AlfFilterType filtType>
   static void filterBlk              ( const AlfClassifier *classifier, const PelUnitBuf &recDst, const CPelUnitBuf& recSrc, const Area& blk, const ComponentID compId, const short* filterSet, const short* fClipSet, const ClpRng& clpRng, int vbCTUHeight, int vbPos );
 
@@ -102,6 +102,7 @@ uint32_t InputBitstream::read (uint32_t uiNumberOfBits)
   m_numBitsRead += uiNumberOfBits;
 #endif
 
+  constexpr static uint64_t ONES = ~static_cast<uint64_t>( 0 );   // need to ensure 64 bits for the mask, because shift by 32 is UB for uint32_t
 
   /* NB, bits are extracted from the MSB of each byte. */
   uint32_t retval = 0;
@@ -111,7 +112,7 @@ uint32_t InputBitstream::read (uint32_t uiNumberOfBits)
      * n=3, len(H)=7:   -VVV HHHH, shift_down=4, mask=0xf8
      */
     retval = static_cast<uint32_t>( m_held_bits >> ( m_num_held_bits - uiNumberOfBits ) );
-    retval &= ~( ~0u << uiNumberOfBits );
+    retval &= ~( ONES << uiNumberOfBits );
     m_num_held_bits -= uiNumberOfBits;
 
     return retval;
@@ -127,7 +128,7 @@ uint32_t InputBitstream::read (uint32_t uiNumberOfBits)
     /* n=5, len(H)=3: ---- -VVV, mask=0x07, shift_up=5-3=2,
      * n=9, len(H)=3: ---- -VVV, mask=0x07, shift_up=9-3=6 */
     uiNumberOfBits -= m_num_held_bits;
-    retval = static_cast<uint32_t>( m_held_bits ) & ~( ~0u << m_num_held_bits );   // we can cast to 32 bits, because the held bits are the rightmost bits
+    retval = static_cast<uint32_t>( m_held_bits ) & ~( ONES << m_num_held_bits );   // we can cast to 32 bits, because the held bits are the rightmost bits
     retval <<= uiNumberOfBits;
   }
 
@@ -170,7 +171,7 @@ std::unique_ptr<InputBitstream> InputBitstream::extractSubstream( uint32_t uiNum
 {
   std::unique_ptr<InputBitstream> substream( new InputBitstream );
 
-  std::vector<uint8_t>& buf = substream->getFifo();
+  AlignedByteVec& buf = substream->getFifo();
   buf.reserve( ( uiNumBits + 7 ) / 8 + 1 );    // +1 because a zero byte might be added later
 
   const uint32_t uiNumBytes = uiNumBits / 8;   // don't round up here, because the remaing bits will be copied later
 
@@ -71,7 +71,7 @@ namespace vvdec
 class InputBitstream
 {
 private:
-  std::vector<uint8_t>  m_fifo;   /// FIFO for storage of complete bytes
+  AlignedByteVec        m_fifo;   /// FIFO for storage of complete bytes
   std::vector<uint32_t> m_emulationPreventionByteLocation;
 
   uint32_t m_fifo_idx = 0;   /// Read index into m_fifo
@@ -178,15 +178,16 @@ class InputBitstream
   void                         setEmulationPreventionByteLocation( const std::vector<uint32_t>& vec ) { m_emulationPreventionByteLocation = vec; }
   void                         clearEmulationPreventionByteLocation()                                 { m_emulationPreventionByteLocation.clear(); }
 
-  const std::vector<uint8_t>& getFifo() const { return m_fifo; }
-        std::vector<uint8_t>& getFifo()       { return m_fifo; }
-  void                        clearFifo()     { m_fifo.clear(); m_zeroByteAdded = false; }
+  const AlignedByteVec& getFifo() const { return m_fifo; }
+        AlignedByteVec& getFifo()       { return m_fifo; }
+  void                  clearFifo()     { m_fifo.clear(); m_zeroByteAdded = false; }
 
 private:
   inline void load_next_bits( int requiredBits )
   {
     uint32_t num_bytes_to_load = 8;
-    if UNLIKELY( m_fifo_idx + num_bytes_to_load > m_fifo.size() )
+    if UNLIKELY( m_fifo_idx + num_bytes_to_load > m_fifo.size()   // end of bitstream
+                 || ( m_fifo_idx & 0x7 ) != 0 )                   // unaligned read position (m_fifo should be aligned)
     {
       const int required_bytes = ( requiredBits + 7 ) >> 3;
       CHECK( m_fifo_idx + required_bytes > m_fifo.size(), "Exceeded FIFO size" );
@@ -196,23 +197,24 @@ class InputBitstream
       m_held_bits = 0;
       switch( num_bytes_to_load )
       {
-      case 8: m_held_bits =  static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 7 * 8 );
-      case 7: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 6 * 8 );
-      case 6: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 5 * 8 );
-      case 5: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 4 * 8 );
-      case 4: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 3 * 8 );
-      case 3: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 2 * 8 );
-      case 2: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 1 * 8 );
-      case 1: m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] );
+      default: num_bytes_to_load = 8;   // in the unaligned case num_bytes_to_load could be >8
+      case 8:  m_held_bits =  static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 7 * 8 );
+      case 7:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 6 * 8 );
+      case 6:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 5 * 8 );
+      case 5:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 4 * 8 );
+      case 4:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 3 * 8 );
+      case 3:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 2 * 8 );
+      case 2:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] ) << ( 1 * 8 );
+      case 1:  m_held_bits |= static_cast<uint64_t>( m_fifo[m_fifo_idx++] );
       }
     }
     else
     {
+      CHECKD( reinterpret_cast<intptr_t>( &m_fifo[m_fifo_idx] ) & 0x7, "bistream read pos unaligned" );
       m_held_bits = simde_bswap64( *reinterpret_cast<uint64_t*>( &m_fifo[m_fifo_idx] ) );
       m_fifo_idx += num_bytes_to_load;
     }
 
-    /* resolve remainder bits */
     m_num_held_bits = num_bytes_to_load * 8;
   }
 };
 
@@ -150,7 +150,7 @@ struct AreaBuf : public Size
   void fill                 ( const T &val );
   void memset               ( const int val );
 
-  void copyFrom             ( const AreaBuf<const T> &other ) const;
+  void copyFrom             ( const AreaBuf<const T> &other );
 
   void reconstruct          ( const AreaBuf<const T> &pred, const AreaBuf<const T> &resi, const ClpRng& clpRng);
 
@@ -396,7 +396,7 @@ void AreaBuf<T>::memset( const int val )
 
 #if ENABLE_SIMD_OPT_BUFFER
 template<typename T>
-void AreaBuf<T>::copyFrom( const AreaBuf<const T> &other ) const
+void AreaBuf<T>::copyFrom( const AreaBuf<const T> &other )
 {
 #if !defined(__GNUC__) || __GNUC__ > 5
   static_assert( std::is_trivially_copyable<T>::value, "Type T is not trivially_copyable" );
@@ -406,7 +406,7 @@ void AreaBuf<T>::copyFrom( const AreaBuf<const T> &other ) const
 }
 #else
 template<typename T>
-void AreaBuf<T>::copyFrom( const AreaBuf<const T> &other ) const
+void AreaBuf<T>::copyFrom( const AreaBuf<const T> &other )
 {
 #if !defined(__GNUC__) || __GNUC__ > 5
   static_assert( std::is_trivially_copyable<T>::value, "Type T is not trivially_copyable" );
@@ -711,7 +711,7 @@ struct UnitBuf
   const AreaBuf<T>& Cr() const { return bufs[2]; }
 
   void fill                 ( const T &val );
-  void copyFrom             ( const UnitBuf<const T> &other ) const;
+  void copyFrom             ( const UnitBuf<const T> &other );
   void reconstruct          ( const UnitBuf<const T> &pred, const UnitBuf<const T> &resi, const ClpRngs& clpRngs );
   void subtract             ( const UnitBuf<const T> &other );
   void addWeightedAvg       ( const UnitBuf<      T> &other1, const UnitBuf<      T> &other2, const ClpRngs& clpRngs, const uint8_t bcwIdx = BCW_DEFAULT, const bool chromaOnly = false, const bool lumaOnly = false);
@@ -751,7 +751,7 @@ void UnitBuf<T>::fill( const T &val )
 }
 
 template<typename T>
-void UnitBuf<T>::copyFrom( const UnitBuf<const T> &other ) const
+void UnitBuf<T>::copyFrom( const UnitBuf<const T> &other )
 {
   CHECK( chromaFormat != other.chromaFormat, "Incompatible formats" );