From 490a3c95f92c727eed4d60959460b3c9082db67a Mon Sep 17 00:00:00 2001
From: Monty Montgomery <xiphmont@gmail.com>
Date: Thu, 14 May 2020 03:10:19 -0400
Subject: [PATCH 1/2] Widen useful range of BitWriter's write_quniform()

The versions of write_quniform() used by the arithmetic packer and the uncompressed bitwriter vary considerably.  For some reason, the BitWriter uses a much 'narrower' implementation that overflows with inputs of more than a few bits.

This patch replaces the BitWriter's implementation with one like that used in the arithmetic packer.  This allows its use in coding non-uniform tile header fields.
---
 src/ec.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/ec.rs b/src/ec.rs
index 12af200091..5be36dc0a4 100644
--- a/src/ec.rs
+++ b/src/ec.rs
@@ -804,17 +804,17 @@ impl<W: io::Write> BCodeWriter for BitWriter<W, BigEndian> {
     }
   }
   fn write_quniform(&mut self, n: u16, v: u16) -> Result<(), std::io::Error> {
-    /* Encodes a value v in [0, n-1] quasi-uniformly */
-    if n <= 1 {
-      return Ok(());
-    };
-    let l = 31 ^ ((n - 1) + 1).leading_zeros();
-    let m = (1 << l) - n;
-    if v < m {
-      self.write(l - 1, v)
+    if n > 1 {
+      let l = msb(n as i32) as u8 + 1;
+      let m = (1 << l) - n;
+      if v < m {
+        self.write(l as u32 - 1, v)
+      } else {
+        self.write(l as u32 - 1, m + ((v - m) >> 1))?;
+        self.write(1, (v - m) & 1)
+      }
     } else {
-      self.write(l - 1, m + ((v - m) >> 1))?;
-      self.write_bit(((v - m) & 1) != 0)
+      Ok(())
     }
   }
   fn write_subexpfin(

From f3227485d059db8b5cb26dcc309ce1c4fbe9180e Mon Sep 17 00:00:00 2001
From: Monty Montgomery <xiphmont@gmail.com>
Date: Thu, 14 May 2020 03:14:27 -0400
Subject: [PATCH 2/2] Fix for #2212; Crash when using 4 tiles for 1080p 4:2:2
 input

When doing loop filter RDO inline with the rest of the tile coding,
LRUs must align to tile boundaries.  An unexpected corner case means
that chroma LRUs must have an even superblock width in 4:2:2 video, as
LRUs must always be square.  As a result, that means tiles must also
have an even superblock width.

As tile width must be adjusted in this case, it also means we can't
use the spec's 'tile uniform spacing' mode, which would produce
odd-superblock-width tiles in, e.g., 1080p 4:2:2 video. This patch also
implements explicit per-tile sizing in the frame OBU header.
---
 src/encoder.rs      |  3 +-
 src/header.rs       | 77 ++++++++++++++++++++++++++++++++++++---------
 src/tiling/tiler.rs | 38 +++++++++++++++++++++-
 3 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/src/encoder.rs b/src/encoder.rs
index b4b0c8dd9c..b60163d88f 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -604,6 +604,7 @@ impl<T: Pixel> FrameInvariants<T> {
       frame_rate,
       TilingInfo::tile_log2(1, config.tile_cols).unwrap(),
       TilingInfo::tile_log2(1, config.tile_rows).unwrap(),
+      sequence.chroma_sampling == ChromaSampling::Cs422,
     );
 
     if config.tiles > 0 {
@@ -619,6 +620,7 @@ impl<T: Pixel> FrameInvariants<T> {
           frame_rate,
           tile_cols_log2,
           tile_rows_log2,
+          sequence.chroma_sampling == ChromaSampling::Cs422,
         );
 
         if tiling.rows * tiling.cols >= config.tiles {
@@ -2621,7 +2623,6 @@ fn encode_partition_topdown<T: Pixel, W: Writer>(
     let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
     cw.write_partition(w, tile_bo, partition, bsize);
   }
-
   match partition {
     PartitionType::PARTITION_NONE => {
       let part_decision = if !rdo_output.part_modes.is_empty() {
diff --git a/src/header.rs b/src/header.rs
index 3677c47e9f..102dccf1fb 100644
--- a/src/header.rs
+++ b/src/header.rs
@@ -12,6 +12,8 @@ use crate::context::*;
 use crate::ec::*;
 use crate::lrf::*;
 use crate::partition::*;
+use crate::tiling::MAX_TILE_WIDTH;
+use crate::util::Fixed;
 use crate::util::Pixel;
 
 use crate::DeblockState;
@@ -670,25 +672,70 @@ impl<W: io::Write> UncompressedHeader for BitWriter<W, BigEndian> {
       self.write_bit(fi.disable_frame_end_update_cdf)?;
     }
 
-    // tile <https://aomediacodec.github.io/av1-spec/#tile-info-syntax>
-    self.write_bit(true)?; // uniform_tile_spacing_flag
+    // tile
+    // <https://aomediacodec.github.io/av1-spec/#tile-info-syntax>
 
+    // Can we use the uniform spacing tile syntax?  'Uniform spacing'
+    // is a slight misnomer; it's more constrained than just a uniform
+    // spacing.
     let ti = &fi.tiling;
 
-    let cols_ones = ti.tile_cols_log2 - ti.min_tile_cols_log2;
-    for _ in 0..cols_ones {
-      self.write_bit(true);
-    }
-    if ti.tile_cols_log2 < ti.max_tile_cols_log2 {
-      self.write_bit(false);
-    }
+    if fi.sb_width.align_power_of_two_and_shift(ti.tile_cols_log2)
+      == ti.tile_width_sb
+      && fi.sb_height.align_power_of_two_and_shift(ti.tile_rows_log2)
+        == ti.tile_height_sb
+    {
+      // yes; our actual tile width/height setting (which is always
+      // currently uniform) also matches the constrained width/height
+      // calculation implicit in the uniform spacing flag.
 
-    let rows_ones = ti.tile_rows_log2 - ti.min_tile_rows_log2;
-    for _ in 0..rows_ones {
-      self.write_bit(true);
-    }
-    if ti.tile_rows_log2 < ti.max_tile_rows_log2 {
-      self.write_bit(false);
+      self.write_bit(true)?; // uniform_tile_spacing_flag
+
+      let cols_ones = ti.tile_cols_log2 - ti.min_tile_cols_log2;
+      for _ in 0..cols_ones {
+        self.write_bit(true);
+      }
+      if ti.tile_cols_log2 < ti.max_tile_cols_log2 {
+        self.write_bit(false);
+      }
+
+      let rows_ones = ti.tile_rows_log2 - ti.min_tile_rows_log2;
+      for _ in 0..rows_ones {
+        self.write_bit(true);
+      }
+      if ti.tile_rows_log2 < ti.max_tile_rows_log2 {
+        self.write_bit(false);
+      }
+    } else {
+      self.write_bit(false)?; // uniform_tile_spacing_flag
+      let mut sofar = 0;
+      let mut widest_tile_sb = 0;
+      for _ in 0..ti.cols {
+        let max = (MAX_TILE_WIDTH
+          >> if fi.sequence.use_128x128_superblock { 7 } else { 6 })
+        .min(fi.sb_width - sofar) as u16;
+        let this_sb_width = ti.tile_width_sb.min(fi.sb_width - sofar);
+        self.write_quniform(max, (this_sb_width - 1) as u16);
+        sofar += this_sb_width;
+        widest_tile_sb = widest_tile_sb.max(this_sb_width);
+      }
+
+      let max_tile_area_sb = if ti.min_tiles_log2 > 0 {
+        (fi.sb_height * fi.sb_width) >> (ti.min_tiles_log2 + 1)
+      } else {
+        fi.sb_height * fi.sb_width
+      };
+
+      let max_tile_height_sb = (max_tile_area_sb / widest_tile_sb).max(1);
+
+      sofar = 0;
+      for i in 0..ti.rows {
+        let max = max_tile_height_sb.min(fi.sb_height - sofar) as u16;
+        let this_sb_height = ti.tile_height_sb.min(fi.sb_height - sofar);
+
+        self.write_quniform(max, (this_sb_height - 1) as u16);
+        sofar += this_sb_height;
+      }
     }
 
     let tiles_log2 = ti.tile_cols_log2 + ti.tile_rows_log2;
diff --git a/src/tiling/tiler.rs b/src/tiling/tiler.rs
index 2dab411db2..daa3a5e9be 100644
--- a/src/tiling/tiler.rs
+++ b/src/tiling/tiler.rs
@@ -44,12 +44,14 @@ pub struct TilingInfo {
   pub min_tile_rows_log2: usize,
   pub max_tile_rows_log2: usize,
   pub sb_size_log2: usize,
+  pub min_tiles_log2: usize,
 }
 
 impl TilingInfo {
   pub fn from_target_tiles(
     sb_size_log2: usize, frame_width: usize, frame_height: usize,
     frame_rate: f64, tile_cols_log2: usize, tile_rows_log2: usize,
+    is_422_p: bool,
   ) -> Self {
     // <https://aomediacodec.github.io/av1-spec/#tile-info-syntax>
 
@@ -87,7 +89,26 @@ impl TilingInfo {
 
     let tile_cols_log2 =
       tile_cols_log2.max(min_tile_cols_log2).min(max_tile_cols_log2);
-    let tile_width_sb = sb_cols.align_power_of_two_and_shift(tile_cols_log2);
+    let tile_width_sb_pre =
+      sb_cols.align_power_of_two_and_shift(tile_cols_log2);
+
+    // If this is 4:2:2, our UV horizontal is subsampled but not our
+    // vertical.  Loop Restoration Units must be square, so they
+    // will always have an even number of horizontal superblocks. For
+    // tiles and LRUs to align, tile_width_sb must be even in 4:2:2
+    // video.
+
+    // This is only relevant when doing loop restoration RDO inline
+    // with block/superblock encoding, that is, where tiles are
+    // relevant.  If (when) we introduce optionally delaying loop-filter
+    // encode to after the partitioning loop, we won't need to make
+    // any 4:2:2 adjustment.
+
+    let tile_width_sb = if is_422_p {
+      (tile_width_sb_pre + 1) >> 1 << 1
+    } else {
+      tile_width_sb_pre
+    };
 
     let min_tile_rows_log2 = if min_tiles_log2 > tile_cols_log2 {
       min_tiles_log2 - tile_cols_log2
@@ -123,6 +144,7 @@ impl TilingInfo {
       min_tile_rows_log2,
       max_tile_rows_log2,
       sb_size_log2,
+      min_tiles_log2,
     }
   }
 
@@ -240,6 +262,7 @@ pub mod test {
       frame_rate,
       0,
       0,
+      false,
     );
     assert_eq!(1, ti.cols);
     assert_eq!(1, ti.rows);
@@ -253,6 +276,7 @@ pub mod test {
       frame_rate,
       1,
       1,
+      false,
     );
     assert_eq!(2, ti.cols);
     assert_eq!(2, ti.rows);
@@ -266,6 +290,7 @@ pub mod test {
       frame_rate,
       2,
       2,
+      false,
     );
     assert_eq!(3, ti.cols);
     assert_eq!(3, ti.rows);
@@ -280,6 +305,7 @@ pub mod test {
       frame_rate,
       10,
       8,
+      false,
     );
     assert_eq!(3, ti.cols);
     assert_eq!(3, ti.rows);
@@ -293,6 +319,7 @@ pub mod test {
       frame_rate,
       0,
       0,
+      false,
     );
     assert_eq!(1, ti.cols);
     assert_eq!(1, ti.rows);
@@ -336,6 +363,7 @@ pub mod test {
         frame_rate,
         1,
         1,
+        false,
       );
       let mut iter = ti.tile_iter_mut(&mut fs, &mut fb);
       assert_eq!(4, iter.len());
@@ -359,6 +387,7 @@ pub mod test {
         frame_rate,
         2,
         2,
+        false,
       );
       let mut iter = ti.tile_iter_mut(&mut fs, &mut fb);
       assert_eq!(9, iter.len());
@@ -406,6 +435,7 @@ pub mod test {
       fi.config.frame_rate(),
       2,
       2,
+      false,
     );
     let iter = ti.tile_iter_mut(&mut fs, &mut fb);
     let tile_states = iter.map(|ctx| ctx.ts).collect::<Vec<_>>();
@@ -484,6 +514,7 @@ pub mod test {
       fi.config.frame_rate(),
       2,
       2,
+      false,
     );
     let iter = ti.tile_iter_mut(&mut fs, &mut fb);
     let tbs = iter.map(|ctx| ctx.tb).collect::<Vec<_>>();
@@ -524,6 +555,7 @@ pub mod test {
         fi.config.frame_rate(),
         2,
         2,
+        false,
       );
       let iter = ti.tile_iter_mut(&mut fs, &mut fb);
       let mut tile_states = iter.map(|ctx| ctx.ts).collect::<Vec<_>>();
@@ -588,6 +620,7 @@ pub mod test {
       fi.config.frame_rate(),
       2,
       2,
+      false,
     );
     let iter = ti.tile_iter_mut(&mut fs, &mut fb);
     let mut tile_states = iter.map(|ctx| ctx.ts).collect::<Vec<_>>();
@@ -628,6 +661,7 @@ pub mod test {
         fi.config.frame_rate(),
         1,
         1,
+        false,
       );
       let iter = ti.tile_iter_mut(&mut fs, &mut fb);
       let mut tile_states = iter.map(|ctx| ctx.ts).collect::<Vec<_>>();
@@ -690,6 +724,7 @@ pub mod test {
         fi.config.frame_rate(),
         2,
         2,
+        false,
       );
       let iter = ti.tile_iter_mut(&mut fs, &mut fb);
       let mut tile_states = iter.map(|ctx| ctx.ts).collect::<Vec<_>>();
@@ -734,6 +769,7 @@ pub mod test {
         fi.config.frame_rate(),
         2,
         2,
+        false,
       );
       let iter = ti.tile_iter_mut(&mut fs, &mut fb);
       let mut tbs = iter.map(|ctx| ctx.tb).collect::<Vec<_>>();