
Feat (graph/gpfq): compression with random projection (#964)
fabianandresgrob authored May 31, 2024
1 parent 8c71e08 commit 0f60606
Showing 3 changed files with 71 additions and 12 deletions.
55 changes: 47 additions & 8 deletions src/brevitas/graph/gpfq.py
@@ -2,10 +2,14 @@
# SPDX-License-Identifier: BSD-3-Clause

from copy import deepcopy
import math
from math import pi
from typing import Callable, List, Optional

import numpy as np
import torch
from torch.fft import fft
from torch.fft import fftn
import torch.nn as nn
import unfoldNd

@@ -19,6 +23,24 @@
import brevitas.nn as qnn


def random_projection(
float_input: torch.Tensor, quantized_input: torch.Tensor, compression_rate: float):
# use random projection to reduce dimensionality
n = quantized_input.size(1)
target_dim = int(compression_rate * n)
dev = float_input.device
# create Gaussian random matrix
R = torch.normal(mean=0.0, std=1. / math.sqrt(n), size=(target_dim, n), device=dev)
quantized_input = torch.transpose(quantized_input, 1, 2) @ R.T
float_input = torch.transpose(float_input, 1, 2) @ R.T
del R
# transpose back to the original layout
quantized_input = torch.transpose(quantized_input, 1, 2)
float_input = torch.transpose(float_input, 1, 2)

return float_input, quantized_input
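
A minimal sketch of what this helper does, for orientation (the import path and tensor shapes are assumptions for illustration): R is a Gaussian matrix with i.i.d. N(0, 1/n) entries, and the two matrix products shrink dim 1 of both inputs from n to int(compression_rate * n), bounding the memory occupied by the collected inputs.

import torch

from brevitas.graph.gpfq import random_projection  # assumed import path

# Illustrative shapes: 1 group, 2048 collected samples, 64 input channels.
float_input = torch.randn(1, 2048, 64)
quantized_input = torch.randn(1, 2048, 64)

f_proj, q_proj = random_projection(float_input, quantized_input, compression_rate=0.25)
print(f_proj.shape, q_proj.shape)
# torch.Size([1, 512, 64]) torch.Size([1, 512, 64]) -- dim 1: 2048 -> int(0.25 * 2048) = 512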


class gpfq_mode(gpxq_mode):
"""
Apply GPFQ algorithm.
@@ -64,7 +86,8 @@ def __init__(
act_order: bool = False,
use_gpfa2q: bool = False,
accumulator_bit_width: Optional[int] = None,
a2q_layer_filter_fnc: Optional[Callable[[nn.Module], bool]] = lambda x: True) -> None:
a2q_layer_filter_fnc: Optional[Callable[[nn.Module], bool]] = lambda x: True,
compression_rate: Optional[float] = 0.0) -> None:
if not inplace:
model = deepcopy(model)
super().__init__(
@@ -83,6 +106,11 @@ def __init__(
self.accumulator_bit_width = accumulator_bit_width
self.a2q_layer_filter_fnc = a2q_layer_filter_fnc # returns True when GPFA2Q should be used

# compression rate for random projection; must lie in [0, 1]
self.compression_rate = compression_rate
if self.compression_rate < 0.0 or self.compression_rate > 1.0:
raise ValueError('Compression rate for random projection must be between 0 and 1.')

def catch_stopfwd(self, *args, **kwargs):
# Collect quant input
try:
@@ -127,7 +155,8 @@ def initialize_module_optimizer(
act_order=act_order,
len_parallel_layers=len_parallel_layers,
create_weight_orig=create_weight_orig,
p=self.p)
p=self.p,
compression_rate=self.compression_rate)
else:
return GPFA2Q(
layer=layer,
@@ -136,22 +165,26 @@ def initialize_module_optimizer(
len_parallel_layers=len_parallel_layers,
create_weight_orig=create_weight_orig,
p=self.p,
accumulator_bit_width=self.accumulator_bit_width)
accumulator_bit_width=self.accumulator_bit_width,
compression_rate=self.compression_rate)


class GPFQ(GPxQ):
"""
Based on https://github.com/YixuanSeanZhou/Quantized_Neural_Nets/tree/main
"""

def __init__(self, layer, name, act_order, len_parallel_layers, create_weight_orig, p) -> None:
def __init__(
self, layer, name, act_order, len_parallel_layers, create_weight_orig, p,
compression_rate) -> None:

super().__init__(layer, name, act_order, len_parallel_layers, create_weight_orig)

self.float_input = None
self.quantized_input = None
self.index_computed = False
self.p = p
self.compression_rate = compression_rate

def update_batch(self, module, input, current_layer):
if self.disable_pre_forward_hook:
@@ -246,10 +279,12 @@ def single_layer_update(self):
weight = weight.transpose(1, 0) # This performs a view
weight = weight.flatten(1)
weight = weight.view(self.groups, -1, weight.shape[-1]) # [Groups, OC/Groups, IC]
U = torch.zeros(
weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, dtype=dtype)
if self.compression_rate > 0.0:
self.float_input, self.quantized_input = random_projection(self.float_input, self.quantized_input, self.compression_rate)
self.float_input = self.float_input.to(dev)
self.quantized_input = self.quantized_input.to(dev)
U = torch.zeros(
weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, dtype=dtype)
# We don't need the full Hessian, just its diagonal
self.H_diag = self.quantized_input.transpose(2, 1).square().sum(
2) # summing over Batch dimension
@@ -300,15 +335,17 @@ def __init__(
len_parallel_layers,
create_weight_orig,
accumulator_bit_width,
p) -> None:
p,
compression_rate) -> None:
GPFQ.__init__(
self,
layer=layer,
name=name,
act_order=act_order,
len_parallel_layers=len_parallel_layers,
create_weight_orig=create_weight_orig,
p=p)
p=p,
compression_rate=compression_rate)
self.accumulator_bit_width = accumulator_bit_width
assert self.accumulator_bit_width is not None

@@ -329,6 +366,8 @@ def single_layer_update(self):
weight = weight.view(self.groups, -1, weight.shape[-1]) # [Groups, OC/Groups, IC]
U = torch.zeros(
weight.shape[0], weight.shape[1], self.float_input.shape[1], device=dev, dtype=dtype)
if self.compression_rate > 0.0:
self.float_input, self.quantized_input = random_projection(self.float_input, self.quantized_input, self.compression_rate)
self.float_input = self.float_input.to(dev)
self.quantized_input = self.quantized_input.to(dev)

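Downstream, the new keyword threads through gpfq_mode. A hedged usage sketch, mirroring the calibration loop in ptq_common.py below; model is a prepared quantized model and calib_loader yields (images, target) batches, both assumed to exist:

from brevitas.graph.gpfq import gpfq_mode

with gpfq_mode(model,
               use_quant_activations=True,
               act_order=False,
               compression_rate=0.5) as gpfq:  # 0.5 is illustrative; 0.0 disables projection
    gpfq_model = gpfq.model
    for _ in range(gpfq.num_layers):
        for images, _ in calib_loader:
            gpfq_model(images)
        gpfq.update()
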
12 changes: 10 additions & 2 deletions src/brevitas_examples/imagenet_classification/ptq/ptq_common.py
@@ -535,7 +535,14 @@ def apply_gptq(calib_loader, model, act_order=False):
gptq.update()


def apply_gpfq(calib_loader, model, act_order, p=1.0, use_gpfa2q=False, accumulator_bit_width=None):
def apply_gpfq(
calib_loader,
model,
act_order,
p=1.0,
use_gpfa2q=False,
accumulator_bit_width=None,
compression_rate=0.0):
model.eval()
dtype = next(model.parameters()).dtype
device = next(model.parameters()).device
@@ -545,7 +552,8 @@ def apply_gpfq(calib_loader, model, act_order, p=1.0, use_gpfa2q=False, accumula
use_quant_activations=True,
act_order=act_order,
use_gpfa2q=use_gpfa2q,
accumulator_bit_width=accumulator_bit_width) as gpfq:
accumulator_bit_width=accumulator_bit_width,
compression_rate=compression_rate) as gpfq:
gpfq_model = gpfq.model
for i in tqdm(range(gpfq.num_layers)):
for i, (images, target) in enumerate(calib_loader):
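Since the default of 0.0 preserves the previous behavior, enabling random projection through this wrapper is a one-keyword change. A sketch, with calib_loader and model as above:

# compression_rate=0.0 (the default) skips random projection entirely.
apply_gpfq(calib_loader, model, act_order=False, p=1.0, compression_rate=0.5)
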
16 changes: 14 additions & 2 deletions src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py
@@ -239,6 +239,12 @@ def parse_type(v, default_type):
help=
'Split Ratio for Channel Splitting. When set to 0.0, Channel Splitting will not be applied. (default: 0.0)'
)
parser.add_argument(
'--compression-rate',
default=0.0,
type=float,
help='Compression rate for random projection. When set to 0.0, random projection will not be applied. (default: 0.0)'
)
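On the command line this composes with the existing GPFQ switch, e.g. --gpfq --compression-rate 0.5 when invoking ptq_evaluate.py (0.5 is an illustrative value; the default 0.0 leaves random projection off).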
add_bool_arg(parser, 'gptq', default=False, help='GPTQ (default: disabled)')
add_bool_arg(parser, 'gpfq', default=False, help='GPFQ (default: disabled)')
add_bool_arg(parser, 'gpfa2q', default=False, help='GPFA2Q (default: disabled)')
@@ -426,7 +432,12 @@ def main():

if args.gpfq:
print("Performing GPFQ:")
apply_gpfq(calib_loader, quant_model, p=args.gpfq_p, act_order=args.gpxq_act_order)
apply_gpfq(
calib_loader,
quant_model,
p=args.gpfq_p,
act_order=args.gpxq_act_order,
compression_rate=args.compression_rate)

if args.gpfa2q:
print("Performing GPFA2Q:")
@@ -436,7 +447,8 @@
p=args.gpfq_p,
act_order=args.gpxq_act_order,
use_gpfa2q=args.gpfa2q,
accumulator_bit_width=args.accumulator_bit_width)
accumulator_bit_width=args.accumulator_bit_width,
compression_rate=args.compression_rate)

if args.gptq:
print("Performing GPTQ:")
