huggingface · mht-sharma · Feb 7, 2023 · Feb 3, 2023 · Feb 3, 2023 · Feb 3, 2023
diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx
@@ -62,3 +62,7 @@ specific language governing permissions and limitations under the License.
 ## ORTModelForTokenClassification
 
 [[autodoc]] onnxruntime.ORTModelForTokenClassification
+
+## ORTModelForVision2Seq
+
+[[autodoc]] onnxruntime.ORTModelForVision2Seq
diff --git a/docs/source/onnxruntime/usage_guides/pipelines.mdx b/docs/source/onnxruntime/usage_guides/pipelines.mdx
@@ -24,6 +24,7 @@ Currently the supported tasks are:
 * `translation`
 * `image-classification`
 * `automatic-speech-recognition`
+* `image-to-text`
 
 ## Optimum pipeline usage
 

diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py
@@ -39,7 +39,7 @@
  "ORTModelForSequenceClassification",
  "ORTModelForTokenClassification",
  ],
- "modeling_seq2seq": ["ORTModelForSeq2SeqLM", "ORTModelForSpeechSeq2Seq"],
+ "modeling_seq2seq": ["ORTModelForSeq2SeqLM", "ORTModelForSpeechSeq2Seq", "ORTModelForVision2Seq"],
  "modeling_decoder": ["ORTModelForCausalLM"],
  "optimization": ["ORTOptimizer"],
  "quantization": ["ORTQuantizer"],

diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py
@@ -115,39 +115,40 @@ def __init__(
  if len(self.key_value_output_names) == 0:
  self.key_value_output_names = [key for key in self.output_names if "key_values" in key]
 
- if len(self.key_value_output_names) == 0:
+ if self.parent_model.use_cache is True and len(self.key_value_output_names) == 0:
  raise RuntimeError("Could not find the past key values in the provided model.")
 
- # Attributes useful when computing the past key/values output shapes.
- self.expected_key_symbolic_shape = None
- self.expected_value_symbolic_shape = None
- for output in self.session.get_outputs():
- if ".key" in output.name:
- self.expected_key_symbolic_shape = output.shape
- elif ".value" in output.name:
- self.expected_value_symbolic_shape = output.shape
- # To handle the old case when past_key_values were following the format: past_key_values_{idx}
- elif "key_values" in output.name:
- if self.expected_key_symbolic_shape is None:
+ if len(self.key_value_output_names) != 0:
+ # Attributes useful when computing the past key/values output shapes.
+ self.expected_key_symbolic_shape = None
+ self.expected_value_symbolic_shape = None
+ for output in self.session.get_outputs():
+ if ".key" in output.name:
  self.expected_key_symbolic_shape = output.shape
- else:
+ elif ".value" in output.name:
  self.expected_value_symbolic_shape = output.shape
- if self.expected_key_symbolic_shape is not None and self.expected_value_symbolic_shape is not None:
- break
-
- self.key_sequence_length_idx = -2
- if (
- isinstance(self.expected_key_symbolic_shape[-1], str)
- and "sequence_length" in self.expected_key_symbolic_shape[-1]
- ):
- self.key_sequence_length_idx = -1
-
- self.value_sequence_length_idx = -2
- if (
- isinstance(self.expected_value_symbolic_shape[-1], str)
- and "sequence_length" in self.expected_value_symbolic_shape[-1]
- ):
- self.value_sequence_length_idx = -1
+ # To handle the old case when past_key_values were following the format: past_key_values_{idx}
+ elif "key_values" in output.name:
+ if self.expected_key_symbolic_shape is None:
+ self.expected_key_symbolic_shape = output.shape
+ else:
+ self.expected_value_symbolic_shape = output.shape
+ if self.expected_key_symbolic_shape is not None and self.expected_value_symbolic_shape is not None:
+ break
+
+ self.key_sequence_length_idx = -2
+ if (
+ isinstance(self.expected_key_symbolic_shape[-1], str)
+ and "sequence_length" in self.expected_key_symbolic_shape[-1]
+ ):
+ self.key_sequence_length_idx = -1
+
+ self.value_sequence_length_idx = -2
+ if (
+ isinstance(self.expected_value_symbolic_shape[-1], str)
+ and "sequence_length" in self.expected_value_symbolic_shape[-1]
+ ):
+ self.value_sequence_length_idx = -1
 
  def compute_past_key_values_output_shapes(
  self, input_ids: torch.Tensor, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None

diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py
@@ -58,7 +58,7 @@
  attention_mask (`torch.LongTensor`, *optional*):
  Mask to avoid performing attention on padding token indices, of shape
  `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`.
- past_key_values (`tuple(tuple(torch.FloatTensor), *optional*)`
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
  Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
  The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
  `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
@@ -71,7 +71,7 @@
  attention_mask (`torch.LongTensor`):
  Mask to avoid performing attention on padding token indices, of shape
  `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`.
- past_key_values (`tuple(tuple(torch.FloatTensor), *optional*)`
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, defaults to `None`):
  Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding.
  The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape
  `(batch_size, num_heads, sequence_length, embed_size_per_head)`.