feat: Add support for V1 and V2 classification models for the V1Beta2 API (#697)

* feat: Add support for V1 and V2 classification models for the V1 API

PiperOrigin-RevId: 475599241

Source-Link: googleapis/googleapis@05b99f9

Source-Link: googleapis/googleapis-gen@3dcdbed
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiM2RjZGJlZDhkOTY4ZjYzNGJlMGEyZDMxMDcyMzdkMjMyZWU4YjA2MSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* feat: Add support for V1 and V2 classification models for the V1Beta2 API

PiperOrigin-RevId: 475604619

Source-Link: googleapis/googleapis@044a15c

Source-Link: googleapis/googleapis-gen@410020a
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiNDEwMDIwYWY5MzRjNzI0OGY3ODA0NzcwZDZmOGVjNDU3MWJmYTU1MSJ9

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
2 people authored and sofisl committed Oct 13, 2022
1 parent e62b990 commit ea03a55
Showing 16 changed files with 7,466 additions and 4,993 deletions.

Large diffs are not rendered by default.

@@ -1,4 +1,4 @@
-// Copyright 2019 Google LLC.
+// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -11,7 +11,6 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-//

syntax = "proto3";

@@ -68,7 +67,7 @@ service LanguageService {
}

// Analyzes the syntax of the text and provides sentence boundaries and
-// tokenization along with part-of-speech tags, dependency trees, and other
+// tokenization along with part of speech tags, dependency trees, and other
// properties.
rpc AnalyzeSyntax(AnalyzeSyntaxRequest) returns (AnalyzeSyntaxResponse) {
option (google.api.http) = {
@@ -100,7 +99,7 @@ service LanguageService {
}
}


// ################################################################ #
//
// Represents the input to API methods.
message Document {
@@ -116,6 +115,19 @@ message Document {
HTML = 2;
}

// Ways of handling boilerplate detected in the document
enum BoilerplateHandling {
// The boilerplate handling is not specified.
BOILERPLATE_HANDLING_UNSPECIFIED = 0;

// Do not analyze detected boilerplate. Reference web URI is required for
// detecting boilerplate.
SKIP_BOILERPLATE = 1;

// Treat boilerplate the same as content.
KEEP_BOILERPLATE = 2;
}

// Required. If the type is not set or is `TYPE_UNSPECIFIED`,
// returns an `INVALID_ARGUMENT` error.
Type type = 1;
@@ -143,6 +155,15 @@ message Document {
// specified by the caller or automatically detected) is not supported by the
// called API method, an `INVALID_ARGUMENT` error is returned.
string language = 4;

// The web URI where the document comes from. This URI is not used for
// fetching the content, but as a hint for analyzing the document.
string reference_web_uri = 5;

// Indicates how detected boilerplate (e.g. advertisements, copyright
// declarations, banners) should be handled for this document. If not
// specified, boilerplate will be treated the same as content.
BoilerplateHandling boilerplate_handling = 6;
}
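
A rough illustration of how the new `reference_web_uri` and `boilerplate_handling` fields shown above might be supplied from the generated Node.js client. The import path, client class, and camelCase field names below are assumptions about the generated library, not something this diff establishes:

import {v1beta2} from '@google-cloud/language';

async function analyzeHtmlPage(html: string, sourceUri: string) {
  const client = new v1beta2.LanguageServiceClient();

  const [result] = await client.analyzeEntities({
    document: {
      type: 'HTML',
      content: html,
      // Hint only: the API does not fetch this URI, but it is required
      // for boilerplate detection.
      referenceWebUri: sourceUri,
      // Skip ads, banners, copyright notices, etc. during analysis.
      boilerplateHandling: 'SKIP_BOILERPLATE',
    },
  });
  return result.entities;
}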

// Represents a sentence in the input document.
@@ -156,6 +177,32 @@ message Sentence {
Sentiment sentiment = 2;
}

// Represents the text encoding that the caller uses to process the output.
// Providing an `EncodingType` is recommended because the API provides the
// beginning offsets for various outputs, such as tokens and mentions, and
// languages that natively use different text encodings may access offsets
// differently.
enum EncodingType {
// If `EncodingType` is not specified, encoding-dependent information (such as
// `begin_offset`) will be set at `-1`.
NONE = 0;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-8 encoding of the input. C++ and Go are examples of languages
// that use this encoding natively.
UTF8 = 1;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-16 encoding of the input. Java and JavaScript are examples of
// languages that use this encoding natively.
UTF16 = 2;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-32 encoding of the input. Python is an example of a language
// that uses this encoding natively.
UTF32 = 3;
}
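
This `EncodingType` enum is being relocated earlier in the file (the old copy is removed further down in this diff); its meaning is unchanged. Since JavaScript strings are indexed by UTF-16 code units, a Node.js caller would typically request `UTF16` so that `begin_offset` values line up with string indices. A minimal sketch, again assuming the generated client surface:

import {v1beta2} from '@google-cloud/language';

async function printEntityOffsets(text: string) {
  const client = new v1beta2.LanguageServiceClient();

  const [result] = await client.analyzeEntities({
    document: {type: 'PLAIN_TEXT', content: text},
    // UTF16 offsets can be used directly with slice()/substring().
    encodingType: 'UTF16',
  });

  for (const entity of result.entities ?? []) {
    for (const mention of entity.mentions ?? []) {
      console.log(entity.name, 'begins at code-unit offset', mention.text?.beginOffset);
    }
  }
}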

// Represents a phrase in the text that is a known entity, such as
// a person, an organization, or location. The API associates information, such
// as salience and mentions, with entities.
@@ -286,32 +333,6 @@ message Token {
string lemma = 4;
}

// Represents the text encoding that the caller uses to process the output.
// Providing an `EncodingType` is recommended because the API provides the
// beginning offsets for various outputs, such as tokens and mentions, and
// languages that natively use different text encodings may access offsets
// differently.
enum EncodingType {
// If `EncodingType` is not specified, encoding-dependent information (such as
// `begin_offset`) will be set at `-1`.
NONE = 0;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-8 encoding of the input. C++ and Go are examples of languages
// that use this encoding natively.
UTF8 = 1;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-16 encoding of the input. Java and JavaScript are examples of
// languages that use this encoding natively.
UTF16 = 2;

// Encoding-dependent information (such as `begin_offset`) is calculated based
// on the UTF-32 encoding of the input. Python is an example of a language
// that uses this encoding natively.
UTF32 = 3;
}

// Represents the feeling associated with the entire text or entities in
// the text.
// Next ID: 6
@@ -968,6 +989,45 @@ message ClassificationCategory {
float confidence = 2;
}

// Model options available for classification requests.
message ClassificationModelOptions {
// Options for the V1 model.
message V1Model {

}

// Options for the V2 model.
message V2Model {
// The content categories used for classification.
enum ContentCategoriesVersion {
// If `ContentCategoriesVersion` is not specified, this option will
// default to `V1`.
CONTENT_CATEGORIES_VERSION_UNSPECIFIED = 0;

// Legacy content categories of our initial launch in 2017.
V1 = 1;

// Updated content categories in 2022.
V2 = 2;
}

// The content categories used for classification.
ContentCategoriesVersion content_categories_version = 1;
}

// If this field is not set, then the `v1_model` will be used by default.
oneof model_type {
// Setting this field will use the V1 model and V1 content categories
// version. The V1 model is a legacy model; support for this will be
// discontinued in the future.
V1Model v1_model = 1;

// Setting this field will use the V2 model with the appropriate content
// categories version. The V2 model is a better performing model.
V2Model v2_model = 2;
}
}
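
The new `ClassificationModelOptions` message is the core of this change: a oneof that selects between the legacy V1 model and the V2 model with a choice of content-category taxonomy. A minimal sketch of a classifyText call that opts into the V2 model (client import and camelCase field names are assumptions about the generated Node.js surface):

import {v1beta2} from '@google-cloud/language';

async function classifyWithV2Model(text: string) {
  const client = new v1beta2.LanguageServiceClient();

  const [response] = await client.classifyText({
    document: {type: 'PLAIN_TEXT', content: text},
    // Omitting classificationModelOptions (or setting {v1Model: {}})
    // keeps the legacy V1 model and its 2017 categories.
    classificationModelOptions: {
      v2Model: {contentCategoriesVersion: 'V2'},
    },
  });

  for (const category of response.categories ?? []) {
    console.log(category.name, category.confidence);
  }
}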

// The sentiment analysis request message.
message AnalyzeSentimentRequest {
// Required. Input document.
@@ -1059,6 +1119,10 @@ message AnalyzeSyntaxResponse {
message ClassifyTextRequest {
// Required. Input document.
Document document = 1 [(google.api.field_behavior) = REQUIRED];

// Model options to use for classification. Defaults to v1 options if not
// specified.
ClassificationModelOptions classification_model_options = 3;
}

// The document classification response message.
@@ -1072,7 +1136,7 @@ message ClassifyTextResponse {
message AnnotateTextRequest {
// All available features for sentiment, syntax, and semantic analysis.
// Setting each one to true will enable that specific analysis for the input.
-// Next ID: 10
+// Next ID: 11
message Features {
// Extract syntax information.
bool extract_syntax = 1;
@@ -1091,6 +1155,10 @@ message AnnotateTextRequest {
// [predefined
// taxonomy](https://cloud.google.com/natural-language/docs/categories).
bool classify_text = 6;

// The model options to use for classification. Defaults to v1 options
// if not specified. Only used if `classify_text` is set to true.
ClassificationModelOptions classification_model_options = 10;
}
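
The same options are threaded through annotateText: `classification_model_options` in `Features` is only consulted when `classify_text` is true. A short sketch under the same assumptions as the earlier examples:

import {v1beta2} from '@google-cloud/language';

async function annotateAndClassify(text: string) {
  const client = new v1beta2.LanguageServiceClient();

  const [response] = await client.annotateText({
    document: {type: 'PLAIN_TEXT', content: text},
    features: {
      extractEntities: true,
      classifyText: true,
      // Only read because classifyText is enabled above.
      classificationModelOptions: {v2Model: {contentCategoriesVersion: 'V2'}},
    },
    encodingType: 'UTF8',
  });

  console.log(response.categories);
}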

// Required. Input document.