Remove tensorflow_text dependency on tf_hub library.
PiperOrigin-RevId: 602963886
tf-text-github-robot committed Jan 31, 2024
1 parent 1bada8d commit e9d1697
Showing 8 changed files with 37 additions and 48 deletions.
14 changes: 0 additions & 14 deletions WORKSPACE
@@ -54,20 +54,6 @@ http_archive(
     ],
 )
 
-# NOTE: according to
-# https://docs.bazel.build/versions/master/external.html#transitive-dependencies
-# we should list the transitive dependencies of @org_tensorflow_hub in this
-# WORKSPACE file. Still, all of them are already listed by tf_workspace() which
-# is called later in this file.
-http_archive(
-    name = "org_tensorflow_hub",
-    strip_prefix = "hub-c83a2362abdad2baae67508aa17efb42bf7c7dd6",
-    sha256 = "b7f5e53605a3b2d6da827b817385d476017300e8e14cccc91b269c86d5f5a99f",
-    urls = [
-        "https://github.com/tensorflow/hub/archive/c83a2362abdad2baae67508aa17efb42bf7c7dd6.zip"
-    ],
-)
-
 http_archive(
     name = "org_tensorflow",
     patch_args = ["-p1"],
13 changes: 7 additions & 6 deletions docs/api_docs/python/text/HubModuleSplitter.md
@@ -41,10 +41,11 @@ as the support for ragged tensors and high-rank tensors.
 The Hub module should be supported by `hub.load()
 <https://www.tensorflow.org/hub/api_docs/python/hub/load>`_ If a v1 module, it
 should have a graph variant with an empty set of tags; we consider that graph
-variant to be the module and ignore everything else. The module should have a
-signature named `default` that takes a <a href="../text.md"><code>text</code></a> input (a rank-1 tensor of
-strings to split into pieces) and returns a dictionary of tensors, let's say
-`output_dict`, such that:
+variant to be the module and ignore everything else. The module should have a
+signature named `default` that takes a
+<a href="../text.md"><code>text</code></a> input (a rank-1 tensor of strings to
+split into pieces) and returns a dictionary of tensors, let's say `output_dict`,
+such that:
 
 * `output_dict['num_pieces']` is a rank-1 tensor of integers, where
   num_pieces[i] is the number of pieces that text[i] was split into.
@@ -68,7 +69,7 @@ class is not using them.
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleSplitter(HUB_MODULE)
+>>> segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
 >>> segmenter.split(["新华社北京"])
 <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                    b'\xe5\x8c\x97\xe4\xba\xac']]>
@@ -78,7 +79,7 @@ You can also use this tokenizer to return the split strings and their offsets:
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleSplitter(HUB_MODULE)
+>>> segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
 >>> pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
 >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
 pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
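Taken together, these doc changes make the download step explicit: the constructor now expects a local module path, and fetching from tfhub.dev is the caller's job. A minimal before/after sketch of the migration, assuming `tensorflow_hub` is installed separately (this commit stops installing it automatically):

```python
import tensorflow_hub as hub   # no longer a tensorflow_text dependency; install it yourself
import tensorflow_text as text

HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"

# Before this commit: the splitter called hub.load() internally, so a remote
# handle could be passed straight through.
# segmenter = text.HubModuleSplitter(HUB_MODULE)

# After this commit: resolve the handle to a local directory first; the
# splitter then loads it with TensorFlow's SavedModel loader.
local_path = hub.resolve(HUB_MODULE)  # downloads and caches the module
segmenter = text.HubModuleSplitter(local_path)
print(segmenter.split(["新华社北京"]))
```

The same handle still works end to end; only the download is now an explicit step in user code.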
4 changes: 2 additions & 2 deletions docs/api_docs/python/text/HubModuleTokenizer.md
@@ -46,7 +46,7 @@ tokenize() instead of the more general and less informatively named split().
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleTokenizer(HUB_MODULE)
+>>> segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
 >>> segmenter.tokenize(["新华社北京"])
 <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                    b'\xe5\x8c\x97\xe4\xba\xac']]>
@@ -56,7 +56,7 @@ You can also use this tokenizer to return the split strings and their offsets:
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleTokenizer(HUB_MODULE)
+>>> segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
 >>> pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
 >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
 pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
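`HubModuleTokenizer` migrates identically; a short sketch of the offset-returning call, under the same assumption that `tensorflow_hub` is available:

```python
import tensorflow_hub as hub  # installed separately
import tensorflow_text as text

HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
segmenter = text.HubModuleTokenizer(hub.resolve(HUB_MODULE))
tokens, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
# starts and ends are byte offsets of each token within the input string.
```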
1 change: 0 additions & 1 deletion oss_scripts/pip_package/setup.nightly.py
@@ -73,7 +73,6 @@ def finalize_options(self):
     cmdclass={'install': InstallPlatlib},
     distclass=BinaryDistribution,
     install_requires=[
-        'tensorflow_hub>=0.15.0',
     ],
     extras_require={
         'tests': [
1 change: 0 additions & 1 deletion oss_scripts/pip_package/setup.py
@@ -81,7 +81,6 @@ def finalize_options(self):
             'tensorflow-macos>=2.15.0, <2.16; platform_machine == "arm64" and'
             ' platform_system == "Darwin"'
         ),
-        'tensorflow_hub>=0.15.0',
     ],
     extras_require={
         'tensorflow_cpu': [
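With the pin gone from both setup files, installing `tensorflow_text` no longer pulls in `tensorflow_hub`. Downstream projects that still fetch models from tfhub.dev must declare the dependency themselves; a hypothetical requirements snippet, reusing the version bound removed above:

```
# requirements.txt (hypothetical downstream project)
tensorflow_text
tensorflow_hub>=0.15.0  # only needed if you call hub.resolve()/hub.load() yourself
```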
6 changes: 3 additions & 3 deletions tensorflow_text/BUILD
@@ -1,9 +1,8 @@
 # Placeholder: load py_library
 # Placeholder: load py_test
+load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object")
 load("//tensorflow_text:tftext.bzl", "py_tf_text_library")
 
 # [internal] load build_test.bzl
-load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object")
-
 # Visibility rules
 package(
@@ -527,9 +526,10 @@ py_library(
     srcs = ["python/ops/hub_module_splitter.py"],
     deps = [
         ":splitter",
-        "//third_party/py/tensorflow_hub",
         # python/eager:monitoring tensorflow dep,
         # python/ops:array_ops_stack tensorflow dep,
         # python/ops/ragged:ragged_tensor tensorflow dep,
+        # python/saved_model:load tensorflow dep,
     ],
 )
 
30 changes: 16 additions & 14 deletions tensorflow_text/python/ops/hub_module_splitter.py
@@ -15,10 +15,10 @@
 
 """Splitter that uses a Hub module."""
 
-import tensorflow_hub as hub
 from tensorflow.python.eager import monitoring
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.saved_model import load
 from tensorflow_text.python.ops.splitter import SplitterWithOffsets
 
 _tf_text_hub_module_splitter_create_counter = monitoring.Counter(
@@ -61,18 +61,20 @@ class is not using them.
   Example:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleSplitter(HUB_MODULE)
-  >>> segmenter.split(["新华社北京"])
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
+  segmenter.split(["新华社北京"])
   <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                      b'\xe5\x8c\x97\xe4\xba\xac']]>
   You can also use this tokenizer to return the split strings and their offsets:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleSplitter(HUB_MODULE)
-  >>> pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
-  >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
+  pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
+  print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
   pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                              b'\xe5\x8c\x97\xe4\xba\xac']]>
   starts: <tf.RaggedTensor [[0, 9]]>
@@ -88,15 +90,15 @@ def __init__(self, hub_module_handle):
     """Initializes a new HubModuleSplitter instance.
 
     Args:
-      hub_module_handle: A string handle accepted by hub.load(). Supported
-        cases include (1) a local path to a directory containing a module, and
-        (2) a handle to a module uploaded to e.g., https://tfhub.dev. The
-        module should implement the signature described in the docstring for
-        this class.
+      hub_module_handle: A string handle accepted by tf.saved_model.load().
+        Supported cases include a local path to a directory containing a module.
+        If a model is stored on https://tfhub.dev, call hub.resolve() to
+        download the model locally. The module should implement the signature
+        described in the docstring for this class.
     """
     super(HubModuleSplitter, self).__init__()
     empty_tags = set()
-    self._hub_module = hub.load(hub_module_handle, tags=empty_tags)
+    self._hub_module = load.load(hub_module_handle, tags=empty_tags)
     self._hub_module_signature = self._hub_module.signatures['default']
     _tf_text_hub_module_splitter_create_counter.get_cell().increase_by(1)
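The substantive change is in `__init__`: the module is now loaded with TensorFlow's own SavedModel loader instead of `hub.load()`. For a module that is already on disk the two agree; roughly, `hub.load(handle, tags)` is `tf.saved_model.load(hub.resolve(handle), tags)`. A sketch of that equivalence, with a hypothetical `module_path` pointing at a local copy of a v1 Hub module:

```python
import tensorflow as tf
import tensorflow_hub as hub  # only needed to resolve remote handles

module_path = "/tmp/zh_segmentation"  # hypothetical local module directory

# Old behavior inside HubModuleSplitter: hub.load() resolves remote handles,
# then defers to the SavedModel loader.
old_module = hub.load(module_path, tags=set())

# New behavior: call the SavedModel loader directly. tags=set() selects the
# graph variant with an empty set of tags from a TF1-style module.
new_module = tf.saved_model.load(module_path, tags=set())

# Either way, the splitter then looks up the module's 'default' signature.
segment_fn = new_module.signatures["default"]
```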
16 changes: 9 additions & 7 deletions tensorflow_text/python/ops/hub_module_tokenizer.py
@@ -33,18 +33,20 @@ class HubModuleTokenizer(TokenizerWithOffsets):
   Example:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleTokenizer(HUB_MODULE)
-  >>> segmenter.tokenize(["新华社北京"])
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
+  segmenter.tokenize(["新华社北京"])
   <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                      b'\xe5\x8c\x97\xe4\xba\xac']]>
   You can also use this tokenizer to return the split strings and their offsets:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleTokenizer(HUB_MODULE)
-  >>> pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
-  >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
+  pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
+  print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
   pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                              b'\xe5\x8c\x97\xe4\xba\xac']]>
   starts: <tf.RaggedTensor [[0, 9]]>
