Remove tensorflow_text dependency on tf_hub library.
PiperOrigin-RevId: 602963886
tf-text-github-robot committed Jan 31, 2024
1 parent 1bada8d commit e9d1697
Showing 8 changed files with 37 additions and 48 deletions.
14 changes: 0 additions & 14 deletions WORKSPACE
@@ -54,20 +54,6 @@ http_archive(
     ],
 )
 
-# NOTE: according to
-# https://docs.bazel.build/versions/master/external.html#transitive-dependencies
-# we should list the transitive dependencies of @org_tensorflow_hub in this
-# WORKSPACE file. Still, all of them are already listed by tf_workspace() which
-# is called later in this file.
-http_archive(
-    name = "org_tensorflow_hub",
-    strip_prefix = "hub-c83a2362abdad2baae67508aa17efb42bf7c7dd6",
-    sha256 = "b7f5e53605a3b2d6da827b817385d476017300e8e14cccc91b269c86d5f5a99f",
-    urls = [
-        "https://github.com/tensorflow/hub/archive/c83a2362abdad2baae67508aa17efb42bf7c7dd6.zip"
-    ],
-)
-
 http_archive(
     name = "org_tensorflow",
     patch_args = ["-p1"],
13 changes: 7 additions & 6 deletions docs/api_docs/python/text/HubModuleSplitter.md
@@ -41,10 +41,11 @@ as the support for ragged tensors and high-rank tensors.
 The Hub module should be supported by `hub.load()
 <https://www.tensorflow.org/hub/api_docs/python/hub/load>`_ If a v1 module, it
 should have a graph variant with an empty set of tags; we consider that graph
-variant to be the module and ignore everything else. The module should have a
-signature named `default` that takes a <a href="../text.md"><code>text</code></a> input (a rank-1 tensor of
-strings to split into pieces) and returns a dictionary of tensors, let's say
-`output_dict`, such that:
+variant to be the module and ignore everything else. The module should have a
+signature named `default` that takes a
+<a href="../text.md"><code>text</code></a> input (a rank-1 tensor of strings to
+split into pieces) and returns a dictionary of tensors, let's say `output_dict`,
+such that:
 
 * `output_dict['num_pieces']` is a rank-1 tensor of integers, where
   num_pieces[i] is the number of pieces that text[i] was split into.
@@ -68,7 +69,7 @@ class is not using them.
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleSplitter(HUB_MODULE)
+>>> segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
 >>> segmenter.split(["新华社北京"])
 <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                    b'\xe5\x8c\x97\xe4\xba\xac']]>
@@ -78,7 +79,7 @@ You can also use this tokenizer to return the split strings and their offsets:
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleSplitter(HUB_MODULE)
+>>> segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
 >>> pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
 >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
 pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
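Taken together, these doc changes make the download step explicit: the constructor now expects a local module path, and fetching from tfhub.dev is the caller's job. A minimal before/after sketch of the migration, assuming `tensorflow_hub` is installed separately (this commit stops installing it automatically):

```python
import tensorflow_hub as hub   # no longer a tensorflow_text dependency; install it yourself
import tensorflow_text as text

HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"

# Before this commit: the splitter called hub.load() internally, so a remote
# handle could be passed straight through.
# segmenter = text.HubModuleSplitter(HUB_MODULE)

# After this commit: resolve the handle to a local directory first; the
# splitter then loads it with TensorFlow's SavedModel loader.
local_path = hub.resolve(HUB_MODULE)  # downloads and caches the module
segmenter = text.HubModuleSplitter(local_path)
print(segmenter.split(["新华社北京"]))
```

The same handle still works end to end; only the download is now an explicit step in user code.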
4 changes: 2 additions & 2 deletions docs/api_docs/python/text/HubModuleTokenizer.md
@@ -46,7 +46,7 @@ tokenize() instead of the more general and less informatively named split().
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleTokenizer(HUB_MODULE)
+>>> segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
 >>> segmenter.tokenize(["新华社北京"])
 <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                    b'\xe5\x8c\x97\xe4\xba\xac']]>
@@ -56,7 +56,7 @@ You can also use this tokenizer to return the split strings and their offsets:
 
 ```
 >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
->>> segmenter = HubModuleTokenizer(HUB_MODULE)
+>>> segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
 >>> pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
 >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
 pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
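`HubModuleTokenizer` migrates identically; a short sketch of the offset-returning call, under the same assumption that `tensorflow_hub` is available:

```python
import tensorflow_hub as hub  # installed separately
import tensorflow_text as text

HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
segmenter = text.HubModuleTokenizer(hub.resolve(HUB_MODULE))
tokens, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
# starts and ends are byte offsets of each token within the input string.
```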
1 change: 0 additions & 1 deletion oss_scripts/pip_package/setup.nightly.py
@@ -73,7 +73,6 @@ def finalize_options(self):
     cmdclass={'install': InstallPlatlib},
     distclass=BinaryDistribution,
     install_requires=[
-        'tensorflow_hub>=0.15.0',
     ],
     extras_require={
         'tests': [
1 change: 0 additions & 1 deletion oss_scripts/pip_package/setup.py
@@ -81,7 +81,6 @@ def finalize_options(self):
             'tensorflow-macos>=2.15.0, <2.16; platform_machine == "arm64" and'
             ' platform_system == "Darwin"'
         ),
-        'tensorflow_hub>=0.15.0',
     ],
     extras_require={
         'tensorflow_cpu': [
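With the pin gone from both setup files, installing `tensorflow_text` no longer pulls in `tensorflow_hub`. Downstream projects that still fetch models from tfhub.dev must declare the dependency themselves; a hypothetical requirements snippet, reusing the version bound removed above:

```
# requirements.txt (hypothetical downstream project)
tensorflow_text
tensorflow_hub>=0.15.0  # only needed if you call hub.resolve()/hub.load() yourself
```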
6 changes: 3 additions & 3 deletions tensorflow_text/BUILD
@@ -1,9 +1,8 @@
 # Placeholder: load py_library
 # Placeholder: load py_test
+load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object")
 load("//tensorflow_text:tftext.bzl", "py_tf_text_library")
 
 # [internal] load build_test.bzl
-load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object")
-
 # Visibility rules
 package(
@@ -527,9 +526,10 @@ py_library(
     srcs = ["python/ops/hub_module_splitter.py"],
     deps = [
         ":splitter",
-        "//third_party/py/tensorflow_hub",
         # python/eager:monitoring tensorflow dep,
         # python/ops:array_ops_stack tensorflow dep,
         # python/ops/ragged:ragged_tensor tensorflow dep,
+        # python/saved_model:load tensorflow dep,
     ],
 )
 
30 changes: 16 additions & 14 deletions tensorflow_text/python/ops/hub_module_splitter.py
@@ -15,10 +15,10 @@
 
 """Splitter that uses a Hub module."""
 
-import tensorflow_hub as hub
 from tensorflow.python.eager import monitoring
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.saved_model import load
 from tensorflow_text.python.ops.splitter import SplitterWithOffsets
 
 _tf_text_hub_module_splitter_create_counter = monitoring.Counter(
@@ -61,18 +61,20 @@ class is not using them.
   Example:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleSplitter(HUB_MODULE)
-  >>> segmenter.split(["新华社北京"])
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
+  segmenter.split(["新华社北京"])
   <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                      b'\xe5\x8c\x97\xe4\xba\xac']]>
   You can also use this tokenizer to return the split strings and their offsets:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleSplitter(HUB_MODULE)
-  >>> pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
-  >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleSplitter(hub.resolve(HUB_MODULE))
+  pieces, starts, ends = segmenter.split_with_offsets(["新华社北京"])
+  print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
   pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                              b'\xe5\x8c\x97\xe4\xba\xac']]>
   starts: <tf.RaggedTensor [[0, 9]]>
@@ -88,15 +90,15 @@ def __init__(self, hub_module_handle):
     """Initializes a new HubModuleSplitter instance.
 
     Args:
-      hub_module_handle: A string handle accepted by hub.load(). Supported
-        cases include (1) a local path to a directory containing a module, and
-        (2) a handle to a module uploaded to e.g., https://tfhub.dev. The
-        module should implement the signature described in the docstring for
-        this class.
+      hub_module_handle: A string handle accepted by tf.saved_model.load().
+        Supported cases include a local path to a directory containing a module.
+        If a model is stored on https://tfhub.dev, call hub.resolve() to
+        download the model locally. The module should implement the signature
+        described in the docstring for this class.
     """
     super(HubModuleSplitter, self).__init__()
     empty_tags = set()
-    self._hub_module = hub.load(hub_module_handle, tags=empty_tags)
+    self._hub_module = load.load(hub_module_handle, tags=empty_tags)
     self._hub_module_signature = self._hub_module.signatures['default']
     _tf_text_hub_module_splitter_create_counter.get_cell().increase_by(1)
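The substantive change is in `__init__`: the module is now loaded with TensorFlow's own SavedModel loader instead of `hub.load()`. For a module that is already on disk the two agree; roughly, `hub.load(handle, tags)` is `tf.saved_model.load(hub.resolve(handle), tags)`. A sketch of that equivalence, with a hypothetical `module_path` pointing at a local copy of a v1 Hub module:

```python
import tensorflow as tf
import tensorflow_hub as hub  # only needed to resolve remote handles

module_path = "/tmp/zh_segmentation"  # hypothetical local module directory

# Old behavior inside HubModuleSplitter: hub.load() resolves remote handles,
# then defers to the SavedModel loader.
old_module = hub.load(module_path, tags=set())

# New behavior: call the SavedModel loader directly. tags=set() selects the
# graph variant with an empty set of tags from a TF1-style module.
new_module = tf.saved_model.load(module_path, tags=set())

# Either way, the splitter then looks up the module's 'default' signature.
segment_fn = new_module.signatures["default"]
```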
16 changes: 9 additions & 7 deletions tensorflow_text/python/ops/hub_module_tokenizer.py
@@ -33,18 +33,20 @@ class HubModuleTokenizer(TokenizerWithOffsets):
   Example:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleTokenizer(HUB_MODULE)
-  >>> segmenter.tokenize(["新华社北京"])
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
+  segmenter.tokenize(["新华社北京"])
   <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                      b'\xe5\x8c\x97\xe4\xba\xac']]>
   You can also use this tokenizer to return the split strings and their offsets:
-  >>> HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
-  >>> segmenter = HubModuleTokenizer(HUB_MODULE)
-  >>> pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
-  >>> print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
+  import tensorflow_hub as hub
+  HUB_MODULE = "https://tfhub.dev/google/zh_segmentation/1"
+  segmenter = HubModuleTokenizer(hub.resolve(HUB_MODULE))
+  pieces, starts, ends = segmenter.tokenize_with_offsets(["新华社北京"])
+  print("pieces: %s starts: %s ends: %s" % (pieces, starts, ends))
   pieces: <tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe',
                              b'\xe5\x8c\x97\xe4\xba\xac']]>
   starts: <tf.RaggedTensor [[0, 9]]>
