Merge pull request NVIDIA#644 from swethmandava/master

Bert tf update (triton v2, fixes)
PeganovAnton · Aug 11, 2020 · 80af2da · 80af2da
2 parents 09d5235 + 50df68f
commit 80af2da
Show file tree

Hide file tree

Showing 17 changed files with 619 additions and 149 deletions.
diff --git a/TensorFlow/LanguageModeling/BERT/Dockerfile b/TensorFlow/LanguageModeling/BERT/Dockerfile
@@ -5,24 +5,24 @@ FROM ${FROM_IMAGE_NAME}
 RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl libb64-dev
 RUN pip install --upgrade pip
 RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
-RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger
+RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger wget
 
 WORKDIR /workspace
 RUN git clone https://github.com/openai/gradient-checkpointing.git
-RUN git clone https://github.com/attardi/wikiextractor.git
+RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
 RUN git clone https://github.com/soskek/bookcorpus.git
 RUN git clone https://github.com/titipata/pubmed_parser
 
 
 RUN pip3 install /workspace/pubmed_parser
 
 #Copy the perf_client over
-ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.14.0/v1.14.0_ubuntu1804.clients.tar.gz
+ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.0.0/v2.0.0_ubuntu1804.clients.tar.gz
 RUN mkdir -p /workspace/install \
     && curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install
 
 #Install the python wheel with pip
-RUN pip install /workspace/install/python/tensorrtserver-1.14.0-py3-none-linux_x86_64.whl
+RUN pip install /workspace/install/python/triton*.whl
 
 WORKDIR /workspace/bert
 COPY . .

diff --git a/TensorFlow/LanguageModeling/BERT/README.md b/TensorFlow/LanguageModeling/BERT/README.md
@@ -729,9 +729,9 @@ Note: Time to train includes upto 16 minutes of start up time for every restart
 
 Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.
 
-| **GPUs** | **Batch size / GPU** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
+| **GPUs** | **Batch size / GPU: TF32, FP16** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
 |:---:|:----:|:----:|:---:|:----:|:----:|
-| 8 | 24 |91.41 |91.52 |0.26|0.26|
+| 8 | 16, 24 |91.41 |91.52 |0.26|0.26|
 
 ###### Fine-tuning accuracy for GLUE MRPC: NVIDIA DGX A100 (8x A100 40G)
 

diff --git a/TensorFlow/LanguageModeling/BERT/data/ChemProtTextFormatting.py b/TensorFlow/LanguageModeling/BERT/data/ChemProtTextFormatting.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import csv
+import zipfile
+import argparse
+import re
+
+class ChemProtTextFormatting:
+    """A basic formatter to preprocess the chemprot dataset.
+    """
+
+    def __init__(self, input_folder, output_folder):
+
+        chemprot_folder = input_folder
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "ChemProt_Corpus.zip"), "r") as zip:
+            zip.extractall(chemprot_folder)
+
+        chemprot_folder = os.path.join(input_folder, "ChemProt_Corpus")
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_development.zip")) as zip:
+            zip.extractall(chemprot_folder)
+
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+
+        self.format(os.path.join(chemprot_folder, "chemprot_development"),
+                    "chemprot_development_entities.tsv", "chemprot_development_relations.tsv",
+                    "chemprot_development_abstracts.tsv", os.path.join(output_folder, "dev.tsv"))
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_test_gs.zip")) as zip:
+            zip.extractall(chemprot_folder)
+        self.format(os.path.join(chemprot_folder, "chemprot_test_gs"),
+                    "chemprot_test_entities_gs.tsv", "chemprot_test_relations_gs.tsv",
+                    "chemprot_test_abstracts_gs.tsv", os.path.join(output_folder, "test.tsv"))
+
+        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_training.zip")) as zip:
+            zip.extractall(chemprot_folder)
+        self.format(os.path.join(chemprot_folder, "chemprot_training"),
+                    "chemprot_training_entities.tsv", "chemprot_training_relations.tsv",
+                    "chemprot_training_abstracts.tsv", os.path.join(output_folder, "train.tsv"))
+
+
+
+    def format(self, chemprot_path, entity_filename, relations_filename, abstracts_filename, output_filename):
+        """
+        Constructs ChemProt dataset for Relation Extraction.
+
+        Args:
+          chemprot_path: Path to files
+          entity_filename: Contains labelled mention annotations of chemical compounds and genes/proteins.
+                            <PMID> <EntityNumber> <Type of Entity> <Start Character offset> <End Character Offset> <Text String>
+          relations_filename: Contains a subset of chemical-protein relations annotations for the Chemprot dataset
+                            <PMID> <CPR Group> <EntityNumber1> <EntityNumber2>
+          abstracts_filename: Contains plain text CHEMPROT PubMed Data
+                            <PMID> <Title of the Article> <Abstract of the Article>
+          output_filename: Path to output file that will contain preprocessed data
+                            <PMID.EntityNumber1.EntityNumber2> <Preprocessed Sentence> <CPR Group>
+        """
+
+        data = {}
+        train_entities = csv.reader(open(os.path.join(chemprot_path, entity_filename),
+                                         mode="r"), delimiter="\t")
+        for entity in train_entities:
+            id = entity[0]
+            if data.get(id, None) is None:
+                data[id] = {"relations":{}, "entities":{"CHEMICAL":{}, "GENE":{}}}
+            data[id]["entities"]["CHEMICAL" if entity[2] == "CHEMICAL" else "GENE"][entity[1]] = (int(entity[3]), int(entity[4]), entity[2])
+
+        train_relations=csv.reader(open(os.path.join(chemprot_path, relations_filename),
+                                   mode="r"), delimiter="\t")
+        for relation in train_relations:
+            try:
+                id = relation[0]
+                data[id]["relations"][(relation[4].split("Arg1:")[-1], relation[5].split("Arg2:")[-1])] = relation[1] if relation[2] == "Y " else "false"
+            except:
+                print("invalid id")
+                raise ValueError
+        # print(data[list(data.keys())[0]])
+
+        with open(output_filename, 'w') as ofile:
+            train_abstracts = csv.reader(open(os.path.join(chemprot_path, abstracts_filename),
+                                              mode="r"), delimiter="\t")
+            owriter = csv.writer(ofile, delimiter='\t', lineterminator=os.linesep)
+            owriter.writerow(["index", "sentence", "label"])
+
+            num_sentences = 0
+            rejected = 0
+            for abstract in train_abstracts:
+                id = abstract[0]
+                line = abstract[1] + "\n" + abstract[2]
+
+                for tag1 in data[id]["entities"]["CHEMICAL"].keys():
+                    for tag2 in data[id]["entities"]["GENE"].keys():
+                        tag1_details = data[id]["entities"]["CHEMICAL"][tag1]
+                        tag2_details = data[id]["entities"]["GENE"][tag2]
+                        if ((tag1_details[0] <= tag2_details[0] and tag2_details[0] <= tag1_details[1]) # x1 <= y1 <= x2
+                            or (tag1_details[0] <= tag2_details[1] and tag2_details[0] <= tag1_details[1])): # x1 <= y2 and y1 <= x2 (general interval overlap)
+                            continue
+
+                        relation = data[id]["relations"].get((tag2, tag1), None)
+                        relation = data[id]["relations"].get((tag1, tag2), None) if relation is None else relation
+                        if relation is None:
+                            relation = "false"
+
+                        start = 0
+                        line_protected = re.sub(r"(.)\.(?=[\d])", r"\1[PROTECTED_DOT]", line)
+                        for sentence in re.split(r'\.|\?', line_protected):
+                            sentence = sentence.replace("[PROTECTED_DOT]", ".")
+                            original_sentence = sentence
+                            end = start + len(sentence)
+
+                            if (tag1_details[0] >= start and tag1_details[1] <= end) and \
+                                    (tag2_details[0] >= start and tag2_details[1] <= end):
+                                for offset_start, offset_end, value in sorted(list(data[id]["entities"]["CHEMICAL"].values()) + list(data[id]["entities"]["GENE"].values()),
+                                                         reverse=True):
+                                    if (offset_start, offset_end) == (tag1_details[0], tag1_details[1]) or (offset_start, offset_end) == (tag2_details[0], tag2_details[1]):
+                                        if sentence[offset_start - start] == "@":
+                                            offset_end = start + sentence.find('$',offset_start - start) + 1
+                                        word = value
+                                    elif offset_start < start or offset_end > end or sentence[offset_start - start] == "@":
+                                        continue
+                                    else:
+                                        word = "OTHER"
+                                    sentence = sentence[:offset_start-start] + "@" + word + "$" + sentence[offset_end-start:]
+                                sentence = sentence.strip()
+                                owriter.writerow([id+"."+tag1+"."+tag2, sentence, relation])
+                                num_sentences += 1
+                                if id == "23538201" and start == 1048:
+                                    print("Accepted", tag1, tag2)
+
+                            else:
+                                rejected += 1
+
+                            start = end + 1
+            print("Succesfully written {} samples to {}".format(num_sentences, output_filename))
+            print("Rejected are", rejected)
+
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(
+        description='Preprocessing Application for ChemProt'
+    )
+
+    parser.add_argument(
+        '--input_folder',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+    parser.add_argument(
+        '--output_folder',
+        type=str,
+        help='Specify the input files in a comma-separated list (no spaces)'
+    )
+
+
+    args = parser.parse_args()
+    preprocess_chemprot = ChemProtTextFormatting(args.input_folder, args.output_folder)
diff --git a/TensorFlow/LanguageModeling/BERT/data/Downloader.py b/TensorFlow/LanguageModeling/BERT/data/Downloader.py
@@ -53,13 +53,15 @@ def download(self):
         elif self.dataset_name == 'nvidia_pretrained_weights':
             self.download_nvidia_pretrained_weights()
 
-        elif self.dataset_name == 'MRPC':
+        elif self.dataset_name == 'mrpc':
             self.download_glue(self.dataset_name)
 
-        elif self.dataset_name == 'MNLI':
+        elif self.dataset_name == 'mnli':
             self.download_glue(self.dataset_name)
 
-        elif self.dataset_name == 'CoLA':
+        elif self.dataset_name == 'cola':
+            self.download_glue(self.dataset_name)
+        elif self.dataset_name == 'sst-2':
             self.download_glue(self.dataset_name)
 
         elif self.dataset_name == 'squad':
@@ -75,9 +77,10 @@ def download(self):
             self.download_pubmed('open_access')
             self.download_google_pretrained_weights()
             self.download_nvidia_pretrained_weights()
-            self.download_glue("CoLA")
-            self.download_glue("MNLI")
-            self.download_glue("MRPC")
+            self.download_glue("cola")
+            self.download_glue("mnli")
+            self.download_glue("mrpc")
+            self.download_glue("sst-2")
             self.download_squad()
 
         else:
@@ -111,8 +114,8 @@ def download_nvidia_pretrained_weights(self):
 
 
     def download_glue(self, glue_task_name):
-        downloader = GLUEDownloader(glue_task_name, self.save_path)
-        downloader.download()
+        downloader = GLUEDownloader(self.save_path)
+        downloader.download(glue_task_name)
 
 
     def download_squad(self):

diff --git a/TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py b/TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py
@@ -11,99 +11,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import bz2
-import os
-import urllib
 import sys
-import zipfile
-import io
+import wget
 
-URLLIB=urllib
-if sys.version_info >= (3, 0):
-    URLLIB=urllib.request
+from pathlib import Path
 
-class GLUEDownloader:
-    def __init__(self, task, save_path):
-
-        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
-
-        self.TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
-                     "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
-                     "MRPC":{"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
-                            "mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
-                            "mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
-                     "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
-                     "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
-                     "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
-                     "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
-                     "QNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
-                     "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
-                     "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
-                     "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
-
-
-        self.save_path = save_path
-        if not os.path.exists(self.save_path):
-            os.makedirs(self.save_path)
-
-        self.task = task
 
-    def download(self):
+def mkdir(path):
+    Path(path).mkdir(parents=True, exist_ok=True)
 
-        if self.task == 'MRPC':
-            self.download_mrpc()
-        elif self.task == 'diagnostic':
-            self.download_diagnostic()
-        else:
-            self.download_and_extract(self.task)
 
-    def download_and_extract(self, task):
-        print("Downloading and extracting %s..." % task)
-        data_file = "%s.zip" % task
-        URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
-        print(data_file,"\n\n\n")
-        with zipfile.ZipFile(data_file) as zip_ref:
-            zip_ref.extractall(self.save_path)
-        os.remove(data_file)
-        print("\tCompleted!")
-
-    def download_mrpc(self):
-        print("Processing MRPC...")
-        mrpc_dir = os.path.join(self.save_path, "MRPC")
-        if not os.path.isdir(mrpc_dir):
-            os.mkdir(mrpc_dir)
-
-        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
-        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
-        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)
-
-        dev_ids = []
-        with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
-            for row in ids_fh:
-                dev_ids.append(row.strip().split('\t'))
-
-        with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
-                io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
-                io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
-            header = data_fh.readline()
-            train_fh.write(header)
-            dev_fh.write(header)
-            for row in data_fh:
-                label, id1, id2, s1, s2 = row.strip().split('\t')
-                if [id1, id2] in dev_ids:
-                    dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-                else:
-                    train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+class GLUEDownloader:
 
-        with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
-                io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
-            header = data_fh.readline()
-            test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
-            for idx, row in enumerate(data_fh):
-                label, id1, id2, s1, s2 = row.strip().split('\t')
-                test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
-        print("\tCompleted!")
+    def __init__(self, save_path):
+        self.save_path = save_path + '/glue'
+
+    def download(self, task_name):
+        mkdir(self.save_path)
+        if task_name in {'mrpc', 'mnli'}:
+            task_name = task_name.upper()
+        elif task_name == 'cola':
+            task_name = 'CoLA'
+        else:  # SST-2
+            assert task_name == 'sst-2'
+            task_name = 'SST'
+        wget.download(
+            'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py',
+            out=self.save_path,
+        )
+        sys.path.append(self.save_path)
+        import download_glue_data
+        download_glue_data.main(
+            ['--data_dir', self.save_path, '--tasks', task_name])
+        sys.path.pop()