Commit: Merge pull request NVIDIA#644 from swethmandava/master
Bert tf update (triton v2, fixes)
swethmandava authored Aug 11, 2020
2 parents 09d5235 + 50df68f commit 80af2da
Showing 17 changed files with 619 additions and 149 deletions.
8 changes: 4 additions & 4 deletions TensorFlow/LanguageModeling/BERT/Dockerfile
@@ -5,24 +5,24 @@ FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl libb64-dev
RUN pip install --upgrade pip
RUN pip install toposort networkx pytest nltk tqdm html2text progressbar
-RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger
+RUN pip --no-cache-dir --no-cache install git+https://github.com/NVIDIA/dllogger wget

WORKDIR /workspace
RUN git clone https://github.com/openai/gradient-checkpointing.git
-RUN git clone https://github.com/attardi/wikiextractor.git
+RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd ..
RUN git clone https://github.com/soskek/bookcorpus.git
RUN git clone https://github.com/titipata/pubmed_parser


RUN pip3 install /workspace/pubmed_parser

#Copy the perf_client over
-ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v1.14.0/v1.14.0_ubuntu1804.clients.tar.gz
+ARG TRTIS_CLIENTS_URL=https://github.com/NVIDIA/triton-inference-server/releases/download/v2.0.0/v2.0.0_ubuntu1804.clients.tar.gz
RUN mkdir -p /workspace/install \
&& curl -L ${TRTIS_CLIENTS_URL} | tar xvz -C /workspace/install

#Install the python wheel with pip
-RUN pip install /workspace/install/python/tensorrtserver-1.14.0-py3-none-linux_x86_64.whl
+RUN pip install /workspace/install/python/triton*.whl

WORKDIR /workspace/bert
COPY . .
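Note: the v2 clients tarball ships `tritonclient` wheels in place of the old `tensorrtserver` package, which is why the install line above switches to a glob. A minimal sketch of a v2 Python HTTP inference call; the server address, model name `bert`, and tensor names/shapes below are illustrative assumptions, not values taken from this repo:

import numpy as np
import tritonclient.http as httpclient  # v2 package; replaces the v1 tensorrtserver module

# Assumes a Triton server is already running locally with a model named "bert".
client = httpclient.InferenceServerClient(url="localhost:8000")

# Dummy batch; a real request would carry tokenized SQuAD features.
ids = httpclient.InferInput("input_ids", [1, 384], "INT32")
ids.set_data_from_numpy(np.zeros((1, 384), dtype=np.int32))

result = client.infer(model_name="bert", inputs=[ids])
print(result.as_numpy("logits").shape)  # output tensor name is an assumption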
4 changes: 2 additions & 2 deletions TensorFlow/LanguageModeling/BERT/README.md
@@ -729,9 +729,9 @@ Note: Time to train includes up to 16 minutes of start-up time for every restart

Our results were obtained by running the `scripts/run_squad.sh` training script in the TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 with 8x A100 40GB GPUs.

-| **GPUs** | **Batch size / GPU** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
+| **GPUs** | **Batch size / GPU: TF32, FP16** | **Accuracy - TF32** | **Accuracy - mixed precision** | **Time to Train - TF32 (Hrs)** | **Time to Train - mixed precision (Hrs)** |
|:---:|:----:|:----:|:---:|:----:|:----:|
-| 8 | 24 |91.41 |91.52 |0.26|0.26|
+| 8 | 16, 24 |91.41 |91.52 |0.26|0.26|

###### Fine-tuning accuracy for GLUE MRPC: NVIDIA DGX A100 (8x A100 40G)

169 changes: 169 additions & 0 deletions TensorFlow/LanguageModeling/BERT/data/ChemProtTextFormatting.py
@@ -0,0 +1,169 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import csv
import zipfile
import argparse
import re


class ChemProtTextFormatting:
    """A basic formatter to preprocess the ChemProt dataset."""

    def __init__(self, input_folder, output_folder):
        chemprot_folder = input_folder
        with zipfile.ZipFile(os.path.join(chemprot_folder, "ChemProt_Corpus.zip"), "r") as zip:
            zip.extractall(chemprot_folder)

        chemprot_folder = os.path.join(input_folder, "ChemProt_Corpus")

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_development.zip")) as zip:
            zip.extractall(chemprot_folder)

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        self.format(os.path.join(chemprot_folder, "chemprot_development"),
                    "chemprot_development_entities.tsv", "chemprot_development_relations.tsv",
                    "chemprot_development_abstracts.tsv", os.path.join(output_folder, "dev.tsv"))

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_test_gs.zip")) as zip:
            zip.extractall(chemprot_folder)
        self.format(os.path.join(chemprot_folder, "chemprot_test_gs"),
                    "chemprot_test_entities_gs.tsv", "chemprot_test_relations_gs.tsv",
                    "chemprot_test_abstracts_gs.tsv", os.path.join(output_folder, "test.tsv"))

        with zipfile.ZipFile(os.path.join(chemprot_folder, "chemprot_training.zip")) as zip:
            zip.extractall(chemprot_folder)
        self.format(os.path.join(chemprot_folder, "chemprot_training"),
                    "chemprot_training_entities.tsv", "chemprot_training_relations.tsv",
                    "chemprot_training_abstracts.tsv", os.path.join(output_folder, "train.tsv"))



    def format(self, chemprot_path, entity_filename, relations_filename, abstracts_filename, output_filename):
        """
        Constructs the ChemProt dataset for Relation Extraction.

        Args:
            chemprot_path: Path to the folder containing the raw ChemProt files
            entity_filename: Contains labelled mention annotations of chemical compounds and genes/proteins.
                <PMID> <EntityNumber> <Type of Entity> <Start Character Offset> <End Character Offset> <Text String>
            relations_filename: Contains a subset of chemical-protein relation annotations for the ChemProt dataset.
                <PMID> <CPR Group> <Evaluated (Y/N)> <CPR> <Arg1:EntityNumber1> <Arg2:EntityNumber2>
            abstracts_filename: Contains the plain-text ChemProt PubMed data.
                <PMID> <Title of the Article> <Abstract of the Article>
            output_filename: Path to the output file that will contain the preprocessed data.
                <PMID.EntityNumber1.EntityNumber2> <Preprocessed Sentence> <CPR Group>
        """

        data = {}
        train_entities = csv.reader(open(os.path.join(chemprot_path, entity_filename), mode="r"),
                                    delimiter="\t")
        for entity in train_entities:
            id = entity[0]
            if data.get(id, None) is None:
                data[id] = {"relations": {}, "entities": {"CHEMICAL": {}, "GENE": {}}}
            data[id]["entities"]["CHEMICAL" if entity[2] == "CHEMICAL" else "GENE"][entity[1]] = (int(entity[3]), int(entity[4]), entity[2])

        train_relations = csv.reader(open(os.path.join(chemprot_path, relations_filename), mode="r"),
                                     delimiter="\t")
        for relation in train_relations:
            try:
                id = relation[0]
                data[id]["relations"][(relation[4].split("Arg1:")[-1], relation[5].split("Arg2:")[-1])] = relation[1] if relation[2] == "Y " else "false"
            except KeyError:
                raise ValueError("Relation file references PMID {} that has no entity annotations".format(relation[0]))

        with open(output_filename, 'w') as ofile:
            train_abstracts = csv.reader(open(os.path.join(chemprot_path, abstracts_filename), mode="r"),
                                         delimiter="\t")
            owriter = csv.writer(ofile, delimiter='\t', lineterminator=os.linesep)
            owriter.writerow(["index", "sentence", "label"])

            num_sentences = 0
            rejected = 0
            for abstract in train_abstracts:
                id = abstract[0]
                line = abstract[1] + "\n" + abstract[2]

                for tag1 in data[id]["entities"]["CHEMICAL"].keys():
                    for tag2 in data[id]["entities"]["GENE"].keys():
                        tag1_details = data[id]["entities"]["CHEMICAL"][tag1]
                        tag2_details = data[id]["entities"]["GENE"][tag2]
                        # Skip pairs whose mention spans overlap.
                        if ((tag1_details[0] <= tag2_details[0] and tag2_details[0] <= tag1_details[1])  # x1 <= y1 <= x2
                                or (tag1_details[0] <= tag2_details[1] and tag2_details[0] <= tag1_details[1])):  # x1 <= y2 <= x2
                            continue

                        relation = data[id]["relations"].get((tag2, tag1), None)
                        relation = data[id]["relations"].get((tag1, tag2), None) if relation is None else relation
                        if relation is None:
                            relation = "false"

                        start = 0
                        # Shield decimal points (e.g. "1.5") so they do not split sentences.
                        line_protected = re.sub(r"(.)\.(?=[\d])", r"\1[PROTECTED_DOT]", line)
                        for sentence in re.split(r'\.|\?', line_protected):
                            sentence = sentence.replace("[PROTECTED_DOT]", ".")
                            original_sentence = sentence
                            end = start + len(sentence)

                            # Keep only sentences that contain both target mentions.
                            if (tag1_details[0] >= start and tag1_details[1] <= end) and \
                                    (tag2_details[0] >= start and tag2_details[1] <= end):
                                # Replace the two target mentions with @<entity type>$ and every
                                # other mention with @OTHER$, working right to left so that
                                # earlier character offsets remain valid.
                                for offset_start, offset_end, value in sorted(list(data[id]["entities"]["CHEMICAL"].values()) + list(data[id]["entities"]["GENE"].values()),
                                                                              reverse=True):
                                    if (offset_start, offset_end) == (tag1_details[0], tag1_details[1]) or (offset_start, offset_end) == (tag2_details[0], tag2_details[1]):
                                        if sentence[offset_start - start] == "@":
                                            offset_end = start + sentence.find('$', offset_start - start) + 1
                                        word = value
                                    elif offset_start < start or offset_end > end or sentence[offset_start - start] == "@":
                                        continue
                                    else:
                                        word = "OTHER"
                                    sentence = sentence[:offset_start - start] + "@" + word + "$" + sentence[offset_end - start:]
                                sentence = sentence.strip()
                                owriter.writerow([id + "." + tag1 + "." + tag2, sentence, relation])
                                num_sentences += 1
                            else:
                                rejected += 1

                            start = end + 1
        print("Successfully written {} samples to {}".format(num_sentences, output_filename))
        print("Rejected {} samples".format(rejected))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Preprocessing Application for ChemProt'
    )

    parser.add_argument(
        '--input_folder',
        type=str,
        help='Folder containing the downloaded ChemProt_Corpus.zip'
    )
    parser.add_argument(
        '--output_folder',
        type=str,
        help='Folder that will receive the preprocessed train/dev/test .tsv files'
    )

    args = parser.parse_args()
    preprocess_chemprot = ChemProtTextFormatting(args.input_folder, args.output_folder)
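A hypothetical invocation of the formatter above, assuming ChemProt_Corpus.zip has already been downloaded into the input folder (both paths here are illustrative):

from ChemProtTextFormatting import ChemProtTextFormatting

# Writes train.tsv, dev.tsv and test.tsv into the output folder.
ChemProtTextFormatting("/workspace/data/chemprot",
                       "/workspace/data/chemprot/processed")

This is equivalent to running the script with --input_folder and --output_folder on the command line.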
19 changes: 11 additions & 8 deletions TensorFlow/LanguageModeling/BERT/data/Downloader.py
@@ -53,13 +53,15 @@ def download(self):
        elif self.dataset_name == 'nvidia_pretrained_weights':
            self.download_nvidia_pretrained_weights()

-        elif self.dataset_name == 'MRPC':
+        elif self.dataset_name == 'mrpc':
            self.download_glue(self.dataset_name)

-        elif self.dataset_name == 'MNLI':
+        elif self.dataset_name == 'mnli':
            self.download_glue(self.dataset_name)

-        elif self.dataset_name == 'CoLA':
+        elif self.dataset_name == 'cola':
            self.download_glue(self.dataset_name)
+        elif self.dataset_name == 'sst-2':
+            self.download_glue(self.dataset_name)

        elif self.dataset_name == 'squad':
@@ -75,9 +77,10 @@ def download(self):
            self.download_pubmed('open_access')
            self.download_google_pretrained_weights()
            self.download_nvidia_pretrained_weights()
-            self.download_glue("CoLA")
-            self.download_glue("MNLI")
-            self.download_glue("MRPC")
+            self.download_glue("cola")
+            self.download_glue("mnli")
+            self.download_glue("mrpc")
+            self.download_glue("sst-2")
            self.download_squad()

        else:

@@ -111,8 +114,8 @@ def download_nvidia_pretrained_weights(self):


    def download_glue(self, glue_task_name):
-        downloader = GLUEDownloader(glue_task_name, self.save_path)
-        downloader.download()
+        downloader = GLUEDownloader(self.save_path)
+        downloader.download(glue_task_name)


def download_squad(self):
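For reference, the lower-cased task names change how the dispatcher is driven. A hypothetical driver loop, assuming a Downloader(dataset_name, save_path) constructor (not shown in this diff) and an illustrative save path:

from Downloader import Downloader

# GLUE task names are now lower-case; 'sst-2' is newly supported.
for task in ('cola', 'mnli', 'mrpc', 'sst-2'):
    Downloader(task, '/workspace/bert/data').download()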
115 changes: 26 additions & 89 deletions TensorFlow/LanguageModeling/BERT/data/GLUEDownloader.py
@@ -11,99 +11,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-import bz2
import os
-import urllib
import sys
-import zipfile
-import io
+import wget

-URLLIB=urllib
-if sys.version_info >= (3, 0):
-    URLLIB=urllib.request
+from pathlib import Path

-class GLUEDownloader:
-    def __init__(self, task, save_path):
-
-        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
-
-        self.TASK2PATH = {"CoLA": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
-                          "SST": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
-                          "MRPC": {"mrpc_dev": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
-                                   "mrpc_train": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt',
-                                   "mrpc_test": 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'},
-                          "QQP": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
-                          "STS": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
-                          "MNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
-                          "SNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
-                          "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLI.zip?alt=media&token=c24cad61-f2df-4f04-9ab6-aa576fa829d0',
-                          "RTE": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
-                          "WNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
-                          "diagnostic": 'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
-
-        self.save_path = save_path
-        if not os.path.exists(self.save_path):
-            os.makedirs(self.save_path)
-
-        self.task = task
-
-    def download(self):
+def mkdir(path):
+    Path(path).mkdir(parents=True, exist_ok=True)
-        if self.task == 'MRPC':
-            self.download_mrpc()
-        elif self.task == 'diagnostic':
-            self.download_diagnostic()
-        else:
-            self.download_and_extract(self.task)

-    def download_and_extract(self, task):
-        print("Downloading and extracting %s..." % task)
-        data_file = "%s.zip" % task
-        URLLIB.urlretrieve(self.TASK2PATH[task], data_file)
-        print(data_file,"\n\n\n")
-        with zipfile.ZipFile(data_file) as zip_ref:
-            zip_ref.extractall(self.save_path)
-        os.remove(data_file)
-        print("\tCompleted!")

-    def download_mrpc(self):
-        print("Processing MRPC...")
-        mrpc_dir = os.path.join(self.save_path, "MRPC")
-        if not os.path.isdir(mrpc_dir):
-            os.mkdir(mrpc_dir)
-
-        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
-        mrpc_dev_file = os.path.join(mrpc_dir, "dev_ids.tsv")
-        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
-
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_train"], mrpc_train_file)
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_test"], mrpc_test_file)
-        URLLIB.urlretrieve(self.TASK2PATH["MRPC"]["mrpc_dev"], mrpc_dev_file)
-
-        dev_ids = []
-        with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
-            for row in ids_fh:
-                dev_ids.append(row.strip().split('\t'))
-
-        with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
-                io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
-                io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
-            header = data_fh.readline()
-            train_fh.write(header)
-            dev_fh.write(header)
-            for row in data_fh:
-                label, id1, id2, s1, s2 = row.strip().split('\t')
-                if [id1, id2] in dev_ids:
-                    dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
-                else:
-                    train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+class GLUEDownloader:

-        with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
-                io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
-            header = data_fh.readline()
-            test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
-            for idx, row in enumerate(data_fh):
-                label, id1, id2, s1, s2 = row.strip().split('\t')
-                test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
-        print("\tCompleted!")
+    def __init__(self, save_path):
+        self.save_path = save_path + '/glue'

+    def download(self, task_name):
+        mkdir(self.save_path)
+        if task_name in {'mrpc', 'mnli'}:
+            task_name = task_name.upper()
+        elif task_name == 'cola':
+            task_name = 'CoLA'
+        else:  # SST-2
+            assert task_name == 'sst-2'
+            task_name = 'SST'
+        wget.download(
+            'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py',
+            out=self.save_path,
+        )
+        sys.path.append(self.save_path)
+        import download_glue_data
+        download_glue_data.main(
+            ['--data_dir', self.save_path, '--tasks', task_name])
+        sys.path.pop()
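A usage sketch of the rewritten class, following the signature in the diff above (the save path is illustrative):

from GLUEDownloader import GLUEDownloader

downloader = GLUEDownloader('/workspace/bert/data')  # files land under /workspace/bert/data/glue
downloader.download('sst-2')                         # fetches SST-2 via the downloaded download_glue_data.py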