diff --git a/README.md b/README.md
index f1257cc..60362bd 100644
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ and test data. Note that the official CoNLL 2003 evaluation script requires Perl
 You can then begin training with the following command:
 
 ```bash
-python tfnlp.trainer.py --job-dir data/experiments/conll-03 \
+python tfnlp/trainer.py --job-dir data/experiments/conll-03 \
 --train path/to/conll03/eng.train \
 --valid path/to/conll03/eng.testa \
 --test path/to/conll03/eng.testb \
@@ -102,7 +102,7 @@ and optional test files `test-wsj.conll` and `test-brown.conll`.
 You can then begin training using the following command:
 
 ```bash
-python tfnlp.trainer.py --job-dir data/experiments/conll-05 \
+python tfnlp/trainer.py --job-dir data/experiments/conll-05 \
 --train path/to/conll05/train-set.conll \
 --valid path/to/conll05/dev-set.conll \
 --test path/to/conll05/test-wsj.conll \
@@ -130,7 +130,7 @@ into the Stanford Dependency format.
 To train on the English CoNLL-2009 dependency data (using provided predicted POS tags), you can use the following command:
 
 ```bash
-python tfnlp.trainer.py --job-dir data/experiments/conll-09-en \
+python tfnlp/trainer.py --job-dir data/experiments/conll-09-en \
 --train path/to/CoNLL2009-ST-English/CoNLL2009-ST-English-train.txt \
 --valid path/to/CoNLL2009-ST-English-development.txt \
 --config data/config/parsing/parser-config.json \
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index fe36be4..a608480 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -1,11 +1,8 @@
-tensorflow==1.15.2
-tensorflow_hub==0.7.0
-tensor2tensor==1.15.5
-tensorflow-probability==0.8.0
-albert-tensorflow
-bert-tensorflow
+tensorflow==1.15.*
+tensorflow_hub==0.10.0
+tensorflow-probability==0.7.0
+albert-tensorflow==1.1
+bert-tensorflow==1.0.1
 sentencepiece
 sklearn
-numpy
-nltk
-six>=1.13.0
\ No newline at end of file
+nltk
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c139bff..ce81108 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,8 @@
-tensorflow-gpu==1.15.2
-tensorflow_hub==0.7.0
-tensor2tensor==1.15.5
-tensorflow-probability==0.8.0
-albert-tensorflow
-bert-tensorflow
+tensorflow-gpu==1.15.*
+tensorflow_hub==0.10.0
+tensorflow-probability==0.7.0
+albert-tensorflow==1.1
+bert-tensorflow==1.0.1
 sentencepiece
 sklearn
-numpy
-nltk
-six>=1.13.0
\ No newline at end of file
+nltk
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 5bf627b..9adfb90 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,10 @@
 from setuptools import find_packages, setup
 
 REQUIRED_PACKAGES = [
-    "tensorflow-gpu==1.15.0",
-    "tensorflow-hub==0.7.0",
-    "tensor2tensor==1.15.5",
+    "tensorflow-gpu==1.15.*",
+    "tensorflow-hub==0.10.0",
     "bert-tensorflow==1.0.1",
-    "tensorflow-probability==0.8.0",
+    "tensorflow-probability==0.7.0",
     "numpy>=1.14.2",
     "nltk>=3.2.5",
 ]
diff --git a/tfnlp/cli/evaluators.py b/tfnlp/cli/evaluators.py
index 6ef458e..07bc98b 100644
--- a/tfnlp/cli/evaluators.py
+++ b/tfnlp/cli/evaluators.py
@@ -189,8 +189,10 @@ def start(self):
         self.indices = []
 
     def accumulate(self, instance, result):
-        self.labels.append([label for label in result[self.target_key] if label != BERT_SUBLABEL])
-        self.gold.append([label for label in instance[self.labels_key] if label != BERT_SUBLABEL])
+        gold = [label for label in instance[self.labels_key] if label != BERT_SUBLABEL]
+        predicted = [label for label in result[self.target_key] if label != BERT_SUBLABEL][:len(gold)]
+        self.labels.append(predicted)
+        self.gold.append(gold)
         self.indices.append(instance[constants.SENTENCE_INDEX])
 
     def evaluate(self, identifier=None):
@@ -342,7 +344,10 @@ def start(self):
 
     def accumulate(self, instance, result):
         super().accumulate(instance, result)
-        self.markers.append(instance[constants.MARKER_KEY])
+        idx = instance[constants.MARKER_KEY]
+        if isinstance(idx, list):
+            idx = idx.index('1')
+        self.markers.append(idx)
 
     def evaluate(self, identifier='.'):
         write_props_to_file(self.output_path + '.gold.txt', self.gold, self.markers, self.indices)
diff --git a/tfnlp/layers/layers.py b/tfnlp/layers/layers.py
index 464e1ea..6b123f1 100644
--- a/tfnlp/layers/layers.py
+++ b/tfnlp/layers/layers.py
@@ -2,7 +2,7 @@
 import tensorflow as tf
 import tensorflow_estimator as tfe
 import tensorflow_hub as hub
-from tensor2tensor.layers.common_attention import add_timing_signal_1d, attention_bias_ignore_padding, multihead_attention
+
 from tensorflow.compat.v1 import get_variable
 from tensorflow.compat.v1 import logging
 from tensorflow.compat.v1 import variable_scope
@@ -21,6 +21,7 @@
 from tensorflow.python.ops.rnn_cell_impl import DropoutWrapper, LSTMStateTuple, LayerRNNCell
 
 from tfnlp.common import constants
+from tfnlp.layers.transformers import add_timing_signal_1d, attention_bias_ignore_padding, multihead_attention
 
 ELMO_URL = "https://tfhub.dev/google/elmo/2"
 
@@ -546,14 +547,13 @@ def _residual(_x, _y):
         x = _layer_norm(inputs)
 
         # multi-head self-attention
-        y = multihead_attention(query_antecedent=x, memory_antecedent=None,
+        y = multihead_attention(query_antecedent=x,
                                 bias=attention_bias,
                                 total_key_depth=self_attention_dim,
                                 total_value_depth=self_attention_dim,
                                 output_depth=self_attention_dim,
                                 num_heads=config.num_heads,
-                                dropout_rate=config.attention_dropout if training else 0,
-                                attention_type="dot_product")
+                                dropout_rate=config.attention_dropout if training else 0)
         x = _residual(x, y)
 
         with variable_scope("ffnn"):
diff --git a/tfnlp/layers/transformers.py b/tfnlp/layers/transformers.py
new file mode 100644
index 0000000..a86cd41
--- /dev/null
+++ b/tfnlp/layers/transformers.py
@@ -0,0 +1,362 @@
+# Copyright 2019 The Tensor2Tensor Authors.
+#
+# Modifications Copyright 2020 James Gung.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import tensorflow as tf
+
+
+def cast_like(x, y):
+    """Cast x to y's dtype, if necessary."""
+    x = tf.convert_to_tensor(x)
+    y = tf.convert_to_tensor(y)
+
+    if x.dtype.base_dtype == y.dtype.base_dtype:
+        return x
+
+    cast_x = tf.cast(x, y.dtype)
+    if cast_x.device != x.device:
+        x_name = "(eager Tensor)"
+        try:
+            x_name = x.name
+        except AttributeError:
+            pass
+        tf.logging.warning("Cast for %s may induce copy from '%s' to '%s'", x_name,
+                           x.device, cast_x.device)
+    return cast_x
+
+
+def shape_list(x):
+    """Return list of dims, statically where possible."""
+    x = tf.convert_to_tensor(x)
+
+    # If unknown rank, return dynamic shape
+    if x.get_shape().dims is None:
+        return tf.shape(x)
+
+    static = x.get_shape().as_list()
+    shape = tf.shape(x)
+
+    ret = []
+    for i, dim in enumerate(static):
+        if dim is None:
+            dim = shape[i]
+        ret.append(dim)
+    return ret
+
+
+def large_compatible_negative(tensor_type):
+    """Large negative number as Tensor.
+
+    This function is necessary because the standard value for epsilon
+    in this module (-1e9) cannot be represented using tf.float16
+
+    Args:
+      tensor_type: a dtype to determine the type.
+
+    Returns:
+      a large negative number.
+    """
+    if tensor_type == tf.float16:
+        return tf.float16.min
+    return -1e9
+
+
+def add_timing_signal_1d(x,
+                         min_timescale=1.0,
+                         max_timescale=1.0e4,
+                         start_index=0):
+    """Adds a bunch of sinusoids of different frequencies to a Tensor.
+
+    Each channel of the input Tensor is incremented by a sinusoid of a different
+    frequency and phase.
+
+    This allows attention to learn to use absolute and relative positions.
+    Timing signals should be added to some precursors of both the query and the
+    memory inputs to attention.
+
+    The use of relative position is possible because sin(x+y) and cos(x+y) can be
+    expressed in terms of y, sin(x) and cos(x).
+
+    In particular, we use a geometric sequence of timescales starting with
+    min_timescale and ending with max_timescale. The number of different
+    timescales is equal to channels / 2. For each timescale, we
+    generate the two sinusoidal signals sin(timestep/timescale) and
+    cos(timestep/timescale). All of these sinusoids are concatenated in
+    the channels dimension.
+
+    Args:
+      x: a Tensor with shape [batch, length, channels]
+      min_timescale: a float
+      max_timescale: a float
+      start_index: index of first position
+
+    Returns:
+      a Tensor the same shape as x.
+    """
+    length = shape_list(x)[1]
+    channels = shape_list(x)[2]
+    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale, start_index)
+    return x + cast_like(signal, x)
+
+
+def get_timing_signal_1d(length,
+                         channels,
+                         min_timescale=1.0,
+                         max_timescale=1.0e4,
+                         start_index=0):
+    """Gets a bunch of sinusoids of different frequencies.
+
+    Each channel of the input Tensor is incremented by a sinusoid of a different
+    frequency and phase.
+
+    This allows attention to learn to use absolute and relative positions.
+    Timing signals should be added to some precursors of both the query and the
+    memory inputs to attention.
+
+    The use of relative position is possible because sin(x+y) and cos(x+y) can be
+    expressed in terms of y, sin(x) and cos(x).
+
+    In particular, we use a geometric sequence of timescales starting with
+    min_timescale and ending with max_timescale. The number of different
+    timescales is equal to channels / 2. For each timescale, we
+    generate the two sinusoidal signals sin(timestep/timescale) and
+    cos(timestep/timescale). All of these sinusoids are concatenated in
+    the channels dimension.
+
+    Args:
+      length: scalar, length of timing signal sequence.
+      channels: scalar, size of timing embeddings to create. The number of
+        different timescales is equal to channels / 2.
+      min_timescale: a float
+      max_timescale: a float
+      start_index: index of first position
+
+    Returns:
+      a Tensor of timing signals [1, length, channels]
+    """
+    position = tf.to_float(tf.range(length) + start_index)
+    num_timescales = channels // 2
+    log_timescale_increment = (
+        math.log(float(max_timescale) / float(min_timescale)) /
+        tf.maximum(tf.to_float(num_timescales) - 1, 1))
+    inv_timescales = min_timescale * tf.exp(
+        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
+    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
+    # Please note that this slightly differs from the published paper.
+    # See a discussion here: https://github.com/tensorflow/tensor2tensor/pull/177
+    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
+    signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
+    signal = tf.reshape(signal, [1, length, channels])
+    return signal
+
+
+def attention_bias_ignore_padding(memory_padding):
+    """Create a bias tensor to be added to attention logits.
+
+    Args:
+      memory_padding: a float `Tensor` with shape [batch, memory_length].
+
+    Returns:
+      a `Tensor` with shape [batch, 1, 1, memory_length].
+    """
+    ret = memory_padding * large_compatible_negative(memory_padding.dtype)
+    return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1)
+
+
+def dot_product_attention(q,
+                          k,
+                          v,
+                          bias,
+                          dropout_rate=0.0,
+                          name="dot_product_attention"):
+    """Dot-product attention.
+
+    Args:
+      q: Tensor with shape [..., length_q, depth_k].
+      k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
+        match with q.
+      v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must
+        match with q.
+      bias: bias Tensor (see attention_bias())
+      dropout_rate: a float.
+      name: an optional string
+
+    Returns:
+      Tensor with shape [..., length_q, depth_v].
+    """
+    with tf.variable_scope(name, values=[q, k, v]):
+        logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]
+        if bias is not None:
+            bias = cast_like(bias, logits)
+            logits += bias
+        # If logits are fp16, upcast before softmax
+        weights = tf.nn.softmax(logits, name="attention_weights")
+        weights = cast_like(weights, q)
+        # Drop out attention links for each head.
+        weights = tf.nn.dropout(weights, keep_prob=1 - dropout_rate)
+        return tf.matmul(weights, v)
+
+
+def combine_last_two_dimensions(x):
+    """Reshape x so that the last two dimensions become one.
+
+    Args:
+      x: a Tensor with shape [..., a, b]
+
+    Returns:
+      a Tensor with shape [..., ab]
+    """
+    x_shape = shape_list(x)
+    a, b = x_shape[-2:]
+    return tf.reshape(x, x_shape[:-2] + [a * b])
+
+
+def combine_heads(x):
+    """Inverse of split_heads.
+
+    Args:
+      x: a Tensor with shape [batch, num_heads, length, channels / num_heads]
+
+    Returns:
+      a Tensor with shape [batch, length, channels]
+    """
+    return combine_last_two_dimensions(tf.transpose(x, [0, 2, 1, 3]))
+
+
+def split_last_dimension(x, n):
+    """Reshape x so that the last dimension becomes two dimensions.
+
+    The first of these two dimensions is n.
+
+    Args:
+      x: a Tensor with shape [..., m]
+      n: an integer.
+
+    Returns:
+      a Tensor with shape [..., n, m/n]
+    """
+    x_shape = shape_list(x)
+    m = x_shape[-1]
+    if isinstance(m, int) and isinstance(n, int):
+        assert m % n == 0
+    return tf.reshape(x, x_shape[:-1] + [n, m // n])
+
+
+def split_heads(x, num_heads):
+    """Split channels (dimension 2) into multiple heads (becomes dimension 1).
+
+    Args:
+      x: a Tensor with shape [batch, length, channels]
+      num_heads: an integer
+
+    Returns:
+      a Tensor with shape [batch, num_heads, length, channels / num_heads]
+    """
+    return tf.transpose(split_last_dimension(x, num_heads), [0, 2, 1, 3])
+
+
+def compute_attention_component(antecedent,
+                                total_depth,
+                                name):
+    """Computes attention component (query, key or value).
+
+    Args:
+      antecedent: a Tensor with shape [batch, length, channels]
+      total_depth: an integer
+      name: a string specifying scope name.
+
+    Returns:
+      c : [batch, length, depth] tensor
+    """
+    return tf.layers.dense(antecedent, total_depth, use_bias=False, name=name)
+
+
+def compute_qkv(query_antecedent,
+                total_key_depth,
+                total_value_depth):
+    """Computes query, key and value.
+
+    Args:
+      query_antecedent: a Tensor with shape [batch, length_q, channels]
+      total_key_depth: an integer
+      total_value_depth: an integer
+
+    Returns:
+      q, k, v : [batch, length, depth] tensors
+    """
+    q = compute_attention_component(query_antecedent, total_key_depth, "q")
+    k = compute_attention_component(query_antecedent, total_key_depth, "k")
+    v = compute_attention_component(query_antecedent, total_value_depth, "v")
+    return q, k, v
+
+
+def multihead_attention(query_antecedent,
+                        bias,
+                        total_key_depth,
+                        total_value_depth,
+                        output_depth,
+                        num_heads,
+                        dropout_rate,
+                        name="multihead_attention"):
+    """Multihead scaled-dot-product attention with input/output transformations.
+
+    Args:
+      query_antecedent: a Tensor with shape [batch, length_q, channels]
+      bias: bias Tensor (see attention_bias())
+      total_key_depth: an integer
+      total_value_depth: an integer
+      output_depth: an integer
+      num_heads: an integer dividing total_key_depth and total_value_depth
+      dropout_rate: a floating point number
+      name: an optional string.
+
+    Returns:
+      The result of the attention transformation. The output shape is
+          [batch_size, length_q, hidden_dim]
+
+    Raises:
+      ValueError: if the key depth or value depth are not divisible by the
+        number of attention heads.
+    """
+    if total_key_depth % num_heads != 0:
+        raise ValueError("Key depth (%d) must be divisible by the number of "
+                         "attention heads (%d)." % (total_key_depth, num_heads))
+    if total_value_depth % num_heads != 0:
+        raise ValueError("Value depth (%d) must be divisible by the number of "
+                         "attention heads (%d)." % (total_value_depth, num_heads))
+
+    with tf.variable_scope(name, values=[query_antecedent]):
+
+        q, k, v = compute_qkv(query_antecedent, total_key_depth, total_value_depth)
+
+        q = split_heads(q, num_heads)
+        k = split_heads(k, num_heads)
+        v = split_heads(v, num_heads)
+
+        key_depth_per_head = total_key_depth // num_heads
+        q *= key_depth_per_head ** -0.5
+
+        x = dot_product_attention(q, k, v, bias, dropout_rate)
+
+        x = combine_heads(x)
+
+        # Set last dim specifically.
+        x.set_shape(x.shape.as_list()[:-1] + [total_value_depth])
+
+        x = tf.layers.dense(x, output_depth, use_bias=False, name="output_transform")
+
+        return x
diff --git a/tfnlp/predictor.py b/tfnlp/predictor.py
index 7559963..f43b0de 100644
--- a/tfnlp/predictor.py
+++ b/tfnlp/predictor.py
@@ -133,14 +133,14 @@ def from_config_and_savedmodel(path_to_config: str, path_to_savedmodel: str, pat
     tf.logging.info("Loading predictor from saved model at %s" % path_to_savedmodel)
     tf_predictor = _default_predictor(path_to_savedmodel)
     parser_function = get_parser(config)
-    feature_function = _get_feature_function(config.features, path_to_vocab)
+    feature_function = _get_feature_function(config.features, config.heads, path_to_vocab)
     formatter = get_formatter(config)
 
     return Predictor(tf_predictor, parser_function, feature_function, formatter,
                      default_batching_function(config.batch_size))
 
 
-def _get_feature_function(config: object, path_to_vocab: str) -> Callable[[dict], str]:
-    feature_extractor = get_feature_extractor(config)
+def _get_feature_function(feature_config: object, heads_config, path_to_vocab: str) -> Callable[[dict], str]:
+    feature_extractor = get_feature_extractor(feature_config, heads=heads_config)
     feature_extractor.read_vocab(path_to_vocab)
     return lambda instance: feature_extractor.extract(instance, train=False).SerializeToString()