kaushu42 · kaushu42 · Nov 23, 2020 · Oct 22, 2020 · Oct 22, 2020 · Oct 22, 2020
diff --git a/Nepali_nlp/Nepali_tokenizer.py b/Nepali_nlp/Nepali_tokenizer.py
@@ -1,9 +1,14 @@
+import os
+import sys 
+sys.path.append('..')
 import string
 
+import tensorflow as tf
+import sentencepiece as spm
 
 class Tokenizer:
     def __init__(self):
-        pass
+        self.this_dir, self.this_file = os.path.split(__file__)
 
     def sentence_tokenize(self, text):
         """This function tokenize the sentences
@@ -14,7 +19,7 @@ def sentence_tokenize(self, text):
         Returns:
             sentence {list} -- tokenized sentence in list
         """
-        sentences = text.split(u"।")
+        sentences = text.strip().split(u"।")
         sentences = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in sentences]
         return sentences
 
@@ -63,5 +68,23 @@ def character_tokenize(self, word):
 
         return char
 
+    def sentencepeice_tokenize(self, text):
+        """Tokenize text into subword pieces with SentencePiece (https://github.com/google/sentencepiece).
+
+        Args:
+            text (string): Text in Nepali language
+
+        Returns:
+            list: subword pieces produced by the bundled "local_dataset/m_bpe.model".
+        """
+        model_path = os.path.join(self.this_dir, "local_dataset", "m_bpe.model")
+        # TF1 exposes GFile under tf.gfile; TF2 only has tf.io.gfile.
+        gfile = tf.gfile if hasattr(tf, "gfile") else tf.io.gfile
+        with gfile.GFile(model_path, "rb") as model_file:
+            model = model_file.read()
+        sp = spm.SentencePieceProcessor()
+        sp.load_from_serialized_proto(model)
+        return sp.encode_as_pieces(text)
+
     def __str__(self):
         return "Helps to tokenize content written in Nepali language."
diff --git a/Nepali_nlp/local_dataset/m_bpe.model b/Nepali_nlp/local_dataset/m_bpe.model