jezeniel · edgaru · Sep 21, 2020
diff --git a/README.rst b/README.rst
@@ -63,6 +63,12 @@ Example for encoding and decoding:
   sms.parts[0].length  # 7
   sms.parts[0].bytes  # 16
 
+  smsutil.text_to_gsm('|最Som€高でした 	Text	🍔!') # |Som€ Text!
+  smsutil.text_to_gsm('|最Som€高でした 	Text	🍔!', True) # Som Text!
+
+  smsutil.count_non_gsm_characters('|最Som€高でした 	Text	🍔!') # 8
+  smsutil.count_non_gsm_characters('|最Som€高でした 	Text	🍔!', True) # 10
+
 smsutil is just using python's builtin codecs for UCS2/UTF-16.
 
 

diff --git a/smsutil/codecs.py b/smsutil/codecs.py
@@ -6,7 +6,6 @@
 
 from builtins import bytes
 
-
 GSM_BASIC_CHARSET = (
     u'@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ !"#¤%&\'()*+,-./0123456789:;<=>?¡'
     u'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑÜ§¿abcdefghijklmnopqrstuvwxyzäöñüà')
@@ -100,4 +99,16 @@ def is_valid_gsm(text):
     return re.match(r, text, re.UNICODE) is not None
 
 
+def text_to_gsm(text, basic_gsm=False):
+    ''' Strip every non GSM 03.38 character (basic set only if basic_gsm) '''
+    charset = GSM_CHARSET if not basic_gsm else GSM_BASIC_CHARSET
+    return re.sub(u'[^' + re.escape(charset) + ']+', '', text, flags=re.UNICODE)
+
+
+def count_non_gsm_characters(text, basic_gsm=False):
+    ''' Count characters outside GSM 03.38 (basic set only if basic_gsm) '''
+    charset = GSM_CHARSET if not basic_gsm else GSM_BASIC_CHARSET
+    return len(re.findall(u'[^' + re.escape(charset) + ']', text, re.UNICODE))
+
+
 codecs.register(search_gsm0338)
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -3,7 +3,7 @@
 
 import unittest
 
-from smsutil.codecs import GSM_CHARSET, is_valid_gsm
+from smsutil.codecs import GSM_CHARSET, is_valid_gsm, text_to_gsm, count_non_gsm_characters
 
 
 class TestGSMValidator:
@@ -16,6 +16,16 @@ def test_mixed(self):
     def test_invalid_characters(self):
         assert not is_valid_gsm('the quick brown こんにちは')
 
+    def test_text_to_gsm(self):
+        sample = '|最Som€高でした 	Text	🍔!'  # GSM + extension + CJK + tabs + emoji
+        assert text_to_gsm(sample, basic_gsm=False) == "|Som€ Text!"
+        assert text_to_gsm(sample, basic_gsm=True) == "Som Text!"
+
+    def test_count_non_gsm_characters(self):
+        test_string = '|最Som€高でした 	Text	🍔!'
+        assert count_non_gsm_characters(test_string) == 8  # 5 CJK + 2 tabs + emoji
+        assert count_non_gsm_characters(test_string, True) == 10  # plus '|' and '€'
+
 
 class TestCoding(unittest.TestCase):
     def test_roundtrip(self):