Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added text_to_gsm and count_non_gsm_characters functions. #11

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ Example for encoding and decoding:
sms.parts[0].length # 7
sms.parts[0].bytes # 16

smsutil.text_to_gsm('|最Som€高でした Text 🍔!') # |Som€ Text !
smsutil.text_to_gsm('|最Som€高でした Text 🍔!', True) # Som Text !

smsutil.count_non_gsm_characters('|最Som€高でした Text 🍔!') # 6
smsutil.count_non_gsm_characters('|最Som€高でした Text 🍔!', True) # 8

smsutil is just using python's builtin codecs for UCS2/UTF-16.


Expand Down
13 changes: 12 additions & 1 deletion smsutil/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from builtins import bytes


# Characters of the GSM basic alphabet, concatenated into a single string.
# NOTE(review): presumably a character's index in this string is its 7-bit
# code (e.g. '@' at 0, ESC '\x1b' at 27) -- confirm against the codec code
# that consumes this constant.
GSM_BASIC_CHARSET = (
u'@£$¥èéùìòÇ\nØø\rÅåΔ_ΦΓΛΩΠΨΣΘΞ\x1bÆæßÉ !"#¤%&\'()*+,-./0123456789:;<=>?¡'
u'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÑܧ¿abcdefghijklmnopqrstuvwxyzäöñüà')
Expand Down Expand Up @@ -100,4 +99,16 @@ def is_valid_gsm(text):
return re.match(r, text, re.UNICODE) is not None


def text_to_gsm(text, basic_gsm=False):
    ''' Remove every character that is not part of the GSM 03.38 charset.

    :param text: the text to filter.
    :param basic_gsm: when true, only characters of GSM_BASIC_CHARSET are
        kept; otherwise every character of GSM_CHARSET is kept.
    :return: ``text`` with all characters outside the chosen charset removed.
    '''
    charset = GSM_BASIC_CHARSET if basic_gsm else GSM_CHARSET
    pattern = u'[^' + re.escape(charset) + ']+'
    # flags has to be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so a positional re.UNICODE would be read as a
    # replacement limit instead of a flag.
    return re.sub(pattern, '', text, flags=re.UNICODE)


def count_non_gsm_characters(text, basic_gsm=False):
    ''' Count the characters that are not part of the GSM 03.38 charset.

    :param text: the text to inspect.
    :param basic_gsm: when true, characters are checked against
        GSM_BASIC_CHARSET; otherwise against GSM_CHARSET.
    :return: the number of characters in ``text`` outside the chosen charset.
    '''
    charset = GSM_BASIC_CHARSET if basic_gsm else GSM_CHARSET
    # The leading '^' negates the class; without it the pattern would match
    # (and count) the GSM characters instead of the non-GSM ones.
    pattern = u'[^' + re.escape(charset) + ']'
    return len(re.findall(pattern, text, flags=re.UNICODE))


codecs.register(search_gsm0338)
12 changes: 11 additions & 1 deletion tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import unittest

from smsutil.codecs import GSM_CHARSET, is_valid_gsm
from smsutil.codecs import GSM_CHARSET, is_valid_gsm, text_to_gsm, count_non_gsm_characters


class TestGSMValidator:
Expand All @@ -16,6 +16,16 @@ def test_mixed(self):
def test_invalid_characters(self):
    ''' Text mixing Latin letters with Japanese characters is not valid GSM. '''
    sample = 'the quick brown こんにちは'
    is_valid = is_valid_gsm(sample)
    assert not is_valid

def test_text_to_gsm(self):
    ''' Non-GSM characters are stripped; GSM characters, spaces included, stay. '''
    test_string = '|最Som€高でした Text 🍔!'
    # The space in front of the emoji is itself a GSM character, so it
    # survives the filtering and ends up right before the '!'.
    assert text_to_gsm(test_string) == "|Som€ Text !"
    # '|' and '€' are absent from GSM_BASIC_CHARSET, so the basic mode
    # drops them as well.
    assert text_to_gsm(test_string, True) == "Som Text !"

def test_count_non_gsm_characters(self):
    ''' Every character outside the chosen charset is counted exactly once. '''
    # The method name needs the ``test_`` prefix, otherwise pytest never
    # collects it and the assertions below are silently skipped.
    test_string = '|最Som€高でした Text 🍔!'
    # Expected counts are derived from the offending characters themselves
    # so they hold regardless of how the interpreter sizes the emoji.
    non_gsm = '最高でした🍔'
    assert count_non_gsm_characters(test_string) == len(non_gsm)
    # '|' and '€' are absent from GSM_BASIC_CHARSET, so the basic mode
    # counts them in addition to the characters above.
    assert count_non_gsm_characters(test_string, True) == len(non_gsm + '|€')


class TestCoding(unittest.TestCase):
def test_roundtrip(self):
Expand Down