Skip to content

Commit

Permalink
Merge pull request #4 from hoganbc/library
Browse files: browse the repository at this point in the history
Allow vose_sampler to be consumable as a library
  • Loading branch information
asmith26 authored Nov 2, 2018
2 parents 9bc95ef + 324d084 commit 370fe6e
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 41 deletions.
28 changes: 18 additions & 10 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
valid_folder = "tests/file_examples/valid_files/"
invalid_folder = "tests/file_examples/invalid_files/"

empty_file_error = "Error\: Please provide a file containing a corpus \(not an empty file\)."
binary_file_error = "Error\: Please provide a file containing text-based data."
nonnegative_integer_error = "Error\: Please enter a non-negative integer for the number of samples desired\: "
empty_file_error = "Please provide a file containing a corpus \(not an empty file\)."
binary_file_error = "Please provide a file containing text-based data."
nonnegative_integer_error = "Please enter a non-negative integer for the number of samples desired\: "


class TestValidation(unittest.TestCase):
Expand All @@ -25,39 +25,39 @@ class TestValidation(unittest.TestCase):

def test_empty_file(self):
"""Test vose_sampler.get_words against empty files """
self.assertRaisesRegexp(SystemExit, empty_file_error, vose_sampler.get_words, invalid_folder + "empty.txt")
self.assertRaisesRegexp(IOError, empty_file_error, vose_sampler.get_words, invalid_folder + "empty.txt")

def test_binary_file1(self):
"""Test vose_sampler.get_words against .epub files """
self.assertRaisesRegexp(SystemExit, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.epub")
self.assertRaisesRegexp(IOError, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.epub")

def test_binary_file2(self):
"""Test vose_sampler.get_words against .mobi files """
self.assertRaisesRegexp(SystemExit, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.mobi")
self.assertRaisesRegexp(IOError, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.mobi")

def test_binary_file3(self):
"""Test vose_sampler.get_words against .pdf files """
self.assertRaisesRegexp(SystemExit, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.pdf")
self.assertRaisesRegexp(IOError, binary_file_error, vose_sampler.get_words, invalid_folder + "Alice.pdf")

def test_binary_file4(self):
"""Test vose_sampler.get_words against .wav files """
self.assertRaisesRegexp(SystemExit, binary_file_error, vose_sampler.get_words, invalid_folder + "zero.wav")
self.assertRaisesRegexp(IOError, binary_file_error, vose_sampler.get_words, invalid_folder + "zero.wav")

def test_negative_integer(self):
"""Test vose_sampler.VoseAlias.alias_generation against a size
specified by a negative integer. """
words = vose_sampler.get_words(valid_folder + "small.txt")
word_dist = vose_sampler.sample2dist(words)
VA_words = vose_sampler.VoseAlias(word_dist)
self.assertRaisesRegexp(SystemExit, nonnegative_integer_error + "-1", VA_words.sample_n, -1)
self.assertRaisesRegexp(ValueError, nonnegative_integer_error + "-1", VA_words.sample_n, -1)

def test_zero_integer(self):
"""Test vose_sampler.ProbDistribution.alias_generation against a size
defined by zero. """
words = vose_sampler.get_words(valid_folder + "small.txt")
word_dist = vose_sampler.sample2dist(words)
VA_words = vose_sampler.VoseAlias(word_dist)
self.assertRaisesRegexp(SystemExit, nonnegative_integer_error + "0", VA_words.sample_n, 0)
self.assertRaisesRegexp(ValueError, nonnegative_integer_error + "0", VA_words.sample_n, 0)


class TestAccuracy(unittest.TestCase):
Expand Down Expand Up @@ -127,6 +127,14 @@ def test_output_alias_generation(self):
alpha = 0.01
self.assertGreater(p, alpha)

def test_roundtrip(self):
    """Sample from a known two-outcome distribution and check that the
    proportions recomputed by vose_sampler.sample2dist stay within 0.01
    of the original probabilities. """
    # Build the Decimals from strings: Decimal(0.2) would copy the binary
    # floating-point error of the float literal into the distribution.
    dist = {"H": Decimal("0.2"), "T": Decimal("0.8")}
    VA = vose_sampler.VoseAlias(dist)
    sample = VA.sample_n(100000)
    computed_dist = vose_sampler.sample2dist(sample)
    for outcome, expected in dist.items():
        self.assertAlmostEqual(expected, computed_dist.get(outcome), delta=0.01)


if __name__ == "__main__":
unittest.main()
60 changes: 29 additions & 31 deletions vose_sampler/vose_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,44 +78,35 @@ def alias_generation(self):
return self.table_alias[col]

def sample_n(self, size):
""" Retrun a sample of size n from the distribution, and print the results to stdout. """
""" Return a sample of size n from the distribution."""
# Ensure a positive integer has been specified (0 is rejected too)
n = int(size)
try:
if n <= 0:
raise ValueError("Please enter a non-negative integer for the number of samples desired: %d" % n)
except ValueError as ve:
sys.exit("\nError: %s" % ve)
if n <= 0:
raise ValueError("Please enter a non-negative integer for the number of samples desired: %d" % n)

print("\nGenerating %d random samples:\n" % n)
for i in range(n):
print(self.alias_generation())
return [self.alias_generation() for i in range(n)]


#HELPER FUNCTIONS
def get_words(file):
""" (str) -> list
Return a list of words from a given corpus. """

try:
# Ensure the file is not empty
if os.stat(file).st_size == 0:
raise IOError("Please provide a file containing a corpus (not an empty file).")

# Ensure the file is text based (not binary). This is based on the implementation
# of the Linux file command
textchars = bytearray([7,8,9,10,12,13,27]) + bytearray(range(0x20, 0x100))
bin_file = open(file, "rb")
# Ensure the file is not empty
if os.stat(file).st_size == 0:
raise IOError("Please provide a file containing a corpus (not an empty file).")

# Ensure the file is text based (not binary). This is based on the implementation
# of the Linux file command
textchars = bytearray([7,8,9,10,12,13,27]) + bytearray(range(0x20, 0x100))
with open(file, "rb") as bin_file:
if bool(bin_file.read(2048).translate(None, textchars)):
raise IOError("Please provide a file containing text-based data.")
bin_file.close()

with open(file, "r") as corpus:
words = corpus.read().split()
return words
with open(file, "r") as corpus:
words = corpus.read().split()
return words

except IOError as ioe:
sys.exit("\nError: %s" % ioe)

def sample2dist(sample):
""" (list) -> dict (i.e {outcome:proportion})
Expand Down Expand Up @@ -144,7 +135,7 @@ def handle_options():
parser = OptionParser()
parser.add_option("-p", "--path", dest="path",
help="[REQUIRED] Path to corpus.", metavar="FILE")
parser.add_option("-n", "--num", dest="n",
parser.add_option("-n", "--num", dest="n", type=int,
help="[REQUIRED] Non-negative integer specifying how many samples are desired.", metavar="INT")

(options, args) = parser.parse_args()
Expand All @@ -155,13 +146,20 @@ def main():
# Handle command line arguments
options = handle_options()

# Construct distribution
words = get_words(options.path)
word_dist = sample2dist(words)
VA_words = VoseAlias(word_dist)
try:
# Construct distribution
words = get_words(options.path)
word_dist = sample2dist(words)
VA_words = VoseAlias(word_dist)

# Sample n words
print("\nGenerating %d random samples:\n" % options.n)
sample = VA_words.sample_n(options.n)
for s in sample:
print(s)
except Exception as e:
sys.exit("\nError: %s" % e)

# Sample n words
VA_words.sample_n(options.n)

if __name__ == "__main__":
main()

0 comments on commit 370fe6e

Please sign in to comment.