-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add files via upload #4
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
""" | ||
John Wen - Text Mining assignment | ||
|
||
""" | ||
|
||
import requests | ||
import random | ||
|
||
def gettext(url, timeout=30):
    """Download the document at ``url`` and return its body as text.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response body as a ``str``, decoded by ``requests``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than analysing an error page as if it
    # were the requested text.
    response.raise_for_status()
    return response.text
|
||
# Download the source text once, at import time, and keep it in a module-level
# string; it is the corpus passed to sentencegenerator at the bottom of the file.
# NOTE(review): presumably Dickens' "A Christmas Carol", going by the variable
# name -- confirm against the URL.
Christmas = gettext('http://www.gutenberg.org/cache/epub/46/pg46.txt')
#OliverTwist = gettext('http://www.gutenberg.org/ebooks/730.txt.utf-8') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove the commented-out code from your final code. |
||
#DavidCopperfield = gettext('http://www.gutenberg.org/files/766/766-0.txt') | ||
#GreatExpectations = gettext('http://www.gutenberg.org/files/1400/1400-0.txt') | ||
#ATaleofTwoCities = gettext('http://www.gutenberg.org/files/98/98-0.txt') | ||
|
||
def cleanuplist(textlist):
    """Split a text into lowercase words with all ASCII punctuation removed.

    The text is lowercased and split on whitespace; every character listed in
    ``string.punctuation`` is then deleted from each word. Words that consist
    only of punctuation disappear from the result.

    Args:
        textlist: The text to clean, as a single string.

    Returns:
        A list of the cleaned, non-empty words in their original order.

    >>> cleanuplist('This project is so hard!')
    ['this', 'project', 'is', 'so', 'hard']
    >>> cleanuplist('I need, a bunch, of !? doctest?')
    ['i', 'need', 'a', 'bunch', 'of', 'doctest']
    >>> cleanuplist('He said "hello" to me')
    ['he', 'said', 'hello', 'to', 'me']
    """
    import string

    # One translation table, built once, deletes every punctuation character
    # in a single pass per word. This also covers characters such as double
    # quotes that a hand-typed symbol list easily misses.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(strip_punctuation)
                for word in textlist.lower().split())
    return [word for word in stripped if word]
|
||
#print(len(cleanuplist(ATaleofTwoCities))) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove the commented-out code. |
||
|
||
|
||
def wordcounter(text):
    """Count how often each word occurs in ``text``.

    The text is first normalised with ``cleanuplist`` (lowercased, symbols
    removed), so differently-cased spellings are counted together.

    Args:
        text: The text to analyse, as a single string.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance.

    >>> wordcounter('This This This is is Another Docstring Test Test Is!!!!!!!!!')
    {'this': 3, 'is': 3, 'another': 1, 'docstring': 1, 'test': 2}
    """
    from collections import Counter

    # Counter does the tallying; convert back to a plain dict so callers (and
    # the doctest above) keep seeing the same type as before.
    return dict(Counter(cleanuplist(text)))
|
||
|
||
def highestfreqword(d):
    """Return the entry of ``d`` with the largest value.

    Args:
        d: A non-empty dict mapping words to their counts.

    Returns:
        A ``(word, count)`` tuple for the highest count. When several words
        share the highest count, the one that appears first in ``d`` wins.

    Raises:
        ValueError: If ``d`` is empty.

    >>> highestfreqword({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15})
    ('large', 15)
    """
    # A single pass over the keys, ranked by their counts; max() keeps the
    # first maximal key, so ties resolve in insertion order.
    best = max(d, key=d.get)
    return best, d[best]
|
||
|
||
def topNvalues(d, n):
    """Return the ``n`` entries of ``d`` with the largest values.

    The input dict is left unmodified.

    Args:
        d: A dict mapping words to their counts.
        n: How many entries to return. If ``n`` exceeds the number of
            entries, all of them are returned; if ``n`` is zero or negative,
            the result is empty.

    Returns:
        A list of ``(word, count)`` tuples ordered from highest to lowest
        count. Words with equal counts keep their order from ``d``.

    >>> topNvalues({'the': 5, 'apple': 2, 'is' : 3, 'large' : 15},2)
    [('large', 15), ('the', 5)]
    """
    if n <= 0:
        return []
    # sorted() builds a new list, so the caller's dict is never touched, and
    # its stability (even with reverse=True) keeps ties in insertion order.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]
|
||
#print(topNvalues(wordcounter(ATaleofTwoCities),5)) | ||
|
||
def uniquewordsused(s):
    """Return how many words occur exactly once in the string.

    The text is normalised with ``cleanuplist`` first. A word that is
    repeated anywhere in the text is not counted at all.

    Args:
        s: The text to analyse, as a single string.

    Returns:
        The number of words that appear exactly one time.

    >>> uniquewordsused('There should be six words used')
    6
    >>> uniquewordsused('There should be seven words used here')
    7
    >>> uniquewordsused('the cat and the hat')
    3
    """
    from collections import Counter

    # Tally every word once, then count the ones seen a single time. This
    # avoids re-scanning the rest of the text for every word.
    occurrences = Counter(cleanuplist(s))
    return sum(1 for count in occurrences.values() if count == 1)
|
||
def dictionaryofprefixes(s):
    """Build the empty prefix index for a text.

    Every distinct word of the cleaned text becomes a key whose value is a
    fresh empty list; repeated words yield a single key. suffixdictionary
    produces the same key set and fills the lists with following words.

    >>> dictionaryofprefixes('This This This is another doctring test')
    {'this': [], 'is': [], 'another': [], 'doctring': [], 'test': []}
    """
    # A repeated word simply re-assigns its key; the key keeps the position
    # of its first appearance.
    return {token: [] for token in cleanuplist(s)}
|
||
|
||
|
||
def suffixdictionary(s):
    """Map every word of a text to the words that directly follow it.

    The text is normalised with ``cleanuplist``. Each distinct word becomes a
    key; its value lists, without duplicates and in order of first
    occurrence, every word found immediately after it somewhere in the text.
    A word that only ever appears at the very end maps to an empty list.

    Args:
        s: The text to analyse, as a single string.

    Returns:
        A dict of ``word -> list of following words``, with keys in order of
        first appearance.

    >>> suffixdictionary('This cat is this cars best friend. This docstring is this assignments')
    {'this': ['cat', 'cars', 'docstring', 'assignments'], 'cat': ['is'], 'is': ['this'], 'cars': ['best'], 'best': ['friend'], 'friend': ['this'], 'docstring': ['is'], 'assignments': []}
    """
    # Clean the text once and build the key index from that same list
    # instead of cleaning the whole text a second time.
    words = cleanuplist(s)
    suffixes = {word: [] for word in words}
    # Walk over each (word, next word) pair of neighbours.
    for prefix, following in zip(words, words[1:]):
        followers = suffixes[prefix]
        if following not in followers:
            followers.append(following)
    return suffixes
|
||
#print(suffixes('This is a practice for a dictionary prefixes example so a cat can compute this program!!!')) | ||
|
||
|
||
def sentencegenerator(text, startword, length=10):
    """Generate a random word chain from the word pairs found in ``text``.

    Starting with ``startword``, each next word is picked at random from the
    words that directly follow the current word somewhere in ``text`` (see
    ``suffixdictionary``).

    Args:
        text: The source text used to learn which words follow which.
        startword: First word of the sentence; it must be a cleaned
            (lowercase, symbol-free) word that occurs in ``text``.
        length: Maximum number of words to generate after ``startword``.

    Returns:
        The words joined by single spaces. The sentence is shorter than
        ``length + 1`` words if the chain reaches a word with no recorded
        follower.

    Raises:
        KeyError: If ``startword`` does not occur in the cleaned text.
    """
    d = suffixdictionary(text)
    sentence = [startword]
    for _ in range(length):
        followers = d[sentence[-1]]
        if not followers:
            # Dead end: the current word only appears at the very end of the
            # text, so there is nothing to choose from.
            break
        sentence.append(random.choice(followers))
    return ' '.join(sentence)
|
||
|
||
# Print a random sentence built from the downloaded text: the start word 'the'
# followed by up to 20 generated words. This statement sits outside the
# __main__ guard, so it also runs whenever the module is imported.
print(sentencegenerator((Christmas),'the',20))


if __name__ == "__main__":
    import doctest
    # Run the examples embedded in the docstrings of this module;
    # verbose=False reports failures only.
    doctest.testmod(verbose=False)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This docstring does not explain what this function is about. Please refer to Oliver's GeneFinder solution. https://github.com/sd17fall/GeneFinder/blob/formatted/gene_finder.py