-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PEP8 Fixes for Summarization. #1017
Changes from 4 commits
f051cbf
c3b772e
c2e20c9
fdcbc75
482ab89
1298db6
f731f19
b21068f
fc3a8af
19312fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
|
||
# bring model classes directly into package namespace, to save some typing | ||
from .summarizer import summarize, summarize_corpus | ||
from .keywords import keywords | ||
from .keywords import keywords |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,5 +16,6 @@ def build_graph(sequence): | |
|
||
def remove_unreachable_nodes(graph): | ||
for node in graph.nodes(): | ||
if sum(graph.edge_weight((node, other)) for other in graph.neighbors(node)) == 0: | ||
if sum(graph.edge_weight((node, other)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We don't check for length of line. |
||
for other in graph.neighbors(node)) == 0: | ||
graph.del_node(node) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -161,7 +161,8 @@ def _get_combined_keywords(_keywords, split_text): | |
if word in _keywords: | ||
combined_word = [word] | ||
if i + 1 == len_text: | ||
result.append(word) # appends last word if keyword and doesn't iterate | ||
# appends last word if keyword and doesn't iterate | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ditto. |
||
result.append(word) | ||
for j in xrange(i + 1, len_text): | ||
other_word = _strip_word(split_text[j]) | ||
if other_word in _keywords and other_word == split_text[j] and not other_word in combined_word: | ||
|
@@ -197,7 +198,16 @@ def _format_results(_keywords, combined_keywords, split, scores): | |
return "\n".join(combined_keywords) | ||
|
||
|
||
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False): | ||
def keywords( | ||
text, | ||
ratio=0.2, | ||
words=None, | ||
split=False, | ||
scores=False, | ||
pos_filter=[ | ||
'NN', | ||
'JJ'], | ||
lemmatize=False): | ||
# Gets a dict of word -> lemma | ||
text = to_unicode(text) | ||
tokens = _clean_text_by_word(text) | ||
|
@@ -210,7 +220,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= | |
|
||
_remove_unreachable_nodes(graph) | ||
|
||
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score | ||
# Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> | ||
# score | ||
pagerank_scores = _pagerank(graph) | ||
|
||
extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words) | ||
|
@@ -225,7 +236,8 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter= | |
|
||
keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word) | ||
|
||
# text.split() to keep numbers and punctuation marks, so separeted concepts are not combined | ||
# text.split() to keep numbers and punctuation marks, so separeted | ||
# concepts are not combined | ||
combined_keywords = _get_combined_keywords(keywords, text.split()) | ||
|
||
return _format_results(keywords, combined_keywords, split, scores) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -110,12 +110,17 @@ def _get_sentences_with_word_count(sentences, word_count): | |
return selected_sentences | ||
|
||
|
||
def _extract_important_sentences(sentences, corpus, important_docs, word_count): | ||
def _extract_important_sentences( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Definitely not. |
||
sentences, | ||
corpus, | ||
important_docs, | ||
word_count): | ||
important_sentences = _get_important_sentences(sentences, corpus, important_docs) | ||
|
||
# If no "word_count" option is provided, the number of sentences is | ||
# reduced by the provided ratio. Else, the ratio is ignored. | ||
return important_sentences if word_count is None else _get_sentences_with_word_count(important_sentences, word_count) | ||
return important_sentences if word_count is None else _get_sentences_with_word_count( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No. |
||
important_sentences, word_count) | ||
|
||
|
||
def _format_results(extracted_sentences, split): | ||
|
@@ -158,7 +163,8 @@ def summarize_corpus(corpus, ratio=0.2): | |
_set_graph_edge_weights(graph) | ||
_remove_unreachable_nodes(graph) | ||
|
||
# Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. | ||
# Cannot calculate eigenvectors if number of unique words in text < 3. | ||
# Warns user to add more text. The function ends. | ||
if len(graph.nodes()) < 3: | ||
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") | ||
return | ||
|
@@ -198,10 +204,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False): | |
logger.warning("Input text is empty.") | ||
return | ||
|
||
# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). | ||
# If only one sentence is present, the function raises an error (Avoids | ||
# ZeroDivisionError). | ||
if len(sentences) == 1: | ||
raise ValueError("input must have more than one sentence") | ||
|
||
# Warns if the text is too short. | ||
if len(sentences) < INPUT_MIN_LENGTH: | ||
logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No vertical indent.
Splitting the mega-expression into something saner (subexpressions) will help both readability and line length.