forked from aneesha/RAKE
-
Notifications
You must be signed in to change notification settings - Fork 222
/
rake_tutorial.py
68 lines (49 loc) · 2.47 KB
/
rake_tutorial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import absolute_import
from __future__ import print_function
import six
__author__ = 'a_medelyan'
import rake
import operator
import io
# EXAMPLE ONE - SIMPLE
stoppath = "data/stoplists/SmartStoplist.txt"
# 1. initialize RAKE by providing a path to a stopwords file
rake_object = rake.Rake(stoppath, 5, 3, 4)
# 2. run on RAKE on a given text
sample_file = io.open("data/docs/fao_test/w2167e.txt", 'r',encoding="iso-8859-1")
text = sample_file.read()
keywords = rake_object.run(text)
# 3. print results
print("Keywords:", keywords)
print("----------")
# EXAMPLE TWO - BEHIND THE SCENES (from https://github.com/aneesha/RAKE/rake.py)
# 1. initialize RAKE by providing a path to a stopwords file
rake_object = rake.Rake(stoppath)
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility " \
"of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. " \
"Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating"\
" sets of solutions for all types of systems are given. These criteria and the corresponding algorithms " \
"for constructing a minimal supporting set of solutions can be used in solving all the considered types of " \
"systems and systems of mixed types."
# 1. Split text into sentences
sentenceList = rake.split_sentences(text)
for sentence in sentenceList:
print("Sentence:", sentence)
# generate candidate keywords
stopwords = rake.load_stop_words(stoppath)
stopwordpattern = rake.build_stop_word_regex(stoppath)
phraseList = rake.generate_candidate_keywords(sentenceList, stopwordpattern, stopwords)
print("Phrases:", phraseList)
# calculate individual word scores
wordscores = rake.calculate_word_scores(phraseList)
# generate candidate keyword scores
keywordcandidates = rake.generate_candidate_keyword_scores(phraseList, wordscores)
for candidate in keywordcandidates.keys():
print("Candidate: ", candidate, ", score: ", keywordcandidates.get(candidate))
# sort candidates by score to determine top-scoring keywords
sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
totalKeywords = len(sortedKeywords)
# for example, you could just take the top third as the final keywords
for keyword in sortedKeywords[0:int(totalKeywords / 3)]:
print("Keyword: ", keyword[0], ", score: ", keyword[1])
print(rake_object.run(text))