From 5dc4c8b0bafeec85556c5f641a231b25db8e4d89 Mon Sep 17 00:00:00 2001 From: rka87338 Date: Wed, 8 May 2019 09:41:30 +0530 Subject: [PATCH 1/3] scrapped from wikiquote --- pyquotes/wikiscrap.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 pyquotes/wikiscrap.py diff --git a/pyquotes/wikiscrap.py b/pyquotes/wikiscrap.py new file mode 100644 index 0000000..6e06622 --- /dev/null +++ b/pyquotes/wikiscrap.py @@ -0,0 +1,42 @@ +from bs4 import BeautifulSoup +import requests + +source = requests.get("https://en.wikiquote.org/wiki/Main_Page").text +soup = BeautifulSoup(source,"lxml") +#names is list of all names with links in the main page of wikiquote +names = soup.find('div',class_='mw-parser-output').find_all('div')[11].find_all('p')[1].find_all('a') + +def get_quotes(person): + quotes_by_author = list() + for name in names: + if (person == name.text.lower()): + link = "https://en.wikiquote.org" + name['href'] + link = requests.get(link).text + soup_for_indiv = BeautifulSoup(link,"lxml") + quotes = soup_for_indiv.find_all('div',class_='mw-parser-output')[0].find_all('ul') + for quote in quotes: + try: + if (quote.li.b==None): #if there is no quote + continue + elif quote.li.b.text.isdigit()==True: #so that there aren't any numbers + continue + elif len(quote.li.b.text.split(' '))<2: #so that there aren't any words + continue + else: + temp = [quote.li.b.text,name.text] + quotes_by_author.append(tuple(temp)) + except: + continue + else: + continue + return quotes_by_author + +#scrapping for quote of the day +path_for_quote_of_the_day = soup.find_all('table')[2].find_all('tbody')[2].find_all('tr') +quote_of_the_day = path_for_quote_of_the_day[0].td.text +author_for_quote_of_the_day = path_for_quote_of_the_day[1].td.a.text +quote_of_the_day_tuple = (quote_of_the_day.rstrip(),author_for_quote_of_the_day) + +def get_quote_of_the_day(): + return quote_of_the_day_tuple + From 4682597291a22a6017e70125035f5b8b2baa33d3 Mon Sep 17 00:00:00 2001 From: rka87338 Date: Wed, 8 May 2019 11:18:43 +0530 Subject: [PATCH 2/3] changed things according to pep8 --- .vscode/settings.json | 5 +++++ pyquotes/wikiscrap.py | 22 ++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b5f2aa4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.linting.pylintEnabled": false, + "python.linting.pep8Enabled": true, + "python.linting.enabled": true +} \ No newline at end of file diff --git a/pyquotes/wikiscrap.py b/pyquotes/wikiscrap.py index 6e06622..c88750a 100644 --- a/pyquotes/wikiscrap.py +++ b/pyquotes/wikiscrap.py @@ -2,9 +2,11 @@ import requests source = requests.get("https://en.wikiquote.org/wiki/Main_Page").text -soup = BeautifulSoup(source,"lxml") -#names is list of all names with links in the main page of wikiquote -names = soup.find('div',class_='mw-parser-output').find_all('div')[11].find_all('p')[1].find_all('a') +soup = BeautifulSoup(source, "lxml") +# names is list of all names with links in the main page of wikiquote +names_path = soup.find('div', class_='mw-parser-output').find_all('div')[11] +names = names_path.find_all('p')[1].find_all('a') + def get_quotes(person): quotes_by_author = list() @@ -12,18 +14,18 @@ def get_quotes(person): if (person == name.text.lower()): link = "https://en.wikiquote.org" + name['href'] link = requests.get(link).text - soup_for_indiv = BeautifulSoup(link,"lxml") - quotes = soup_for_indiv.find_all('div',class_='mw-parser-output')[0].find_all('ul') + soup_for_indiv = BeautifulSoup(link, "lxml") + quotes = soup_for_indiv.find_all('div', class_='mw-parser-output')[0].find_all('ul') for quote in quotes: - try: - if (quote.li.b==None): #if there is no quote + try: + if quote.li.b is None: continue - elif quote.li.b.text.isdigit()==True: #so that there aren't any numbers + elif quote.li.b.text.isdigit(): continue - elif len(quote.li.b.text.split(' '))<2: #so that there aren't any words + elif len(quote.li.b.text.split(' ')) < 2: continue else: - temp = [quote.li.b.text,name.text] + temp = [quote.li.b.text, name.text] quotes_by_author.append(tuple(temp)) except: continue From bd50d975a06f5a1e2eb8746930bff728265a936d Mon Sep 17 00:00:00 2001 From: rka87338 Date: Sun, 19 May 2019 21:24:02 +0530 Subject: [PATCH 3/3] output of get_quotes func in brainyquote.py is a tuple --- .vscode/settings.json | 5 ----- pyquotes/brainyquote.py | 7 ++----- pyquotes/wikiscrap.py | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 23 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index b5f2aa4..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "python.linting.pylintEnabled": false, - "python.linting.pep8Enabled": true, - "python.linting.enabled": true -} \ No newline at end of file diff --git a/pyquotes/brainyquote.py b/pyquotes/brainyquote.py index 1da25e1..3bd0c6c 100644 --- a/pyquotes/brainyquote.py +++ b/pyquotes/brainyquote.py @@ -51,17 +51,14 @@ def get_quotes(person, category): # Getting the quote of the related author get_quote = soup_author.find_all('a', attrs={'title': 'view quote'}) quote_list = [] - big_list = [] for i in range(count): - quote_list.append(get_quote[i].text) - big_list.append(quote_list) + quote_list.append((get_quote[i].text, person)) if len(quote_list) == 0: return('''Oops! It seems that there are no quotes of the author of that category. \nYou may consider changing the category or the author ''') - quote_list.append(person) - + quote_list = tuple(quote_list) return(quote_list) diff --git a/pyquotes/wikiscrap.py b/pyquotes/wikiscrap.py index c88750a..6b5c090 100644 --- a/pyquotes/wikiscrap.py +++ b/pyquotes/wikiscrap.py @@ -15,15 +15,16 @@ def get_quotes(person): link = "https://en.wikiquote.org" + name['href'] link = requests.get(link).text soup_for_indiv = BeautifulSoup(link, "lxml") - quotes = soup_for_indiv.find_all('div', class_='mw-parser-output')[0].find_all('ul') + q = soup_for_indiv.find_all('div', class_='mw-parser-output')[0] + quotes = q.find_all('ul') for quote in quotes: - try: - if quote.li.b is None: + try: + if quote.li.b is None: continue - elif quote.li.b.text.isdigit(): + elif quote.li.b.text.isdigit(): + continue + elif len(quote.li.b.text.split(' ')) < 2: continue - elif len(quote.li.b.text.split(' ')) < 2: - continue else: temp = [quote.li.b.text, name.text] quotes_by_author.append(tuple(temp)) @@ -33,12 +34,11 @@ def get_quotes(person): continue return quotes_by_author -#scrapping for quote of the day -path_for_quote_of_the_day = soup.find_all('table')[2].find_all('tbody')[2].find_all('tr') -quote_of_the_day = path_for_quote_of_the_day[0].td.text -author_for_quote_of_the_day = path_for_quote_of_the_day[1].td.a.text -quote_of_the_day_tuple = (quote_of_the_day.rstrip(),author_for_quote_of_the_day) +# scrapping for quote of the day +p = soup.find_all('table')[2].find_all('tbody')[2].find_all('tr') +quote_of_the_day = p[0].td.text +author_for_quote_of_the_day = p[1].td.a.text -def get_quote_of_the_day(): - return quote_of_the_day_tuple +def get_quote_of_the_day(): + return (quote_of_the_day.rstrip(), author_for_quote_of_the_day)