From af7f353c9de85b8abc27d7c01c67d98470b1be44 Mon Sep 17 00:00:00 2001 From: Divyansh Agarwal Date: Thu, 3 Jun 2021 19:48:55 +0000 Subject: [PATCH] Minor fix for novelguide pagination --- .../data_collection/novelguide/get_works.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/scripts/data_collection/novelguide/get_works.py b/scripts/data_collection/novelguide/get_works.py index d764f2b95..07b85e556 100644 --- a/scripts/data_collection/novelguide/get_works.py +++ b/scripts/data_collection/novelguide/get_works.py @@ -28,37 +28,40 @@ def scrape_index_pages(seed_page): scraped_links = [] for char in alphabet_list: + + page_no = 1 books_page = seed_page + char - page_no = 0 + while(True): - try: + try: - soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser") - items = soup.findAll("ul", {"class": "search-title"}) - books = items[0].findAll("li") + soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser") + items = soup.findAll("ul", {"class": "search-title"}) + books = items[0].findAll("li") - # print (books) + # # Go over each section + for index, item in enumerate(books): + # Parse section to get bullet point text - # # Go over each section - for index, item in enumerate(books): - # Parse section to get bullet point text + item_title = item.find("a").text + item_url = item.find("a").get("href") - item_title = item.find("a").text - item_url = item.find("a").get("href") + print ("item_title: ", item_title.strip()) + print ("item_url: ", item_url.strip()) + print ("\n") - print ("item_title: ", item_title.strip()) - print ("item_url: ", item_url.strip()) - print ("\n") + scraped_links.append({ + "title": item_title.strip(), + "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip()) + }) - scraped_links.append({ - "title": item_title.strip(), - "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip()) - }) + except Exception: + print ("No books found on page: ", books_page) + break - except 
Exception: - - print ("No books found with title: ", char) + books_page = seed_page + char + "?page=" + str(page_no) + page_no += 1 return scraped_links