From bb32715b51ffb3fdd414aee9da8339cd56dc9662 Mon Sep 17 00:00:00 2001 From: Slater Podgorny Date: Wed, 29 May 2024 11:52:59 -0700 Subject: [PATCH] Remove web scraping from rhub.py --- elm/web/rhub.py | 711 +++++------------------------------------------- 1 file changed, 61 insertions(+), 650 deletions(-) diff --git a/elm/web/rhub.py b/elm/web/rhub.py index 8f5f339e..6f16c399 100644 --- a/elm/web/rhub.py +++ b/elm/web/rhub.py @@ -4,634 +4,15 @@ import os import os.path import logging -from urllib.request import urlopen import json import math import re import requests import pandas as pd -from bs4 import BeautifulSoup logger = logging.getLogger(__name__) -class ResearchOutputs(): - """Class to handle publications portion of the NREL researcher hub.""" - BASE_URL = "https://research-hub.nrel.gov/en/publications/?page=0" - - def __init__(self, url, n_pages=1, txt_dir='./ew_txt'): - """ - Parameters - ---------- - url : str - Research hub publications URL, most likely - https://research-hub.nrel.gov/en/publications/ - n_pages : int - Number of pages to get from the API. Typical response has 50 - entries per page. Default of 1 ensures that this class doesnt hang - on a million responses. - txt_dir : str - File directory where you would like to save output .txt files. - """ - - self.text_dir = txt_dir - self.all_links = [] - for p in range(0, n_pages): - url = url + f"?page={p}" - html = self.html_response(url) - self.soup = BeautifulSoup(html, "html.parser") - - self.target = self.soup.find('ul', {'class': 'list-results'}) - self.docs = self.target.find_all('a', {'class': 'link'}) - - page_links = [d['href'] for d in self.docs if - '/publications/' in d['href']] - self.all_links.extend(page_links) - - def html_response(self, url): - """Function to retrieve html response. - - Parameters - ---------- - url : str - URL of interest. - - Returns - ------- - html : str - HTML response output. - """ - with urlopen(url) as page: - html = page.read().decode("utf-8") - - return html - - def _scrape_authors(self, soup_inst): - """Scrape the names of authors associated with given publication. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given publication. - - Returns - ------- - authors : list - List of all authors (strings) that contributed to publication. - """ - - authors = soup_inst.find('p', {'class': 'relations persons'}).text - - return authors - - def _scrape_links(self, soup_inst): - """Scrape the links under 'Access to Document' header - for a publication. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given publication. - - Returns - ------- - doi link : str - DOI link for a reference if it exists. - pdf link : str - PDF link for a reference if it exists - """ - - doi_target = soup_inst.find('ul', {'class': 'dois'}) - if doi_target: - doi = doi_target.find('a')['href'] - else: - doi = '' - - pdf_target = soup_inst.find('ul', {'class': 'links'}) - if pdf_target: - pdf = pdf_target.find('a')['href'] - else: - pdf = '' - - return doi, pdf - - def _scrape_category(self, soup_inst): - """Scrape the category (ex: Technical Report, Journal Article, etc) - for a given publication. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given publication. - - Returns - ------- - category : str - Publication category for a given record. 
- """ - - try: - category = soup_inst.find('span', - {'class': - 'type_classification'}).text - except AttributeError: - category = soup_inst.find('span', - {'class': - 'type_classification_parent'}).text - - return category - - def _scrape_year(self, soup_inst): - """Scrape publication year for a given publication. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given publication. - - Returns - ------- - year : str - The year a record was published. - """ - year = soup_inst.find('span', {'class': 'date'}).text - - return year - - def _scrape_id(self, soup_inst): - """Scrape the NREL Publication Number for a given publication. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given publication. - - Returns - ------- - NREL Publication Number: str - Publication number for a record, unique identifier. - """ - - nrel_id = soup_inst.find('ul', {'class': 'relations keywords'}).text - - return nrel_id - - def build_meta(self): - """Build a meta dataframe containing relevant information - for publications. - - Returns - ------- - publications_meta : pd.DataFrame - Dataframe containing metadata for publications. - """ - publications_meta = pd.DataFrame(columns=('title', 'nrel_id', - 'authors', 'year', - 'url', 'doi', - 'pdf_url', 'category')) - for link in self.all_links[:20]: # quantity control here # - with urlopen(link) as page: - html = page.read().decode("utf-8") - meta_soup = BeautifulSoup(html, "html.parser") - - title = meta_soup.find('h1').text - nrel_id = self._scrape_id(meta_soup) - authors = self._scrape_authors(meta_soup) - doi = self._scrape_links(meta_soup)[0] - pdf_url = self._scrape_links(meta_soup)[1] - category = self._scrape_category(meta_soup) - year = self._scrape_year(meta_soup) - - new_row = {'title': title, - 'nrel_id': nrel_id, - 'year': year, - 'authors': authors, - 'url': link, - 'doi': doi, - 'pdf_url': pdf_url, - 'category': category - } - - publications_meta.loc[len(publications_meta)] = new_row - - return publications_meta - - def download_pdf(self, pdf_dir, txt_dir, soup_inst): - """Downloads a pdf for a given link - - Parameters - ---------- - out_dir: str - Directory where the .pdf files should be saved. - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance used to locate pdf url. - """ - pdf_target = soup_inst.find('ul', {'class': 'links'}) - if pdf_target: - pdf_url = pdf_target.find('a')['href'] - - fn = os.path.basename(pdf_url) - fp_out = os.path.join(pdf_dir, fn) - - if pdf_url and pdf_url.endswith('.pdf'): - if not os.path.exists(fp_out): - session = requests.Session() - response = session.get(pdf_url) - with open(fp_out, 'wb') as f_pdf: - f_pdf.write(response.content) - logger.info('Downloaded {}'.format(fn)) - else: - logger.info('{} has already been downloaded'.format(fn)) - elif not pdf_url.endswith('.pdf'): - parent_url = soup_inst.find(property="og:url")['content'] - fn = os.path.basename(parent_url) + '_abstract.txt' - logger.info('No PDF file for {}. Processing abstract.'.format(fn)) - self.scrape_abstract(txt_dir, fn, soup_inst) - - def scrape_abstract(self, out_dir, fn, soup_inst): - """Scrapes abstract for a provided publication - - Parameters - ---------- - out_dir: str - Directory where the .txt files should be saved. - fn: str - File name for saving the file. - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance used for scraping. 
- """ - out_fp = os.path.join(out_dir, fn) - if not os.path.exists(out_fp): - title = soup_inst.find('h1').text - target = soup_inst.find('h2', string='Abstract') - if target: - abstract = target.find_next_siblings()[0].text - full_txt = (f'The report titled {title} can be ' - f'summarized as follows: {abstract}') - with open(out_fp, "w") as text_file: - text_file.write(full_txt) - else: - logger.info('Abstract not found for {}'.format(fn)) - else: - logger.info('{} has already been processed.'.format(out_fp)) - - def scrape_publications(self, pdf_dir, txt_dir): - """Downloads pdfs for all Technical Reports and scrapes abstracts - for all other publications listed. - - Parameters - ---------- - pdf_dir: str - Directory where the .pdf files should be saved. - txt_dir: str - Directory where the .txt files should be saved. - """ - - os.makedirs(pdf_dir, exist_ok=True) - os.makedirs(txt_dir, exist_ok=True) - url_list = self.all_links[:20] # quantity control here # - - for pub in url_list: - with urlopen(pub) as page: - html = page.read().decode("utf-8") - pubs_soup = BeautifulSoup(html, "html.parser") - - category = self._scrape_category(pubs_soup) - - if category == 'Technical Report': - self.download_pdf(pdf_dir, txt_dir, pubs_soup) - else: - fn = os.path.basename(pub) + '_abstract.txt' - self.scrape_abstract(txt_dir, fn, pubs_soup) - - return logger.info('Finished processing publications') - - -class ResearcherProfiles(): - """ - Class to handle researcher profiles portion of the NREL researcher hub. - """ - BASE_URL = "https://research-hub.nrel.gov/en/persons/?page=0" - - def __init__(self, url, n_pages=1, txt_dir='./ew_txt'): - """ - Parameters - ---------- - url : str - Research hub profiles URL, most likely - https://research-hub.nrel.gov/en/persons/ - n_pages : int - Number of pages to get from the API. Typical response has 50 - entries per page. Default of 1 ensures that this class doesnt hang - on a million responses. - txt_dir : str - File directory where you would like to save output .txt files. - """ - - self.text_dir = txt_dir - self.profile_links = [] - for p in range(0, n_pages): - url_base = url + f"?page={p}" - with urlopen(url_base) as page: - html = page.read().decode("utf-8") - soup = BeautifulSoup(html, "html.parser") - - target = soup.find('ul', {'class': 'grid-results'}) - docs = target.find_all('a', {'class': 'link'}) - - page_links = [d['href'] for d in docs if '/persons/' in d['href']] - self.profile_links.extend(page_links) - - def build_meta(self): - """Build a meta dataframe containing relevant information for - researchers. - - Returns - ------- - profiles_meta : pd.DataFrame - Dataframe containing metadata for researcher profiles. 
- """ - url_list = self.profile_links - profiles_meta = pd.DataFrame(columns=('title', 'nrel_id', - 'email', 'url', 'fn', - 'category' - )) - for link in url_list[:20]: # quantity control here # - with urlopen(link) as page: - html = page.read().decode("utf-8") - meta_soup = BeautifulSoup(html, "html.parser") - - title = meta_soup.find('h1').text - email_target = meta_soup.find('a', {'class': 'email'}) - if email_target: - email = meta_soup.find('a', - {'class': 'email'} - ).text.replace('nrelgov', '@nrel.gov') - else: - email = '' - id = os.path.basename(link) - fn = os.path.basename(link) + '.txt' - - new_row = {'title': title, - 'nrel_id': id, - 'email': email, - 'url': link, - 'fn': fn, - 'category': 'Researcher Profile' - } - - profiles_meta.loc[len(profiles_meta)] = new_row - - return profiles_meta - - def _scrape_title(self, soup_inst): - """Scrapes name and position for each researcher. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given researcher. - - Returns - ------- - intro : str - String containing researchers name and position. - """ - - r = soup_inst.find('h1').text - - if soup_inst.find('span', {'class': 'job-title'}): - j = soup_inst.find('span', {'class': 'job-title'}).text - intro = (f'The following is brief biography for {r} ' - f'who is a {j} at the National Renewable Energy ' - f'Laboratory:\n') - else: - intro = (f'The following is brief biography for {r}' - f'who works for the National Renewable Energy ' - f'Laboratory:\n') - - return intro - - def _scrape_bio(self, soup_inst): - """Scrapes 'Personal Profile' section for each researcher. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given researcher. - - Returns - ------- - bio : str - String containing background text from profile. - """ - target = soup_inst.find('h3', string="Personal Profile") - - bio = '' - if target: - for sib in target.find_next_siblings(): - if sib.name == "h3": - break - bio = bio + sib.text - - return bio - - def _scrape_lists(self, soup_inst, heading): - """Scrapes sections such as 'Professional Experience' and - 'Research Interests' - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given researcher. - heading: str - Section to scrape. Should be 'Professional Experience' or - 'Research Interests' - - Returns - ------- - text : str - String containing contents from the experience section. - """ - r = soup_inst.find('h1').text - target = soup_inst.find('h3', string=heading) - - exp_list = [] - - if target: - for sib in target.find_next_siblings(): - exp_list.append(sib.text) - - exp = ', '.join(exp_list) - - text = f"{r}'s {heading} includes the following:\n{exp} " - else: - text = '' - - return text - - def _scrape_education(self, soup_inst): - """Scrapes and reformats 'Education/Academic Qualification' - section for each researcher. - - Parameters - ---------- - soup_inst : bs4.BeautifulSoup - Instantiated beautiful soup instance for the url associated with a - given researcher. - - Returns - ------- - full_text : str - String containing researcher's education (level, focus, - and institution). 
- """ - r = soup_inst.find('h1').text - target = soup_inst.find('h3', - string='Education/Academic Qualification') - - full_text = '' - if target: - for sib in target.find_next_siblings(): - t = sib.text - if len(t.split(',')) >= 3: - level = t.split(',')[0] - deg = t.split(',')[1] - inst = ','.join(t.split(',')[2:]) - - text = (f"{r} received a {level} degree in {deg} " - f"from the {inst}. ") - elif len(t.split(',')) == 2: - level = t.split(',')[0] - inst = t.split(',')[1] - - text = f"{r} received a {level} degree from the {inst}. " - - full_text = full_text + text - - return full_text - - def _scrape_publications(self, profile_url): - """Scrapes the name of each publication that a - researcher contributed to. - - Parameters - ---------- - profile_url : str - Link to a specific researchers profile. - - Returns - ------- - text : str - String containing names of all publications for a given researcher. - """ - pubs_url = profile_url + '/publications/' - with urlopen(pubs_url) as page: - html = page.read().decode("utf-8") - pubs_soup = BeautifulSoup(html, "html.parser") - - r = pubs_soup.find('h1').text - target = pubs_soup.find_all('h3', {'class': 'title'}) - - pubs = [] - if target: - for p in target: - pubs.append(p.text) - - pubs = ', '.join(pubs) - text = (f'{r} has contributed to the following ' - f'publications: {pubs}.') - else: - text = '' - - return text - - def _scrape_similar(self, profile_url): - """Scrapes the names listed under the 'Similar Profiles' section. - - Parameters - ---------- - profile_url : str - Link to a specific researchers profile. - - Returns - ------- - text : str - String containing names of similar researchers. - """ - sim_url = profile_url + '/similar/' - with urlopen(sim_url) as sim_page: - sim_html = sim_page.read().decode("utf-8") - sim_soup = BeautifulSoup(sim_html, "html.parser") - - r = sim_soup.find('h1').text - target = sim_soup.find_all('h3', {'class': 'title'}) - - similar = [] - if target: - for p in target: - similar.append(p.text) - - similar = ', '.join(similar) - text = f'{r} has worked on projects with {similar}.' - else: - text = '' - - return text - - def scrape_profiles(self, out_dir): - """Scrapes profiles for each researcher. - - Parameters - ---------- - out_dir: str - Directory where the .txt files should be saved. 
- """ - os.makedirs(out_dir, exist_ok=True) - url_list = self.profile_links[:20] # quantity control here # - - for i, prof in enumerate(url_list): - f = os.path.basename(prof) + '.txt' - txt_fp = os.path.join(out_dir, f) - if not os.path.exists(txt_fp): - with urlopen(prof) as page: - html = page.read().decode("utf-8") - prof_soup = BeautifulSoup(html, "html.parser") - - r = prof_soup.find('h1').text - - intro = self._scrape_title(prof_soup) - bio = self._scrape_bio(prof_soup) - exp = self._scrape_lists(prof_soup, 'Professional Experience') - interests = self._scrape_lists(prof_soup, 'Research Interests') - edu = self._scrape_education(prof_soup) - pubs = self._scrape_publications(prof) - similar = self._scrape_similar(prof) - - full_txt = (intro + bio + '\n' + exp + '\n' - + interests + '\n' + edu + '\n' - + pubs + '\n' + similar) - - with open(txt_fp, "w") as text_file: - text_file.write(full_txt) - logger.info('Profile {}/{}: {} saved to ' - '{}'.format(i + 1, len(url_list), - r, txt_fp)) - - else: - logger.info('Profile {}/{} already ' - 'exists.'.format(i + 1, len(url_list))) - return logger.info('Finished processing profiles') - - class ProfilesRecord(dict): """Class to handle a single profiles as dictionary data""" def __init__(self, record): @@ -650,6 +31,7 @@ def __init__(self, record): @staticmethod def clean_text(html_text): """Clean html text from API response + Parameters ---------- html_text : str @@ -667,6 +49,7 @@ def clean_text(html_text): @property def title(self): """Get the full name of this researcher. + Returns ------- full : str @@ -682,22 +65,25 @@ def title(self): @property def email(self): """Get the email address of this researcher. + Returns ------- email : str Email address of researcher. """ + email = None orgs = self.get('staffOrganisationAssociations') if orgs: emails_dict = orgs[0].get('emails') if emails_dict: email = emails_dict[0].get('value').get('value') - return email + return email @property def url(self): """Get the url or this researcher's profile. + Returns ------- url : str @@ -711,6 +97,7 @@ def url(self): @property def id(self): """Get API ID of researcher. + Returns ------- id : str @@ -724,23 +111,26 @@ def id(self): @property def position(self): """Get the position of this researcher. + Returns ------- position : str Researcher's position. """ + position = None org = self.get('staffOrganisationAssociations') if org: info = org[0].get('jobDescription') text = info.get('text')[0] position = text.get('value') - return position + return position @property def profile_information(self): """Get key profile information for this record: Personal Profile, Research Interests, Professional Experience. + Returns ------- bio : str @@ -779,6 +169,7 @@ def profile_information(self): @property def education(self): """Get the education information of this researcher. 
+ Returns ------- levels : list @@ -794,40 +185,44 @@ def education(self): if edu: for e in edu: - if e.get('projectTitle'): - quali = e.get('qualification') - level = quali.get('term').get('text')[0].get('value') - deg = e.get('projectTitle').get('text')[0].get('value') - org = e.get('organisationalUnits') - - if org: - value = org[0].get('externalOrganisationalUnit') - name = value.get('name') - school = name.get('text')[0].get('value') + try: + if e.get('projectTitle'): + quali = e.get('qualification') + level = quali.get('term').get('text')[0].get('value') + deg = e.get('projectTitle').get('text')[0].get('value') + org = e.get('organisationalUnits') + + if org: + value = org[0].get('externalOrganisationalUnit') + name = value.get('name') + school = name.get('text')[0].get('value') + else: + deg_school = deg + deg = deg_school.split(',')[0] + school = deg_school.split(',')[1] + + deg_string = (f'{researcher_name} has a {level} ' + f'degree in {deg} from {school}. ') + out_strings.append(deg_string) else: - deg_school = deg - deg = deg_school.split(',')[0] - school = deg_school.split(',')[1] - - deg_string = (f'{researcher_name} has a {level} ' - f'degree in {deg} from {school}. ') - out_strings.append(deg_string) - else: - quali = e.get('qualification') - level = quali.get('term').get('text')[0].get('value') - org = e.get('organisationalUnits')[0] - name = org.get('externalOrganisationalUnit').get('name') - school = name.get('text')[0].get('value') + quali = e.get('qualification') + level = quali.get('term').get('text')[0].get('value') + org = e.get('organisationalUnits')[0] + name = org.get('externalOrganisationalUnit').get('name') + school = name.get('text')[0].get('value') - deg_string = (f'{researcher_name} has a {level} ' - f'degree from {school}. ') - out_strings.append(deg_string) + deg_string = (f'{researcher_name} has a {level} ' + f'degree from {school}. ') + out_strings.append(deg_string) + except Exception as e: + pass return out_strings @property def publications(self): """Get the publications this researcher contributed to. + Returns ------- pubs : list @@ -855,6 +250,11 @@ def publications(self): def download(self, fp): """Download text file containing researchers profile information. + + Parameters + ---------- + fp : str + Filepath to download this record to. """ name = self.title @@ -929,7 +329,8 @@ def _get_first(self): Returns ------- - list + first_page : list + First page of records as a list. """ self._response = self._session.get( self.url, @@ -1051,6 +452,7 @@ def __init__(self, record): @property def title(self): """Get the title of this publication. + Returns ------- title : str @@ -1063,6 +465,7 @@ def title(self): @property def year(self): """Get the publish year. + Returns ------- year : int @@ -1076,6 +479,7 @@ def year(self): @property def url(self): """Get the url associated with the publication. + Returns ------- url : str @@ -1090,6 +494,7 @@ def url(self): def id(self): """Get the 'NREL Publication Number' for this record. + Returns ------- id : str @@ -1105,6 +510,7 @@ def id(self): @property def authors(self): """Get the names of all authors for a publication. + Returns ------- out : str @@ -1129,7 +535,8 @@ def authors(self): @property def category(self): - """Get the category for this publication. + """Get the publication category for this record. + Returns ------- cat : str @@ -1144,6 +551,7 @@ def category(self): @property def links(self): """Get the doi and pdf links for a publication. 
+ Returns ------- doi : str @@ -1167,6 +575,7 @@ def links(self): @property def abstract(self): """Get the abstract text for this publication. + Returns ------- value : str @@ -1181,6 +590,7 @@ def abstract(self): def save_abstract(self, abstract_text, out_fp): """Download abstract text to .txt file to the directory provided. + Parameters ---------- abstract_text : str @@ -1197,8 +607,9 @@ def save_abstract(self, abstract_text, out_fp): def download(self, pdf_dir, txt_dir): """Download PDFs and TXT files to the directories provided. If a record - does not fit the criteria for PDF download,a TXT file with the record + does not fit the criteria for PDF download, a TXT file with the record abstract will be saved to the TXT directory. + Parameters ---------- pdf_dir : str
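
Below is a minimal usage sketch of the API-backed record classes that remain in elm/web/rhub.py after this change. The ProfilesRecord properties and download() signature match this diff; the endpoint URL, query parameters, and JSON layout are illustrative assumptions only, since the list/paging classes that actually issue the requests are not shown in full in this excerpt.

    # Hedged sketch: fetch one researcher record from the (assumed) researcher
    # hub API endpoint and format it with the retained ProfilesRecord class.
    import os

    import requests

    from elm.web.rhub import ProfilesRecord

    API_URL = "https://research-hub.nrel.gov/ws/api/persons"  # assumed endpoint
    OUT_DIR = "./ew_txt"

    response = requests.get(API_URL, params={"size": 1}, timeout=30)
    response.raise_for_status()
    items = response.json().get("items", [])  # assumed response layout

    os.makedirs(OUT_DIR, exist_ok=True)
    for item in items:
        record = ProfilesRecord(item)
        print(record.title, record.email, record.url)
        # Write the researcher's profile text to a .txt file on disk.
        record.download(os.path.join(OUT_DIR, f"{record.id}.txt"))

    # PublicationsRecord (also retained in this module) is used the same way,
    # except its download() takes pdf_dir and txt_dir arguments instead of a
    # single filepath.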