diff --git a/pyproject.toml b/pyproject.toml
index 5c431493..4d8e71e6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "python-jobspy"
-version = "1.1.35"
+version = "1.1.36"
 description = "Job scraper for LinkedIn, Indeed, Glassdoor & ZipRecruiter"
 authors = ["Zachary Hampton ", "Cullen Watson "]
 homepage = "https://github.com/Bunsly/JobSpy"
diff --git a/src/jobspy/jobs/__init__.py b/src/jobspy/jobs/__init__.py
index f018b1e7..db600f14 100644
--- a/src/jobspy/jobs/__init__.py
+++ b/src/jobspy/jobs/__init__.py
@@ -1,7 +1,7 @@
-from typing import Union, Optional
+from typing import Optional
 from datetime import date
 from enum import Enum
-from pydantic import BaseModel, validator
+from pydantic import BaseModel
 
 
 class JobType(Enum):
diff --git a/src/jobspy/scrapers/glassdoor/__init__.py b/src/jobspy/scrapers/glassdoor/__init__.py
index 49099b29..69dd3e44 100644
--- a/src/jobspy/scrapers/glassdoor/__init__.py
+++ b/src/jobspy/scrapers/glassdoor/__init__.py
@@ -14,7 +14,7 @@
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import GlassdoorException
-from ..utils import create_session
+from ..utils import create_session, modify_and_get_description
 from ...jobs import (
     JobPost,
     Compensation,
@@ -200,9 +200,7 @@ def fetch_job_description(self, job_id):
         data = response.json()[0]
         desc = data['data']['jobview']['job']['description']
         soup = BeautifulSoup(desc, 'html.parser')
-        description = soup.get_text(separator='\n')
-
-        return description
+        return modify_and_get_description(soup)
 
     @staticmethod
     def parse_compensation(data: dict) -> Optional[Compensation]:
@@ -292,12 +290,11 @@ def get_job_type_enum(job_type_str: str) -> list[JobType] | None:
         for job_type in JobType:
             if job_type_str in job_type.value:
                 return [job_type]
-        return None
 
     @staticmethod
-    def parse_location(location_name: str) -> Location:
+    def parse_location(location_name: str) -> Location | None:
         if not location_name or location_name == "Remote":
-            return None
+            return
         city, _, state = location_name.partition(", ")
         return Location(city=city, state=state)
 
@@ -306,7 +303,6 @@ def get_cursor_for_page(pagination_cursors, page_num):
         for cursor_data in pagination_cursors:
             if cursor_data["pageNumber"] == page_num:
                 return cursor_data["cursor"]
-        return None
 
     @staticmethod
     def headers() -> dict:
diff --git a/src/jobspy/scrapers/indeed/__init__.py b/src/jobspy/scrapers/indeed/__init__.py
index ef7d3f22..eeb7ff89 100644
--- a/src/jobspy/scrapers/indeed/__init__.py
+++ b/src/jobspy/scrapers/indeed/__init__.py
@@ -21,6 +21,7 @@
     extract_emails_from_text,
     create_session,
     get_enum_from_job_type,
+    modify_and_get_description
 )
 from ...jobs import (
     JobPost,
@@ -247,9 +248,7 @@ def get_description(self, job_page_url: str) -> str | None:
             return None
 
         soup = BeautifulSoup(job_description, "html.parser")
-        text_content = "\n".join(soup.stripped_strings)
-
-        return text_content
+        return modify_and_get_description(soup)
 
     @staticmethod
     def get_job_type(job: dict) -> list[JobType] | None:
diff --git a/src/jobspy/scrapers/linkedin/__init__.py b/src/jobspy/scrapers/linkedin/__init__.py
index 882ee1d7..dcdac96a 100644
--- a/src/jobspy/scrapers/linkedin/__init__.py
+++ b/src/jobspy/scrapers/linkedin/__init__.py
@@ -4,23 +4,36 @@
 
 This module contains routines to scrape LinkedIn.
 """
+import time
 import random
 from typing import Optional
 from datetime import datetime
 
 import requests
-import time
 from requests.exceptions import ProxyError
-from bs4 import BeautifulSoup
-from bs4.element import Tag
 from threading import Lock
+from bs4.element import Tag
+from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urlunparse
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import LinkedInException
 from ..utils import create_session
-from ...jobs import JobPost, Location, JobResponse, JobType, Country, Compensation
-from ..utils import count_urgent_words, extract_emails_from_text, get_enum_from_job_type, currency_parser
+from ...jobs import (
+    JobPost,
+    Location,
+    JobResponse,
+    JobType,
+    Country,
+    Compensation
+)
+from ..utils import (
+    count_urgent_words,
+    extract_emails_from_text,
+    get_enum_from_job_type,
+    currency_parser,
+    modify_and_get_description
+)
 
 
 class LinkedInScraper(Scraper):
@@ -213,7 +226,7 @@ def get_job_description(
 
         description = None
         if div_content:
-            description = "\n".join(line.strip() for line in div_content.get_text(separator="\n").splitlines() if line.strip())
+            description = modify_and_get_description(div_content)
 
         def get_job_type(
             soup_job_type: BeautifulSoup,
diff --git a/src/jobspy/scrapers/utils.py b/src/jobspy/scrapers/utils.py
index 862ff783..84e4c0b6 100644
--- a/src/jobspy/scrapers/utils.py
+++ b/src/jobspy/scrapers/utils.py
@@ -8,6 +8,15 @@
 from ..jobs import JobType
 
 
+def modify_and_get_description(soup):
+    for li in soup.find_all('li'):
+        li.string = "- " + li.get_text()
+
+    description = soup.get_text(separator='\n').strip()
+    description = re.sub(r'\n+', '\n', description)
+    return description
+
+
 def count_urgent_words(description: str) -> int:
     """
     Count the number of urgent words or phrases in a job description.
diff --git a/src/jobspy/scrapers/ziprecruiter/__init__.py b/src/jobspy/scrapers/ziprecruiter/__init__.py
index df75be5f..16a67f30 100644
--- a/src/jobspy/scrapers/ziprecruiter/__init__.py
+++ b/src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -10,14 +10,13 @@
 from datetime import datetime, date
 from typing import Optional, Tuple, Any
 
-import requests
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor
 
 from .. import Scraper, ScraperInput, Site
 from ..exceptions import ZipRecruiterException
-from ..utils import count_urgent_words, extract_emails_from_text, create_session
 from ...jobs import JobPost, Compensation, Location, JobResponse, JobType, Country
+from ..utils import count_urgent_words, extract_emails_from_text, create_session, modify_and_get_description
 
 
 class ZipRecruiterScraper(Scraper):
@@ -107,9 +106,9 @@ def process_job(job: dict) -> JobPost:
         title = job.get("name")
         job_url = job.get("job_url")
 
-        description = BeautifulSoup(
-            job.get("job_description", "").strip(), "html.parser"
-        ).get_text(separator="\n")
+        job_description_html = job.get("job_description", "").strip()
+        description_soup = BeautifulSoup(job_description_html, "html.parser")
+        description = modify_and_get_description(description_soup)
 
         company = job["hiring_company"].get("name") if "hiring_company" in job else None
         country_value = "usa" if job.get("job_country") == "US" else "canada"