Skip to content

Commit

Permalink
Fix issue caused by lxml misinterpreting malformed HTML
Browse files (browse the repository at this point in the history)
  • Loading branch information
neon-ninja committed Jul 28, 2022
1 parent e1288ce commit c4ffccc
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions facebook_scraper/page_iterators.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -148,9 +148,7 @@ def __init__(self, response: Response):

def get_page(self) -> Page:
# Select only elements that have the data-ft attribute
return self._get_page(
'[data-ft*="top_level_post_id"]:not([data-sigil="m-see-translate-link"])', 'article'
)
return self._get_page('article[data-ft*="top_level_post_id"]', 'article')

def get_raw_page(self) -> RawPage:
# Return the object stored on this instance as `self.html`, unmodified.
return self.html
Expand Down Expand Up @@ -206,6 +204,16 @@ def _parse_json(self):
def _get_page(self, selection, selection_name) -> Page:
raw_page = self.get_raw_page()
raw_posts = raw_page.find(selection)
for post in raw_posts:
if not post.find("footer"):
# Due to malformed HTML served by Facebook, lxml might misinterpret where the footer should go in article elements
# If we limit the parsing just to the section element, it fixes it
# Please forgive me for parsing HTML with regex
logger.warning(f"No footer in article - reparsing HTML within <section> element")
html = re.search(r'<section.+?>(.+)</section>', raw_page.html).group(1)
raw_page = utils.make_html_element(html=html)
raw_posts = raw_page.find(selection)
break

if not raw_posts:
logger.warning(
Expand Down

0 comments on commit c4ffccc

Please sign in to comment.