Skip to content

Commit

Permalink
Trying another tokenizer fix. #1964
Browse files Browse the repository at this point in the history
  • Loading branch information
dessalines committed Dec 21, 2021
1 parent 8ba0f0c commit 855515f
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions crates/utils/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::future::Future;
use thiserror::Error;
use tracing::error;
use tracing::{error, info};
use url::Url;
use webpage::HTML;

Expand Down Expand Up @@ -58,19 +58,37 @@ pub struct SiteMetadata {

/// Fetches the page at `url` and extracts its link-preview metadata
/// (title, description, image, etc).
///
/// Sends a GET request through `client`, reads the whole response body as raw
/// bytes, decodes those bytes as UTF-8 (lossily) and hands the resulting text
/// to `html_to_site_metadata`.
///
/// # Errors
///
/// Propagates a failure to send the request, maps a failure to read the
/// response body into `RecvError`, and propagates any error returned by
/// `html_to_site_metadata`.
pub async fn fetch_site_metadata(client: &Client, url: &Url) -> Result<SiteMetadata, LemmyError> {
info!("Fetching site metadata for url: {}", url);
let response = client.get(url.as_str()).send().await?;

// NOTE(review): the next two lines look like the pre-commit `.text()` version
// kept by the diff view; the `.bytes()` chain below appears to replace them.
// Confirm against the repository before treating both as live code.
let html = response
.text()
// Can't use .text() here, because it only checks the content header, not the actual bytes
// https://github.com/LemmyNet/lemmy/issues/1964
let html_bytes = response
.bytes()
.await
// NOTE(review): the `;`-terminated `.map_err` line below presumably belongs to
// the removed `.text()` chain; the one after it continues into `.to_vec()`.
.map_err(|e| RecvError(e.to_string()))?;
.map_err(|e| RecvError(e.to_string()))?
.to_vec();

// Lossy decoding: invalid UTF-8 sequences become U+FFFD instead of failing.
let html = String::from_utf8_lossy(&html_bytes);

let tags = html_to_site_metadata(&html)?;

Ok(tags)
}

fn html_to_site_metadata(html: &str) -> Result<SiteMetadata, LemmyError> {
// Make sure the first line is doctype html
let first_line = html
.lines()
.into_iter()
.next()
.ok_or_else(|| anyhow!("No lines in html"))?
.to_lowercase();

if !first_line.starts_with("<!doctype html>") {
return Err(anyhow!("Site metadata page fetch is not DOCTYPE html",).into());
}

let page = HTML::from_string(html.to_string(), None)?;

let page_title = page.title;
Expand Down

0 comments on commit 855515f

Please sign in to comment.