Init parsing item details.
kirkbyers committed Mar 29, 2024
1 parent c9b8153 commit 933fa1e
Showing 2 changed files with 53 additions and 6 deletions.
31 changes: 25 additions & 6 deletions src/bin/sm_scraper.rs
@@ -16,26 +16,45 @@ async fn main() {
         .get_url("https://www.sweetmarias.com/green-coffee.html?product_list_limit=all&sm_status=1")
         .await
         .expect("Failed to get URL");
+
     let links = scraper.parse_directory_html(&res);
     println!("Found {} links", links.len());
     sleep(Duration::from_secs(5));
-    for (i, link) in links.iter().enumerate() {
+    for (i, link) in vec![links[0].clone()].iter().enumerate() {
         println!("Scraping link #{}: {}", i, link);
         let item_html = scraper
             .get_url(link.as_str())
             .await
             .expect("Failed to get URL");
-        let item_text = scraper.strip_html_tags(&item_html);
-
+        let item_text = scraper.strip_html_tags(&item_html);
         let id = Uuid::new_v4();
         let now = Utc::now();
+        let table_data = scraper.sm_item_listing_to_details(&item_html);
+        let region: String = table_data
+            .get("region")
+            .unwrap_or(&String::new())
+            .to_string();
+        let processing: String = table_data
+            .get("processing")
+            .unwrap_or(&String::new())
+            .to_string();
+
         // Save the scraped item to the database
         let mut stmt = conn
             .prepare(INSERT_QUERY)
             .await
             .expect("Failed to prepare query.");
+
         match stmt
-            .execute((id.to_string(), link.clone(), item_text, now.to_rfc3339()))
+            .execute((
+                id.to_string(),
+                link.clone(),
+                item_text,
+                now.to_rfc3339(),
+                item_html,
+                region,
+                processing,
+            ))
             .await
         {
             Ok(_) => println!("Scraped and saved item #{}", i),
@@ -46,6 +65,6 @@ async fn main() {
 }
 
 const INSERT_QUERY: &str = r#"
-INSERT INTO sm_scrapes (id, url, content, scraped_at)
-VALUES (?1, ?2, ?3, ?4);
+INSERT INTO sm_scrapes (id, url, content, scraped_at, original, region, processing)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7);
 "#;
28 changes: 28 additions & 0 deletions src/services/scraper.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+
 use reqwest::header::{
     HeaderMap, ACCEPT, ACCEPT_LANGUAGE, CONNECTION, COOKIE, DNT, HOST, UPGRADE_INSECURE_REQUESTS,
     USER_AGENT,
@@ -73,6 +75,32 @@ impl Scraper {
         links
     }
 
+    pub fn sm_item_listing_to_details(&self, html: &str) -> HashMap<String, String> {
+        let document = Html::parse_document(html);
+
+        let table_selectors: HashMap<_, _> = [
+            ("region", "td[data-th='Region']"),
+            ("processing", "td[data-th='Processing']"),
+        ]
+        .iter()
+        .cloned()
+        .collect();
+
+        let table_results: HashMap<String, String> = table_selectors
+            .iter()
+            .map(|(key, selector)| {
+                let selector = Selector::parse(selector).unwrap();
+                let mut result = String::new();
+                for element in document.select(&selector) {
+                    result.push_str(&element.inner_html().trim());
+                }
+                (String::from(*key), result)
+            })
+            .collect();
+
+        table_results
+    }
+
     pub fn strip_html_tags(&self, html: &str) -> String {
         let mut result = String::new();
         let mut copy = String::new();
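A quick usage sketch for the new parser. The Scraper::new() constructor and the HTML snippet are illustrative assumptions (neither appears in this diff); the selectors key off the data-th attribute, so the table's position in the document does not matter.

use std::collections::HashMap;

fn main() {
    // Hypothetical constructor; the diff does not show how Scraper is built.
    let scraper = Scraper::new();

    // Minimal stand-in for a Sweet Maria's item page.
    let html = r#"
        <table>
            <tr><td data-th="Region">Ethiopia</td></tr>
            <tr><td data-th="Processing">Wet Process (Washed)</td></tr>
        </table>
    "#;

    let details: HashMap<String, String> = scraper.sm_item_listing_to_details(html);
    assert_eq!(details.get("region").map(String::as_str), Some("Ethiopia"));
    assert_eq!(
        details.get("processing").map(String::as_str),
        Some("Wet Process (Washed)")
    );
}

Note that both keys are always present in the returned map, because the iteration runs over the selector map rather than over matches: a missing cell yields an empty string, not a missing key. The caller's unwrap_or(&String::new()) guards that case either way.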
