Update sm scraper.
kirkbyers committed Mar 30, 2024
1 parent 933fa1e commit 22aafb3
Showing 3 changed files with 71 additions and 15 deletions.
70 changes: 57 additions & 13 deletions src/bin/sm_scraper.rs
@@ -4,6 +4,12 @@ use chrono::Utc;
use uuid::Uuid;
use zero2prod::{configuration::get_configuration, db::local_db, services::scraper};

+macro_rules! unwrap_table_data {
+    ($table_data:expr, $key:expr) => {
+        $table_data.get($key).unwrap_or(&String::new()).to_string()
+    };
+}
+
#[tokio::main]
async fn main() {
let config =
@@ -20,7 +26,7 @@ async fn main() {
let links = scraper.parse_directory_html(&res);
println!("Found {} links", links.len());
sleep(Duration::from_secs(5));
-    for (i, link) in vec![links[0].clone()].iter().enumerate() {
+    for (i, link) in links.iter().enumerate() {
println!("Scraping link #{}: {}", i, link);
let item_html = scraper
.get_url(link.as_str())
@@ -31,30 +37,48 @@
let id = Uuid::new_v4();
let now = Utc::now();
let table_data = scraper.sm_item_listing_to_details(&item_html);
-        let region: String = table_data
-            .get("region")
-            .unwrap_or(&String::new())
-            .to_string();
-        let processing: String = table_data
-            .get("processing")
-            .unwrap_or(&String::new())
-            .to_string();
+        let region: String = unwrap_table_data!(table_data, "region");
+        let processing: String = unwrap_table_data!(table_data, "processing");
+        let drying: String = unwrap_table_data!(table_data, "drying");
+        let arrival: String = unwrap_table_data!(table_data, "arrival");
+        let lot_size: String = unwrap_table_data!(table_data, "lot_size");
+        let bag_size: String = unwrap_table_data!(table_data, "bag_size");
+        let packaging: String = unwrap_table_data!(table_data, "packaging");
+        let farm_gate: String = unwrap_table_data!(table_data, "farm_gate");
+        let cultivar_detail: String = unwrap_table_data!(table_data, "cultivar_detail");
+        let grade: String = unwrap_table_data!(table_data, "grade");
+        let appearance: String = unwrap_table_data!(table_data, "appearance");
+        let roast_rec: String = unwrap_table_data!(table_data, "roast_rec");
+        let coffee_type: String = unwrap_table_data!(table_data, "coffee_type");
+        let spro_rec: String = unwrap_table_data!(table_data, "spro_rec");

// Save the scraped item to the database
let mut stmt = conn
.prepare(INSERT_QUERY)
.await
.expect("Failed to prepare query.");
match stmt
-            .execute((
+            .execute(vec![
id.to_string(),
link.clone(),
item_text,
now.to_rfc3339(),
item_html,
region,
processing,
-            ))
+                drying,
+                arrival,
+                lot_size,
+                bag_size,
+                packaging,
+                farm_gate,
+                cultivar_detail,
+                grade,
+                appearance,
+                roast_rec,
+                coffee_type,
+                spro_rec,
+            ])
.await
{
Ok(_) => println!("Scraped and saved item #{}", i),
@@ -65,6 +89,26 @@ async fn main() {
}

const INSERT_QUERY: &str = r#"
-INSERT INTO sm_scrapes (id, url, content, scraped_at, original, region, processing)
-VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7);
+INSERT INTO sm_scrapes (
+    id,
+    url,
+    content,
+    scraped_at,
+    original,
+    region,
+    processing,
+    drying,
+    arrival,
+    lot_size,
+    bag_size,
+    packaging,
+    farm_gate,
+    cultivar_detail,
+    grade,
+    appearance,
+    roast_rec,
+    coffee_type,
+    spro_rec
+)
+VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19);
"#;
2 changes: 1 addition & 1 deletion src/models/sm_scrape.rs
@@ -99,7 +99,7 @@ pub async fn get_page(
.collect()
});

-    let mut new_row = SMScrapeRow::default();
+    let mut new_row = SMScrapeRow::new();
new_row.id = id;
new_row.url = url;
new_row.content = content;
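The one-line model change swaps `SMScrapeRow::default()` for `SMScrapeRow::new()`. The struct's definition is outside this diff, so the following is only a hypothetical illustration of the usual motive, with invented fields: a named constructor gives the type one place to grow initialization logic without touching call sites.

// Hypothetical: the real SMScrapeRow fields are not shown in this commit.
#[derive(Default)]
struct SMScrapeRow {
    id: String,
    url: String,
    content: String,
}

impl SMScrapeRow {
    // Delegates to Default for now; callers that use new() keep working
    // if construction later needs non-default values.
    fn new() -> Self {
        Self::default()
    }
}

fn main() {
    let mut new_row = SMScrapeRow::new();
    new_row.id = "example-id".into();
    new_row.url = "https://example.com".into();
    new_row.content = "scraped text".into();
    println!("{} {} {}", new_row.id, new_row.url, new_row.content);
}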
14 changes: 13 additions & 1 deletion src/services/scraper.rs
@@ -81,6 +81,18 @@ impl Scraper {
let table_selectors: HashMap<_, _> = [
("region", "td[data-th='Region']"),
("processing", "td[data-th='Processing']"),
("drying", "td[data-th='Drying Method']"),
("arrival", "td[data-th='Arrival date']"),
("lot_size", "td[data-th='Lot size']"),
("bag_size", "td[data-th='Bag size']"),
("packaging", "td[data-th='Packaging']"),
("farm_gate", "td[data-th='Farm Gate']"),
("cultivar_detail", "td[data-th='Cultivar Detail']"),
("grade", "td[data-th='Grade']"),
("appearance", "td[data-th='Appearance']"),
("roast_rec", "td[data-th='Roast Recommendations']"),
("coffee_type", "td[data-th='Type']"),
("spro_rec", "td[data-th='Recommended for Espresso']"),
]
.iter()
.cloned()
@@ -92,7 +104,7 @@
let selector = Selector::parse(selector).unwrap();
let mut result = String::new();
for element in document.select(&selector) {
-                result.push_str(&element.inner_html().trim());
+                result.push_str(element.inner_html().trim());
}
(String::from(*key), result)
})
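For context on this hunk: each new entry maps a detail-table key to a `td[data-th='…']` CSS selector, and the extraction loop concatenates the trimmed inner HTML of every match. (The `&` dropped from `push_str` was a needless borrow: `inner_html()` already returns an owned `String`, and `trim()` a `&str`.) A runnable sketch of the same pattern with the `scraper` crate, against illustrative HTML rather than a real scraped listing:

use scraper::{Html, Selector};
use std::collections::HashMap;

fn main() {
    // Illustrative markup shaped like the item-listing detail table.
    let html = r#"<table><tr>
        <td data-th="Region">Huila, Colombia</td>
        <td data-th="Processing">Washed</td>
    </tr></table>"#;
    let document = Html::parse_document(html);

    let table_selectors: HashMap<_, _> = [
        ("region", "td[data-th='Region']"),
        ("processing", "td[data-th='Processing']"),
    ]
    .iter()
    .cloned()
    .collect();

    let details: HashMap<String, String> = table_selectors
        .iter()
        .map(|(key, selector)| {
            let selector = Selector::parse(selector).unwrap();
            let mut result = String::new();
            // Concatenate every matching cell's trimmed inner HTML.
            for element in document.select(&selector) {
                result.push_str(element.inner_html().trim());
            }
            (String::from(*key), result)
        })
        .collect();

    assert_eq!(details["region"], "Huila, Colombia");
    assert_eq!(details["processing"], "Washed");
}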
