Skip to content

Commit

Permalink
Update HTML searcher for latest markup, and misc. fixes (#70)
Browse files Browse the repository at this point in the history
* Properly wait for individual ads to scrape when searching with no delay

* Use latest URL format for image upsizing

* Use latest result page markup for HTML searcher

* Only remove one property in each 'invalid scrape response' test case

* Increment package version
  • Loading branch information
mwpenny authored Sep 13, 2023
1 parent becfec9 commit 9fc35f7
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 281 deletions.
2 changes: 1 addition & 1 deletion lib/ad.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export class Ad extends AdInfo {
* (e.g., storing ad URLs entered by a user for delayed scraping).
*
* `Ad.isScraped()` returns `false` for `Ad` objects constructed in this
* way unless `scraped` is passed as `true`or they are subsequently scraped
* way unless `scraped` is passed as `true` or they are subsequently scraped
* by calling `Ad.scrape()`, which causes the scraper to replace the ad's
* information with what is found at its URL.
*
Expand Down
115 changes: 35 additions & 80 deletions lib/backends/html-searcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,100 +10,55 @@ import { Ad } from "../ad";
import { BANNED, HTML_REQUEST_HEADERS, POSSIBLE_BAD_MARKUP } from "../constants";
import { AdInfo } from "../scraper";
import { PageResults, ResolvedSearchParameters } from "../search";
import { getLargeImageURL } from "../helpers";

const KIJIJI_BASE_URL = "https://www.kijiji.ca";
const KIJIJI_SEARCH_URL = KIJIJI_BASE_URL + "/b-search.html";
const IMG_REGEX = /\/s\-l\d+\.jpg$/;
const LOCATION_REGEX = /(.+)(\/.*)$/;

/**
 * Converts the date text from a Kijiji ad result into a `Date` object.
 *
 * Recognized formats (matched case-insensitively, surrounding whitespace
 * ignored):
 *   - "dd/mm/yyyy"           -> local midnight on that calendar day
 *   - "< x minutes ago"      -> now minus x minutes, seconds zeroed
 *   - "< x hours ago"        -> now minus x hours, minutes/seconds zeroed
 *   - "yesterday"            -> local midnight of the previous day
 *
 * A four-word string with an unrecognized time unit yields the current
 * time unchanged.
 *
 * @param dateString Date text taken from a search result
 * @returns The parsed date, or an invalid `Date` (NaN time value) when the
 *          input is empty or matches none of the formats above
 */
function dateFromRelativeDateString(dateString: string): Date {
    if (!dateString) {
        return new Date(NaN);
    }

    // Treat "/" as a separator so that "dd/mm/yyyy" splits into three parts
    const normalized = dateString.trim().toLowerCase().replace(/\//g, " ");
    const parts = normalized.split(/\s+/);
    const d = new Date();

    if (parts.length === 3) {
        // dd/mm/yyyy format
        const [day = "", month = "", year = ""] = parts;

        // Set year, month and day in ONE call. Setting them one at a time
        // on top of today's date lets an intermediate value roll over
        // (e.g., day 31 while the current month has 30 days, or Feb 29
        // while the current year is not a leap year), producing a wrong
        // result that depends on when the code happens to run.
        d.setFullYear(parseInt(year, 10), parseInt(month, 10) - 1, parseInt(day, 10));
        d.setHours(0, 0, 0, 0);
        return d;
    }

    if (parts.length === 4) {
        // "< x hours/minutes ago" format
        const num = parseInt(parts[1] ?? "", 10);
        const timeUnit = parts[2];

        if (timeUnit === "minutes" || timeUnit === "minute") {
            d.setMinutes(d.getMinutes() - num);
            d.setSeconds(0, 0);
        } else if (timeUnit === "hours" || timeUnit === "hour") {
            d.setHours(d.getHours() - num, 0, 0, 0);
        }
        return d;
    }

    if (normalized === "yesterday") {
        d.setDate(d.getDate() - 1);
        d.setHours(0, 0, 0, 0);
        return d;
    }

    return new Date(NaN);
}

/* Extracts ad information from the HTML of a Kijiji ad results page */
function parseResultsHTML(html: string): Ad[] {
const adResults: Ad[] = [];
const $ = cheerio.load(html);

// Get info for each ad
const allAdElements = $(".regular-ad");
const filteredAdElements = allAdElements.not(".third-party");
if (html.trim().length === 0) {
return adResults;
}

filteredAdElements.each((_i, item) => {
const path = $(item).find("a.title").attr("href");
const url = KIJIJI_BASE_URL + path;
const info: Partial<AdInfo> = {
id: $(item).data("listing-id")?.toString() || "",

title: $(item).find("a.title").text().trim(),

image: (
// `data-src` contains the URL of the image to lazy load
//
// `src` starts off with a placeholder image and will
// remain if the ad has no image
$(item).find(".image img").data("src") || $(item).find(".image img").attr("src") || ""
).replace(IMG_REGEX, "/s-l2000.jpg"),

date: dateFromRelativeDateString(
// For some reason, some categories (like anything under
// SERVICES) use different markup than usual
//
// The string split is needed to handle:
//
// <td class="posted">
// Some date
// <br>
// Some location
// </td>
//
// AKA "Some date\nSome location"
($(item).find(".date-posted").text() || $(item).find(".posted").text()).trim().split("\n")[0]
),

// Pick a format, Kijiji
description: ($(item).find(".description > p").text() || $(item).find(".description").text()).trim()
};
// Kijiji is nice and gives us an object containing ad info
const resultJson = $("script#__NEXT_DATA__").text().trim();
if (!resultJson) {
throw new Error(`Kijiji result JSON not present. ${POSSIBLE_BAD_MARKUP}`);
}

if (!path) {
throw new Error(`Result ad has no URL. ${POSSIBLE_BAD_MARKUP}`);
const allAds: any[] | undefined = JSON.parse(resultJson)
.props
?.pageProps
?.listings;
if (allAds === undefined) {
throw new Error(`Result JSON could not be parsed. ${POSSIBLE_BAD_MARKUP}`);
}

// All non-sponsored ads
const filteredAds = allAds.filter(ad => ad.adSource === "ORGANIC");

for (const ad of filteredAds) {
if (!ad.seoUrl || !ad.id || !ad.title || !ad.activationDate) {
throw new Error(`Result ad could not be parsed. ${POSSIBLE_BAD_MARKUP}`);
}

const url = KIJIJI_BASE_URL + ad.seoUrl;
const info: Partial<AdInfo> = {
id: ad.id,
title: ad.title.trim(),
image: getLargeImageURL((ad.imageUrls || [])[0] || ""),
date: new Date(ad.activationDate),
description: (ad.description || "").trim()
};

adResults.push(new Ad(url, info));
});
}

return adResults;
}

Expand Down Expand Up @@ -154,7 +109,7 @@ export class HTMLSearcher {
})
.then(body => ({
pageResults: parseResultsHTML(body),
isLastPage: body.indexOf('"isLastPage":true') !== -1
isLastPage: body.indexOf("pagination-next-link") === -1
}));
}
}
4 changes: 2 additions & 2 deletions lib/backends/test/api-scraper.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ describe("Ad API scraper", () => {
it.each`
test | xml
${"Bad markup"} | ${"Bad markup"}
${"Missing id"} | ${createAdXML({})}
${"Missing title"} | ${createAdXML({ id: "123" })}
${"Missing ID"} | ${createAdXML({ title: "My ad title", date: new Date() })}
${"Missing title"} | ${createAdXML({ id: "123", date: new Date() })}
${"Missing date"} | ${createAdXML({ id: "123", title: "My ad title" })}
`("should fail to scrape invalid XML ($test)", async ({ xml }) => {
mockResponse(xml);
Expand Down
10 changes: 5 additions & 5 deletions lib/backends/test/api-searcher.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ describe("Search result API scraper", () => {
});

type MockAdInfo = {
url?: string;
id?: string;
title?: string;
date?: Date;
url: string;
id: string;
title: string;
date: Date;
};

const createAdXML = (info: MockAdInfo) => {
const createAdXML = (info: Partial<MockAdInfo>) => {
return `
<ad:ad ${info.id ? `id="${info.id}"` : ""}>
${info.url ? `<ad:link rel="self-public-website" href="${info.url}"></ad:link>` : ""}
Expand Down
4 changes: 2 additions & 2 deletions lib/backends/test/html-scraper.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ describe("Ad HTML scraper", () => {
${"Missing config property"} | ${createAdHTML({ abc: 123 })}
${"Missing adInfo property"} | ${createAdHTML({ config: {} })}
${"Missing VIP property"} | ${createAdHTML({ config: { adInfo: {} } })}
${"Missing ID"} | ${createAdHTML({ config: { adInfo: {}, VIP: {} } })}
${"Missing title"} | ${createAdHTML({ config: { adInfo: {}, VIP: { adId: 1234 } } })}
${"Missing ID"} | ${createAdHTML({ config: { adInfo: { title: "Test" }, VIP: { sortingDate: 0 } } })}
${"Missing title"} | ${createAdHTML({ config: { adInfo: {}, VIP: { adId: 1234, sortingDate: 0 } } })}
${"Missing date"} | ${createAdHTML({ config: { adInfo: { title: "Test" }, VIP: { adId: 1234 } } })}
`("should fail to scrape invalid HTML ($test)", async ({ html }) => {
mockResponse(html);
Expand Down
Loading

0 comments on commit 9fc35f7

Please sign in to comment.