From 494f10eaf58ba97cbab4c290e716d697b633ca74 Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Tue, 16 Jul 2024 15:56:03 +0200 Subject: [PATCH 1/7] initial Redis setup --- wee/package-lock.json | 186 +++++++++++++++++++++++++- wee/package.json | 4 + wee/webscraper/src/config.ts | 6 + wee/webscraper/src/scraper.module.ts | 25 ++++ wee/webscraper/src/scraper.service.ts | 26 +++- 5 files changed, 241 insertions(+), 6 deletions(-) create mode 100644 wee/webscraper/src/config.ts diff --git a/wee/package-lock.json b/wee/package-lock.json index a339113c9..63ef94ee4 100644 --- a/wee/package-lock.json +++ b/wee/package-lock.json @@ -9,7 +9,9 @@ "version": "0.0.0", "license": "MIT", "dependencies": { + "@nestjs/cache-manager": "^2.2.2", "@nestjs/common": "^10.0.2", + "@nestjs/config": "^3.2.3", "@nestjs/core": "^10.0.2", "@nestjs/platform-express": "^10.0.2", "@nestjs/swagger": "^7.3.1", @@ -30,6 +32,8 @@ "apexcharts": "^3.49.1", "axios": "^1.7.2", "blob-stream": "^0.1.3", + "cache-manager": "^5.7.3", + "cache-manager-redis-yet": "^5.1.3", "cheerio": "^1.0.0-rc.12", "clsx": "^2.1.1", "file-saver": "^2.0.5", @@ -3983,6 +3987,17 @@ "integrity": "sha512-9b8mPpKrfeGRuhFH5iO1iwCLeIIsV6+H1sRfxbkoGXIyQE2BTsPd9zqSqQJ+pv5sJ/hT5M1zvOFL02MnEezFug==", "license": "MIT" }, + "node_modules/@nestjs/cache-manager": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/@nestjs/cache-manager/-/cache-manager-2.2.2.tgz", + "integrity": "sha512-+n7rpU1QABeW2WV17Dl1vZCG3vWjJU1MaamWgZvbGxYE9EeCM0lVLfw3z7acgDTNwOy+K68xuQPoIMxD0bhjlA==", + "peerDependencies": { + "@nestjs/common": "^9.0.0 || ^10.0.0", + "@nestjs/core": "^9.0.0 || ^10.0.0", + "cache-manager": "<=5", + "rxjs": "^7.0.0" + } + }, "node_modules/@nestjs/common": { "version": "10.3.8", "resolved": "https://registry.npmjs.org/@nestjs/common/-/common-10.3.8.tgz", @@ -4012,6 +4027,31 @@ } } }, + "node_modules/@nestjs/config": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/@nestjs/config/-/config-3.2.3.tgz", + "integrity": "sha512-p6yv/CvoBewJ72mBq4NXgOAi2rSQNWx3a+IMJLVKS2uiwFCOQQuiIatGwq6MRjXV3Jr+B41iUO8FIf4xBrZ4/w==", + "dependencies": { + "dotenv": "16.4.5", + "dotenv-expand": "10.0.0", + "lodash": "4.17.21" + }, + "peerDependencies": { + "@nestjs/common": "^8.0.0 || ^9.0.0 || ^10.0.0", + "rxjs": "^7.1.0" + } + }, + "node_modules/@nestjs/config/node_modules/dotenv": { + "version": "16.4.5", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", + "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/@nestjs/core": { "version": "10.3.8", "resolved": "https://registry.npmjs.org/@nestjs/core/-/core-10.3.8.tgz", @@ -11456,6 +11496,64 @@ "react": "^16.8.0 || ^17.0.0-rc.1 || ^18.0.0" } }, + "node_modules/@redis/bloom": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@redis/bloom/-/bloom-1.2.0.tgz", + "integrity": "sha512-HG2DFjYKbpNmVXsa0keLHp/3leGJz1mjh09f2RLGGLQZzSHpkmZWuwJbAvo3QcRY8p80m5+ZdXZdYOSBLlp7Cg==", + "peerDependencies": { + "@redis/client": "^1.0.0" + } + }, + "node_modules/@redis/client": { + "version": "1.5.17", + "resolved": "https://registry.npmjs.org/@redis/client/-/client-1.5.17.tgz", + "integrity": "sha512-IPvU9A31qRCZ7lds/x+ksuK/UMndd0EASveAvCvEtFFKIZjZ+m/a4a0L7S28KEWoR5ka8526hlSghDo4Hrc2Hg==", + "dependencies": { + "cluster-key-slot": "1.1.2", + "generic-pool": "3.9.0", + "yallist": "4.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/@redis/client/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==" + }, + "node_modules/@redis/graph": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@redis/graph/-/graph-1.1.1.tgz", + "integrity": "sha512-FEMTcTHZozZciLRl6GiiIB4zGm5z5F3F6a6FZCyrfxdKOhFlGkiAqlexWMBzCi4DcRoyiOsuLfW+cjlGWyExOw==", + "peerDependencies": { + "@redis/client": "^1.0.0" + } + }, + "node_modules/@redis/json": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@redis/json/-/json-1.0.6.tgz", + "integrity": "sha512-rcZO3bfQbm2zPRpqo82XbW8zg4G/w4W3tI7X8Mqleq9goQjAGLL7q/1n1ZX4dXEAmORVZ4s1+uKLaUOg7LrUhw==", + "peerDependencies": { + "@redis/client": "^1.0.0" + } + }, + "node_modules/@redis/search": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/@redis/search/-/search-1.1.6.tgz", + "integrity": "sha512-mZXCxbTYKBQ3M2lZnEddwEAks0Kc7nauire8q20oA0oA/LoA+E/b5Y5KZn232ztPb1FkIGqo12vh3Lf+Vw5iTw==", + "peerDependencies": { + "@redis/client": "^1.0.0" + } + }, + "node_modules/@redis/time-series": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@redis/time-series/-/time-series-1.0.5.tgz", + "integrity": "sha512-IFjIgTusQym2B5IZJG3XKr5llka7ey84fw/NOYqESP5WUfQs9zz1ww/9+qoz4ka/S6KcGBodzlCeZ5UImKbscg==", + "peerDependencies": { + "@redis/client": "^1.0.0" + } + }, "node_modules/@rushstack/eslint-patch": { "version": "1.10.3", "resolved": "https://registry.npmjs.org/@rushstack/eslint-patch/-/eslint-patch-1.10.3.tgz", @@ -15012,6 +15110,48 @@ "node": ">= 0.8" } }, + "node_modules/cache-manager": { + "version": "5.7.3", + "resolved": "https://registry.npmjs.org/cache-manager/-/cache-manager-5.7.3.tgz", + "integrity": "sha512-Vp2gd2aDm/MXdEWD0FLdOflvcVj4rdJ1FFmPUeOKq+fuL7MEUcezbTWxQmVB1TTN5Ig92CabMfi5z+HyQwVg9A==", + "dependencies": { + "eventemitter3": "^5.0.1", + "lodash.clonedeep": "^4.5.0", + "lru-cache": "^10.2.2", + "promise-coalesce": "^1.1.2" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/cache-manager-redis-yet": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/cache-manager-redis-yet/-/cache-manager-redis-yet-5.1.3.tgz", + "integrity": "sha512-V/IcEBqNQkwlPz/p4AsJLCBFmv3KVasQPSuQZx4ykPCbBm3ybIDoqf6dXuOKaBwIk7CQwoNxTR8HJ5Bssv1iYg==", + "dependencies": { + "@redis/bloom": "^1.2.0", + "@redis/client": "^1.5.17", + "@redis/graph": "^1.1.1", + "@redis/json": "^1.0.6", + "@redis/search": "^1.1.6", + "@redis/time-series": "^1.0.5", + "cache-manager": "^5.7.2", + "redis": "^4.6.15" + }, + "engines": { + "node": ">= 18" + } + }, + "node_modules/cache-manager/node_modules/eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==" + }, + "node_modules/cache-manager/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==" + }, "node_modules/cachedir": { "version": "2.4.0", "resolved": "https://registry.npmjs.org/cachedir/-/cachedir-2.4.0.tgz", @@ -15488,6 +15628,14 @@ "node": ">=6" } }, + "node_modules/cluster-key-slot": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/cluster-key-slot/-/cluster-key-slot-1.1.2.tgz", + "integrity": "sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -17282,7 +17430,6 @@ "version": "10.0.0", "resolved": "https://registry.npmjs.org/dotenv-expand/-/dotenv-expand-10.0.0.tgz", "integrity": "sha512-GopVGCpVS1UKH75VKHGuQFqS1Gusej0z4FyQkPdwjil2gNIv+LNsqBlboOzpJFZKVT95GkCyWJbBSdFEFUWI2A==", - "dev": true, "license": "BSD-2-Clause", "engines": { "node": ">=12" @@ -20006,6 +20153,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/generic-pool": { + "version": "3.9.0", + "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz", + "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==", + "engines": { + "node": ">= 4" + } + }, "node_modules/gensync": { "version": "1.0.0-beta.2", "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", @@ -25026,6 +25181,11 @@ "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", "license": "MIT" }, + "node_modules/lodash.clonedeep": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz", + "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ==" + }, "node_modules/lodash.debounce": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", @@ -30559,6 +30719,14 @@ "node": ">=0.4.0" } }, + "node_modules/promise-coalesce": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/promise-coalesce/-/promise-coalesce-1.1.2.tgz", + "integrity": "sha512-zLaJ9b8hnC564fnJH6NFSOGZYYdzrAJn2JUUIwzoQb32fG2QAakpDNM+CZo1km6keXkRXRM+hml1BFAPVnPkxg==", + "engines": { + "node": ">=16" + } + }, "node_modules/prompts": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", @@ -31108,6 +31276,22 @@ "node": ">=8" } }, + "node_modules/redis": { + "version": "4.6.15", + "resolved": "https://registry.npmjs.org/redis/-/redis-4.6.15.tgz", + "integrity": "sha512-2NtuOpMW3tnYzBw6S8mbXSX7RPzvVFCA2wFJq9oErushO2UeBkxObk+uvo7gv7n0rhWeOj/IzrHO8TjcFlRSOg==", + "workspaces": [ + "./packages/*" + ], + "dependencies": { + "@redis/bloom": "1.2.0", + "@redis/client": "1.5.17", + "@redis/graph": "1.1.1", + "@redis/json": "1.0.6", + "@redis/search": "1.1.6", + "@redis/time-series": "1.0.5" + } + }, "node_modules/reflect-metadata": { "version": "0.1.14", "resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.1.14.tgz", diff --git a/wee/package.json b/wee/package.json index c9c14dd78..83c0fb0f4 100644 --- a/wee/package.json +++ b/wee/package.json @@ -10,7 +10,9 @@ }, "private": true, "dependencies": { + "@nestjs/cache-manager": "^2.2.2", "@nestjs/common": "^10.0.2", + "@nestjs/config": "^3.2.3", "@nestjs/core": "^10.0.2", "@nestjs/platform-express": "^10.0.2", "@nestjs/swagger": "^7.3.1", @@ -31,6 +33,8 @@ "apexcharts": "^3.49.1", "axios": "^1.7.2", "blob-stream": "^0.1.3", + "cache-manager": "^5.7.3", + "cache-manager-redis-yet": "^5.1.3", "cheerio": "^1.0.0-rc.12", "clsx": "^2.1.1", "file-saver": "^2.0.5", diff --git a/wee/webscraper/src/config.ts b/wee/webscraper/src/config.ts new file mode 100644 index 000000000..cf4a8779a --- /dev/null +++ b/wee/webscraper/src/config.ts @@ -0,0 +1,6 @@ +export default () => ({ + redis: { + host: process.env.REDIS_HOST, + port: process.env.REDIS_PORT + }, +}); \ No newline at end of file diff --git a/wee/webscraper/src/scraper.module.ts b/wee/webscraper/src/scraper.module.ts index f8efbf339..0bebf8cf8 100644 --- a/wee/webscraper/src/scraper.module.ts +++ b/wee/webscraper/src/scraper.module.ts @@ -2,6 +2,10 @@ import { Module } from '@nestjs/common'; import { ScraperService } from './scraper.service'; import { ScraperController } from './scraper.controller'; +import { CacheModule } from '@nestjs/cache-manager'; +import { redisStore } from 'cache-manager-redis-yet'; +import { ConfigModule, ConfigService } from '@nestjs/config'; +import config from './config'; // Services import { RobotsService } from './robots/robots.service'; @@ -15,6 +19,27 @@ import { ScrapeAddressService } from './scrape-address/scrape-address.service'; import { ScreenshotService } from './screenshot-homepage/screenshot.service'; import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; @Module({ + imports: [ + ConfigModule.forRoot({ + load: [config], + isGlobal: true, + }), + CacheModule.registerAsync({ + isGlobal: true, + imports: [ConfigModule], + useFactory: async (config) => { + const store = await redisStore({ + ttl: 300 * 1000, // 5 minutes in cache + socket: { + host: config.get('redis.host'), + port: config.get('redis.port') + } + }) + return { store } + }, + inject: [ConfigService] + }) + ], controllers: [ScraperController], providers: [ ScraperService, diff --git a/wee/webscraper/src/scraper.service.ts b/wee/webscraper/src/scraper.service.ts index 9082f2251..904ace520 100644 --- a/wee/webscraper/src/scraper.service.ts +++ b/wee/webscraper/src/scraper.service.ts @@ -1,4 +1,5 @@ -import { Injectable } from '@nestjs/common'; +import { Inject, Injectable } from '@nestjs/common'; +import { Cache } from 'cache-manager'; // Services import { RobotsService } from './robots/robots.service'; @@ -12,7 +13,6 @@ import { ScrapeContactInfoService } from './scrape-contact-info/scrape-contact-i import { ScrapeAddressService } from './scrape-address/scrape-address.service'; import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; - // Models import { ErrorResponse, @@ -24,6 +24,7 @@ import { @Injectable() export class ScraperService { constructor( + @Inject('CACHE_MANAGER') private cacheManager: Cache, private readonly robotsService: RobotsService, private readonly metadataService: ScrapeMetadataService, private readonly scrapeStatusService: ScrapeStatusService, @@ -33,13 +34,26 @@ export class ScraperService { private readonly screenshotService: ScreenshotService, private readonly scrapeContactInfoService: ScrapeContactInfoService, private readonly scrapeAddressService: ScrapeAddressService, - private readonly seoAnalysisService: SeoAnalysisService - + private readonly seoAnalysisService: SeoAnalysisService, ) {} async scrape(url: string) { const start = performance.now(); + const cachedData:string = await this.cacheManager.get(url); + if (cachedData) { + const end = performance.now(); + const times = (end - start) / 1000; + console.log('CACHE HIT', times); + const dataFromCache = JSON.parse(cachedData); + + // update the time field of the object being returned from cache + dataFromCache.time = parseFloat(times.toFixed(3)); + return dataFromCache; + } + + console.log('CACHE MISS - SCRAPE'); + // eslint-disable-next-line @typescript-eslint/no-explicit-any const data = { url: '', @@ -169,8 +183,10 @@ export class ScraperService { const end = performance.now(); const time = (end - start) / 1000; - data.time = parseFloat(time.toFixed(2)); + data.time = parseFloat(time.toFixed(3)); + // set the data in the cache + await this.cacheManager.set(url, JSON.stringify(data)); return data; } From 6e9afa2dd91ba6e586766bb2586a0cfe388375a5 Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Wed, 17 Jul 2024 10:48:01 +0200 Subject: [PATCH 2/7] updated dockerfile to support redis --- wee/docker-compose.yaml | 10 ++++++++++ wee/webscraper/src/config.ts | 3 ++- wee/webscraper/src/scraper.module.ts | 3 ++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/wee/docker-compose.yaml b/wee/docker-compose.yaml index 8e8b6a7c5..c6b588a2b 100644 --- a/wee/docker-compose.yaml +++ b/wee/docker-compose.yaml @@ -1,6 +1,14 @@ version: '3.8' services: + redis-cache: + image: redis + ports: + - "6379:6379" + env_file: + - webscraper/.env + command: sh -c 'redis-server --requirepass "$$REDIS_PASSWORD"' + webscraper: build: context: . @@ -15,6 +23,8 @@ services: command: > sh -c "npx nx serve webscraper --skip-nx-cache --configuration=production --verbose" user: webscraper + depends_on: + - redis-cache frontend: build: diff --git a/wee/webscraper/src/config.ts b/wee/webscraper/src/config.ts index cf4a8779a..30fd0d2ba 100644 --- a/wee/webscraper/src/config.ts +++ b/wee/webscraper/src/config.ts @@ -1,6 +1,7 @@ export default () => ({ redis: { host: process.env.REDIS_HOST, - port: process.env.REDIS_PORT + port: process.env.REDIS_PORT, + password: process.env.REDIS_PASSWORD }, }); \ No newline at end of file diff --git a/wee/webscraper/src/scraper.module.ts b/wee/webscraper/src/scraper.module.ts index 0bebf8cf8..3de5453e9 100644 --- a/wee/webscraper/src/scraper.module.ts +++ b/wee/webscraper/src/scraper.module.ts @@ -33,7 +33,8 @@ import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; socket: { host: config.get('redis.host'), port: config.get('redis.port') - } + }, + password: config.get('redis.password'), }) return { store } }, From 59c8d82b93e7ba7edf974b375d5bba05f0b8d6a5 Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Wed, 17 Jul 2024 11:48:43 +0200 Subject: [PATCH 3/7] changed ttl to 1 hour --- wee/webscraper/src/scraper.module.ts | 2 +- wee/webscraper/src/scraper.service.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wee/webscraper/src/scraper.module.ts b/wee/webscraper/src/scraper.module.ts index 3de5453e9..420b9b582 100644 --- a/wee/webscraper/src/scraper.module.ts +++ b/wee/webscraper/src/scraper.module.ts @@ -29,7 +29,7 @@ import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; imports: [ConfigModule], useFactory: async (config) => { const store = await redisStore({ - ttl: 300 * 1000, // 5 minutes in cache + ttl: 60 * 60 * 1000, // 60 minutes in cache socket: { host: config.get('redis.host'), port: config.get('redis.port') diff --git a/wee/webscraper/src/scraper.service.ts b/wee/webscraper/src/scraper.service.ts index 904ace520..85794c47d 100644 --- a/wee/webscraper/src/scraper.service.ts +++ b/wee/webscraper/src/scraper.service.ts @@ -48,7 +48,7 @@ export class ScraperService { const dataFromCache = JSON.parse(cachedData); // update the time field of the object being returned from cache - dataFromCache.time = parseFloat(times.toFixed(3)); + dataFromCache.time = parseFloat(times.toFixed(4)); return dataFromCache; } @@ -183,7 +183,7 @@ export class ScraperService { const end = performance.now(); const time = (end - start) / 1000; - data.time = parseFloat(time.toFixed(3)); + data.time = parseFloat(time.toFixed(4)); // set the data in the cache await this.cacheManager.set(url, JSON.stringify(data)); From dad7fe4b6b244cd41b28a6f3cae1e8bfa2598005 Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Thu, 18 Jul 2024 08:58:44 +0200 Subject: [PATCH 4/7] initial testing for the scraper service --- wee/webscraper/src/scraper.service.spec.ts | 78 ++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 wee/webscraper/src/scraper.service.spec.ts diff --git a/wee/webscraper/src/scraper.service.spec.ts b/wee/webscraper/src/scraper.service.spec.ts new file mode 100644 index 000000000..50b801b8f --- /dev/null +++ b/wee/webscraper/src/scraper.service.spec.ts @@ -0,0 +1,78 @@ +import { Test, TestingModule } from "@nestjs/testing"; +import { ScraperService } from "./scraper.service"; +import { Cache } from 'cache-manager'; +import { RobotsService } from './robots/robots.service'; +import { ScrapeMetadataService } from './scrape-metadata/scrape-metadata.service'; +import { ScrapeStatusService } from './scrape-status/scrape-status.service'; +import { IndustryClassificationService } from './industry-classification/industry-classification.service'; +import { ScrapeLogoService } from './scrape-logo/scrape-logo.service'; +import { ScrapeImagesService } from './scrape-images/scrape-images.service'; +import { ScreenshotService } from './screenshot-homepage/screenshot.service'; +import { ScrapeContactInfoService } from './scrape-contact-info/scrape-contact-info.service'; +import { ScrapeAddressService } from './scrape-address/scrape-address.service'; +import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; + +describe('ScraperService', () => { + let service: ScraperService; + + beforeEach(async () => { + const module: TestingModule = await Test.createTestingModule({ + providers: [ + ScraperService, + { + provide: 'CACHE_MANAGER', + useValue: { + get: jest.fn(), + set: jest.fn(), + } as Partial, + }, + { + provide: RobotsService, + useValue: {}, + }, + { + provide: ScrapeMetadataService, + useValue: {}, + }, + { + provide: ScrapeStatusService, + useValue: {}, + }, + { + provide: IndustryClassificationService, + useValue: {}, + }, + { + provide: ScrapeLogoService, + useValue: {}, + }, + { + provide: ScrapeImagesService, + useValue: {}, + }, + { + provide: ScreenshotService, + useValue: {}, + }, + { + provide: ScrapeContactInfoService, + useValue: {}, + }, + { + provide: ScrapeAddressService, + useValue: {}, + }, + { + provide: SeoAnalysisService, + useValue: {}, + }, + ] + }).compile(); + + service = module.get(ScraperService); + }); + + it('should be defined', () => { + expect(service).toBeDefined(); + }); +}); \ No newline at end of file From e5862367c39bdb0e733a6784c8b3692533714c7d Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Thu, 18 Jul 2024 09:22:53 +0200 Subject: [PATCH 5/7] cache hit test --- wee/webscraper/src/scraper.service.spec.ts | 50 +++++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/wee/webscraper/src/scraper.service.spec.ts b/wee/webscraper/src/scraper.service.spec.ts index 50b801b8f..c9a52ced7 100644 --- a/wee/webscraper/src/scraper.service.spec.ts +++ b/wee/webscraper/src/scraper.service.spec.ts @@ -11,9 +11,16 @@ import { ScreenshotService } from './screenshot-homepage/screenshot.service'; import { ScrapeContactInfoService } from './scrape-contact-info/scrape-contact-info.service'; import { ScrapeAddressService } from './scrape-address/scrape-address.service'; import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; +import { + ErrorResponse, + RobotsResponse, + Metadata, + IndustryClassification, + } from './models/ServiceModels'; describe('ScraperService', () => { let service: ScraperService; + let cacheManager: Cache; beforeEach(async () => { const module: TestingModule = await Test.createTestingModule({ @@ -28,7 +35,9 @@ describe('ScraperService', () => { }, { provide: RobotsService, - useValue: {}, + useValue: { + readRobotsFile: jest.fn(), + }, }, { provide: ScrapeMetadataService, @@ -36,7 +45,9 @@ describe('ScraperService', () => { }, { provide: ScrapeStatusService, - useValue: {}, + useValue: { + scrapeStatus: jest.fn(), + }, }, { provide: IndustryClassificationService, @@ -70,9 +81,44 @@ describe('ScraperService', () => { }).compile(); service = module.get(ScraperService); + cacheManager = module.get('CACHE_MANAGER'); }); it('should be defined', () => { expect(service).toBeDefined(); }); + + it('should return cached data on cache hit', async () => { + const url = 'http://example.com'; + const cachedData = { + url, + time: 0, + domainStatus: '', + robots: null, + metadata: null, + industryClassification: null, + logo: '', + images: [], + slogan: '', + contactInfo: { emails: [], phones: [] }, + addresses: [], + screenshot: '', + seoAnalysis: null, + }; + + jest.spyOn(cacheManager, 'get').mockResolvedValue(JSON.stringify(cachedData)); + const result = await service.scrape(url); + + expect(cacheManager.get).toHaveBeenCalledWith(url); + expect(result.url).toBe(url); + + expect(result.time).toBeCloseTo(cachedData.time, 2); + + expect(result).toEqual(expect.objectContaining({ + ...cachedData, + time: result.time + })); + }); + + }); \ No newline at end of file From 5c46a5975e90a3308f4d72210b0feacb1f0d18a6 Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Thu, 18 Jul 2024 11:13:27 +0200 Subject: [PATCH 6/7] cache miss test --- wee/webscraper/src/scraper.service.spec.ts | 145 +++++++++++++++++++-- 1 file changed, 131 insertions(+), 14 deletions(-) diff --git a/wee/webscraper/src/scraper.service.spec.ts b/wee/webscraper/src/scraper.service.spec.ts index c9a52ced7..ff89a7bb4 100644 --- a/wee/webscraper/src/scraper.service.spec.ts +++ b/wee/webscraper/src/scraper.service.spec.ts @@ -11,16 +11,20 @@ import { ScreenshotService } from './screenshot-homepage/screenshot.service'; import { ScrapeContactInfoService } from './scrape-contact-info/scrape-contact-info.service'; import { ScrapeAddressService } from './scrape-address/scrape-address.service'; import { SeoAnalysisService } from './seo-analysis/seo-analysis.service'; -import { - ErrorResponse, - RobotsResponse, - Metadata, - IndustryClassification, - } from './models/ServiceModels'; +import { RobotsResponse, Metadata, IndustryClassification } from './models/ServiceModels'; describe('ScraperService', () => { let service: ScraperService; let cacheManager: Cache; + let robotsService: RobotsService; + let metadataService: ScrapeMetadataService; + let industryClassificationService: IndustryClassificationService; + let scrapeLogoService: ScrapeLogoService; + let imagesService: ScrapeImagesService; + let contactInfoService: ScrapeContactInfoService; + let addressService: ScrapeAddressService; + let screenshotService: ScreenshotService; + let seoAnalysisService: SeoAnalysisService; beforeEach(async () => { const module: TestingModule = await Test.createTestingModule({ @@ -41,7 +45,9 @@ describe('ScraperService', () => { }, { provide: ScrapeMetadataService, - useValue: {}, + useValue: { + scrapeMetadata: jest.fn() + }, }, { provide: ScrapeStatusService, @@ -51,37 +57,60 @@ describe('ScraperService', () => { }, { provide: IndustryClassificationService, - useValue: {}, + useValue: { + classifyIndustry: jest.fn(), + }, }, { provide: ScrapeLogoService, - useValue: {}, + useValue: { + scrapeLogo: jest.fn(), + }, }, { provide: ScrapeImagesService, - useValue: {}, + useValue: { + scrapeImages: jest.fn(), + }, }, { provide: ScreenshotService, - useValue: {}, + useValue: { + captureScreenshot: jest.fn(), + }, }, { provide: ScrapeContactInfoService, - useValue: {}, + useValue: { + scrapeContactInfo: jest.fn(), + }, }, { provide: ScrapeAddressService, - useValue: {}, + useValue: { + scrapeAddress: jest.fn(), + }, }, { provide: SeoAnalysisService, - useValue: {}, + useValue: { + seoAnalysis: jest.fn(), + }, }, ] }).compile(); service = module.get(ScraperService); cacheManager = module.get('CACHE_MANAGER'); + robotsService = module.get(RobotsService); + metadataService = module.get(ScrapeMetadataService); + industryClassificationService = module.get(IndustryClassificationService); + scrapeLogoService = module.get(ScrapeLogoService); + imagesService = module.get(ScrapeImagesService); + contactInfoService = module.get(ScrapeContactInfoService); + addressService = module.get(ScrapeAddressService); + screenshotService = module.get(ScreenshotService); + seoAnalysisService = module.get(SeoAnalysisService); }); it('should be defined', () => { @@ -120,5 +149,93 @@ describe('ScraperService', () => { })); }); + it('should scrape and cache data on cache miss', async () => { + const url = 'http://example.com'; + + jest.spyOn(cacheManager, 'get').mockResolvedValue(null); + + const cacheSetSpy = jest.spyOn(cacheManager, 'set').mockResolvedValue(undefined); + + jest.spyOn(robotsService, 'readRobotsFile').mockResolvedValue({ + baseUrl: url, + isUrlScrapable: true, + isBaseUrlAllowed: true, + allowedPaths: [], + disallowedPaths: [] + } as RobotsResponse); + + jest.spyOn(metadataService, 'scrapeMetadata').mockResolvedValue({ + title: 'Example Title', + description: 'Example Description', + keywords: 'example, keywords' + } as Metadata); + + jest.spyOn(industryClassificationService, 'classifyIndustry').mockResolvedValue({ + metadataClass: { + label: "E-commerce", + score: 95, + }, + domainClass: { + label: 'Unknown', + score: 0, + } + } as IndustryClassification); + + jest.spyOn(scrapeLogoService, 'scrapeLogo').mockResolvedValue('http://example.com/logo.png'); + jest.spyOn(imagesService, 'scrapeImages').mockResolvedValue(['image1.png', 'image2.svg']); + jest.spyOn(contactInfoService, 'scrapeContactInfo').mockResolvedValue({ + emails: ['email@gmail.com'], + phones: ['01234567892'], + socialLinks: [] + }) + jest.spyOn(addressService, 'scrapeAddress').mockResolvedValue({addresses: ['address 1', 'address 2']}); + jest.spyOn(screenshotService, 'captureScreenshot').mockResolvedValue({screenshot: 'screenshot.png'}); + jest.spyOn(seoAnalysisService, 'seoAnalysis').mockResolvedValue(null); + + const result = await service.scrape(url); + + expect(cacheManager.get).toHaveBeenCalledWith(url); + expect(result.url).toBe(url); + expect(result).toEqual(expect.objectContaining({ + url, + time: expect.any(Number), + domainStatus: undefined, + robots: { + baseUrl: url, + isUrlScrapable: true, + isBaseUrlAllowed: true, + allowedPaths: [], + disallowedPaths: [] + }, + metadata: { + title: 'Example Title', + description: 'Example Description', + keywords: 'example, keywords' + }, + industryClassification: { + metadataClass: { + label: "E-commerce", + score: 95, + }, + domainClass: { + label: 'Unknown', + score: 0, + } + }, + logo: 'http://example.com/logo.png', + images: ['image1.png', 'image2.svg'], + slogan: '', + contactInfo: { + emails: ['email@gmail.com'], + phones: ['01234567892'], + socialLinks: [] + }, + addresses: ['address 1', 'address 2'], + screenshot: 'screenshot.png', + seoAnalysis: null, + })); + + expect(cacheSetSpy).toHaveBeenCalledWith(url, JSON.stringify(result)); + }); }); \ No newline at end of file From a0653f5cba557dac3560d779574e7e3cfc9e531a Mon Sep 17 00:00:00 2001 From: MignonErasmus Date: Thu, 18 Jul 2024 23:32:56 +0200 Subject: [PATCH 7/7] got the integration tests to work - mocking redis - hopefully --- wee/package-lock.json | 71 ++++++++++++++++++++++++++ wee/package.json | 1 + wee/webscraper/src/integration.test.ts | 38 +++++++++++++- 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/wee/package-lock.json b/wee/package-lock.json index 63ef94ee4..93f76b046 100644 --- a/wee/package-lock.json +++ b/wee/package-lock.json @@ -38,6 +38,7 @@ "clsx": "^2.1.1", "file-saver": "^2.0.5", "framer-motion": "^11.2.6", + "ioredis": "^5.4.1", "jest-styled-components": "^7.2.0", "jsdom": "^24.1.0", "jspdf": "^2.5.1", @@ -3041,6 +3042,11 @@ "@swc/helpers": "^0.5.0" } }, + "node_modules/@ioredis/commands": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@ioredis/commands/-/commands-1.2.0.tgz", + "integrity": "sha512-Sx1pU8EM64o2BrqNpEO1CNLtKQwyhuXuqyfH7oGKCk+1a33d2r5saW8zNwm3j6BTExtjrv2BxTgzzkMwts6vGg==" + }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -17155,6 +17161,14 @@ "node": ">=0.4.0" } }, + "node_modules/denque": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/denque/-/denque-2.1.0.tgz", + "integrity": "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==", + "engines": { + "node": ">=0.10" + } + }, "node_modules/depd": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", @@ -21222,6 +21236,29 @@ "loose-envify": "^1.0.0" } }, + "node_modules/ioredis": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/ioredis/-/ioredis-5.4.1.tgz", + "integrity": "sha512-2YZsvl7jopIa1gaePkeMtd9rAcSjOOjPtpcLlOeusyO+XH2SK5ZcT+UCrElPP+WVIInh2TzeI4XW9ENaSLVVHA==", + "dependencies": { + "@ioredis/commands": "^1.1.1", + "cluster-key-slot": "^1.1.0", + "debug": "^4.3.4", + "denque": "^2.1.0", + "lodash.defaults": "^4.2.0", + "lodash.isarguments": "^3.1.0", + "redis-errors": "^1.2.0", + "redis-parser": "^3.0.0", + "standard-as-callback": "^2.1.0" + }, + "engines": { + "node": ">=12.22.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/ioredis" + } + }, "node_modules/ip-address": { "version": "9.0.5", "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-9.0.5.tgz", @@ -25192,6 +25229,11 @@ "integrity": "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==", "license": "MIT" }, + "node_modules/lodash.defaults": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/lodash.defaults/-/lodash.defaults-4.2.0.tgz", + "integrity": "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ==" + }, "node_modules/lodash.flattendeep": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/lodash.flattendeep/-/lodash.flattendeep-4.4.0.tgz", @@ -25211,6 +25253,11 @@ "integrity": "sha512-z+Uw/vLuy6gQe8cfaFWD7p0wVv8fJl3mbzXh33RS+0oW2wvUqiRXiQ69gLWSLpgB5/6sU+r6BlQR0MBILadqTQ==", "license": "MIT" }, + "node_modules/lodash.isarguments": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/lodash.isarguments/-/lodash.isarguments-3.1.0.tgz", + "integrity": "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg==" + }, "node_modules/lodash.isequal": { "version": "4.5.0", "resolved": "https://registry.npmjs.org/lodash.isequal/-/lodash.isequal-4.5.0.tgz", @@ -31292,6 +31339,25 @@ "@redis/time-series": "1.0.5" } }, + "node_modules/redis-errors": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/redis-errors/-/redis-errors-1.2.0.tgz", + "integrity": "sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w==", + "engines": { + "node": ">=4" + } + }, + "node_modules/redis-parser": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-3.0.0.tgz", + "integrity": "sha512-DJnGAeenTdpMEH6uAJRK/uiyEIH9WVsUmoLwzudwGJUwZPp80PDBWPHXSAGNPwNvIXAbe7MSUB1zQFugFml66A==", + "dependencies": { + "redis-errors": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, "node_modules/reflect-metadata": { "version": "0.1.14", "resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.1.14.tgz", @@ -32550,6 +32616,11 @@ "node": ">=0.1.14" } }, + "node_modules/standard-as-callback": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/standard-as-callback/-/standard-as-callback-2.1.0.tgz", + "integrity": "sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A==" + }, "node_modules/statuses": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", diff --git a/wee/package.json b/wee/package.json index 83c0fb0f4..39a2ec1c4 100644 --- a/wee/package.json +++ b/wee/package.json @@ -39,6 +39,7 @@ "clsx": "^2.1.1", "file-saver": "^2.0.5", "framer-motion": "^11.2.6", + "ioredis": "^5.4.1", "jest-styled-components": "^7.2.0", "jsdom": "^24.1.0", "jspdf": "^2.5.1", diff --git a/wee/webscraper/src/integration.test.ts b/wee/webscraper/src/integration.test.ts index 77590c9f6..1bf2abfba 100644 --- a/wee/webscraper/src/integration.test.ts +++ b/wee/webscraper/src/integration.test.ts @@ -2,16 +2,50 @@ import { Test, TestingModule } from '@nestjs/testing'; import { HttpStatus, INestApplication } from '@nestjs/common'; import request from 'supertest'; import { ScraperModule } from './scraper.module'; +import { ConfigModule, ConfigService } from '@nestjs/config'; + +jest.mock('ioredis', () => { + class MockRedis {} + return { default: jest.fn().mockImplementation(() => new MockRedis()) }; +}); + +jest.mock('cache-manager-redis-yet', () => ({ + redisStore: jest.fn().mockImplementation(() => ({ + set: jest.fn(), + get: jest.fn(), + del: jest.fn(), + reset: jest.fn(), + keys: jest.fn(), + })), +})); describe('ScraperController', () => { let app: INestApplication; + let configService: ConfigService; beforeAll(async () => { const moduleFixture: TestingModule = await Test.createTestingModule({ - imports: [ScraperModule], - }).compile(); + imports: [ScraperModule, ConfigModule.forRoot()], + }) + .overrideProvider(ConfigService) + .useValue({ + get: jest.fn((key: string) => { + switch (key) { + case 'redis.host': + return 'localhost'; + case 'redis.port': + return 6379; + case 'redis.password': + return 'testPassword'; + default: + return null; + } + }), + }) + .compile(); app = moduleFixture.createNestApplication(); + configService = moduleFixture.get(ConfigService); await app.init(); }, 60000);