From f2ba0704aeb36b6ea6ec3a4b8b2ef5d83742916f Mon Sep 17 00:00:00 2001 From: "Seongrok.lee" <55491354+argon1025@users.noreply.github.com> Date: Wed, 21 Feb 2024 22:15:01 +0900 Subject: [PATCH] =?UTF-8?q?#59=20=EC=84=9C=EC=9A=B8=EC=98=A4=EB=B9=A0=20?= =?UTF-8?q?=ED=8C=8C=EC=8B=B1=20=EA=B8=B0=EB=8A=A5=EC=B6=94=EA=B0=80=20=20?= =?UTF-8?q?(#60)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: 서울오빠 파싱기능 추가 * feat: 서울오빠 파싱 처리 이벤트 등록 * feat: 캠페인 조회 시 서울오빠 검색 가능하도록 추가 * fix: 최신순 게시글 파싱하도록 URL 수정 --- .../type/campaign.service.interface.ts | 2 + .../parsing-event/parser/seoulouba.parser.ts | 275 ++++++++++++++++++ .../parser/type/seoulouba.parser.interface.ts | 61 ++++ .../parsing-event/parsing-event.batch.ts | 10 + .../parsing-event/parsing-event.module.ts | 3 + .../type/parsing-event.interface.ts | 2 + 6 files changed, 353 insertions(+) create mode 100644 src/library/parsing-event/parser/seoulouba.parser.ts create mode 100644 src/library/parsing-event/parser/type/seoulouba.parser.interface.ts diff --git a/src/campaign/type/campaign.service.interface.ts b/src/campaign/type/campaign.service.interface.ts index 686e794..4bb3fee 100644 --- a/src/campaign/type/campaign.service.interface.ts +++ b/src/campaign/type/campaign.service.interface.ts @@ -118,5 +118,7 @@ export namespace ICampaignService { export enum ResourceProvider { /** 디너의 여왕 */ DINNER_QUEEN = 'DINNER_QUEEN', + /** Seoulouba */ + SEOUL_OUBA = 'SEOUL_OUBA', } } diff --git a/src/library/parsing-event/parser/seoulouba.parser.ts b/src/library/parsing-event/parser/seoulouba.parser.ts new file mode 100644 index 0000000..79cfcdc --- /dev/null +++ b/src/library/parsing-event/parser/seoulouba.parser.ts @@ -0,0 +1,275 @@ +/* eslint-disable @typescript-eslint/no-loop-func */ +import { HttpService } from '@nestjs/axios'; +import { Injectable } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import { lastValueFrom } from 'rxjs'; +import * as cheerio from 
'cheerio'; +import { DateTime } from 'luxon'; +import puppeteer from 'puppeteer'; +import { setTimeout } from 'timers/promises'; +import { ParseEventLogger } from '../../custom-logger/parse-event-logger/parse-event.logger'; +import { PrismaService } from '../../prisma/prisma.service'; +import { ISeouloubaParser } from './type/seoulouba.parser.interface'; +import { chunkArray } from '../../utils/chunkArray'; + +@Injectable() +export class SeouloubaParser implements ISeouloubaParser.Base { /* Seoulouba campaign parser: lists campaign ids with puppeteer, reads each detail page with cheerio, and upserts the result through Prisma */ + private readonly NODE_ENV = this.configService.getOrThrow('NODE_ENV'); + + constructor( + private readonly httpService: HttpService, + private readonly prismaService: PrismaService, + private readonly logger: ParseEventLogger, + private readonly configService: ConfigService, + ) {} + + async runWorker(options: ISeouloubaParser.RunWorkerOptions): Promise { /* resolves to { total, successCount, failedCount } for the processed id list */ + const { postIdList, eventId } = options; + const chunkSize = 1; + const requestDelay = 1000; // wait time after each detail request (passed to setTimeout from timers/promises, i.e. milliseconds) + this.logger.log(`작업 시작`, 'SeouloubaParser', eventId); + + // SECTION: when no ids were requested, fetch the full Seoulouba campaign id list + const taskList: string[] = postIdList ?? 
[]; + if (taskList.length === 0) { + try { + // these crawls are awaited one at a time because of browser load + const visitCategory = await this.getAllIdList('https://www.seoulouba.co.kr/campaign/?cat=377&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest'); + const shippingCategory = await this.getAllIdList('https://www.seoulouba.co.kr/campaign/?cat=383&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest'); + const reporterCategory = await this.getAllIdList('https://www.seoulouba.co.kr/campaign/?cat=448&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest'); + const reviewCategory = await this.getAllIdList('https://www.seoulouba.co.kr/campaign/?cat=449&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest'); + // remove duplicates + taskList.push(...Array.from(new Set([...visitCategory, ...shippingCategory, ...reporterCategory, ...reviewCategory]))); + } catch (error) { + this.logger.error(`전체 리스트를 가져오는데 실패했습니다`, error, 'SeouloubaParser', eventId); + } + this.logger.log(`갱신 대기 리스트 ${taskList.length}개`, 'SeouloubaParser'); + } + + // SECTION: split the list into chunks of chunkSize, then fetch and store the details + let successCount = 0; + let failedCount = 0; + const chunkedTaskList = chunkArray(chunkSize, taskList); + for (const ids of chunkedTaskList) { + // fetch the detail and store it + const taskResult = await Promise.allSettled( + ids.map(async (id) => { + const detail = await this.getDetailById(id); + await this.upsertCampaign(detail); + }), + ); + + // check the results and stop processing the remaining chunks if any task failed + const failedTask = taskResult.filter((result) => result.status === 'rejected') as PromiseRejectedResult[]; + const successTask = taskResult.filter((result) => result.status === 'fulfilled') as PromiseFulfilledResult[]; + successCount += successTask.length; + if (failedTask.length > 0) { + failedCount += failedTask.length; + failedTask.forEach((failTask) => { + this.logger.error(`처리 실패`, failTask.reason, 'SeouloubaParser', eventId); + }); + break; + } + + // wait before the next chunk + await setTimeout(requestDelay); + } + + this.logger.log( + `작업 종료 총 ${taskList.length}건, 완료 ${successCount}건, 실패 ${failedCount}건, 보류 ${taskList.length - 
successCount - failedCount}건`, + 'SeouloubaParser', + eventId, + ); + + return { total: taskList.length, successCount, failedCount }; + } + + async getAllIdList(url: string): Promise { + const browser = await puppeteer.launch({ + headless: 'new', + args: ['--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox'], + ...(this.NODE_ENV === 'prod' ? { executablePath: '/usr/bin/chromium-browser' } : {}), + }); + const page = await browser.newPage(); + await page.goto(url, { + waitUntil: 'domcontentloaded', + timeout: 1200000, + }); + + // load every campaign post + const scrollDelay = 300; + while (true) { + // check whether a post marked as recruitment-closed is already on the page + const isRecruitmentClosed = await page.evaluate(() => { + const element = Array.from(document.querySelectorAll('.load_blind_box strong')); + return element.some((content) => content.textContent.includes('모집마감')); + }); + if (isRecruitmentClosed) break; + + // request the next page + try { + await page.click('#list_more_btn'); + await page.waitForResponse((response) => { + return response.url().includes('/campaign/ajax/list.ajax.php') && response.status() === 200; + }); + } catch (error) { + break; + } + await page.waitForTimeout(scrollDelay); // delay + } + + // once loading is finished, extract the campaign ids + const pageIds = await page.evaluate(() => { + const links = document.querySelectorAll('.load_campaign a.tum_img'); + const hrefs = Array.from(links) + .map((link) => link.getAttribute('href')) + .map((href) => href.match(/c=(\d+)/).pop()); /* NOTE(review): match() returns null for an href without "c=" followed by digits, which would throw here */ + return hrefs; + }); + await browser.close(); /* NOTE(review): not reached if anything above throws, so the browser would stay open; consider try/finally */ + return pageIds; + } + + async getDetailById(id: string): Promise { + const requestUrl = `https://www.seoulouba.co.kr/campaign/?c=${id}`; + const detailResponse = await lastValueFrom(this.httpService.get(requestUrl)); + + const $ = cheerio.load(detailResponse.data); + + const title = this.getTitle($); + const thumbnail = this.getThumbnail($); + const address = this.getAddress($); + const category = this.getCategory($); + const { startedAt, endedAt, drawAt } = this.getDateTime($); + const { 
recruitCount, applyCount } = this.getRecruitment($); + + return { + id, + title, + thumbnail, + address, + category, + recruitCount, + applyCount, + originUrl: requestUrl, + startedAt, + endedAt, + drawAt, + }; + } + + /** + * Gets the campaign title + */ + private getTitle($: cheerio.CheerioAPI) { + return $('h2.tit_v2').text().trim(); + } + + /** + * Gets the campaign's application start date, application end date, and draw date + */ + private getDateTime($: cheerio.CheerioAPI) { + // extract the date text + + const [startedAt, endedAt] = $('.campaign_guide_li .period:eq(0)').text().trim().split(' ~ '); + const drawAt = $('.campaign_guide_li .period:eq(1)').text().trim(); + + return { + startedAt: DateTime.fromFormat(startedAt, 'yy-MM-dd').minus({ hours: 9 }).toJSDate(), /* NOTE(review): the 9-hour shift presumably maps KST to UTC; confirm against the server time zone */ + endedAt: DateTime.fromFormat(endedAt, 'yy-MM-dd').minus({ hours: 9 }).toJSDate(), + drawAt: DateTime.fromFormat(drawAt, 'yy-MM-dd').minus({ hours: 9 }).toJSDate(), + }; + } + + /** + * Gets the campaign thumbnail + */ + private getThumbnail($: cheerio.CheerioAPI) { + const imageUrl = $('.thumb.cam_image').css('background'); + return imageUrl.replace(/^url\(['"](.+)['"]\)/, '$1'); + } + + /** + * Gets the campaign's recruit count and applicant count + */ + private getRecruitment($: cheerio.CheerioAPI) { + // parse the "creator" value + const creatorText = $('li#cam_apply button.tap_menu_btn span').text(); + + // extract the numbers + const regex = /\d+/g; + const [applyCount, recruitCount] = creatorText.match(regex); /* first number is read as the applicant count, second as the recruit count; NOTE(review): match() returns null when the text has no digits, which would throw here */ + + return { + recruitCount: Number(recruitCount), + applyCount: Number(applyCount), + }; + } + + private getAddress($: cheerio.CheerioAPI): string { /* returns the text of div.map_adress span.txt_short */ + const address = $('div.map_adress span.txt_short').text(); + + return address; + } + + /** + * Gets the campaign category + */ + private getCategory($: cheerio.CheerioAPI) { + const chTagText = $('.ch_tag').text(); + switch (chTagText) { + case '방문형': { + return '방문'; + } + case '배송형': { + return '배송'; + } + case '기자단(배포형)': { + return '기자단'; + } + case '서비스형': + case '구매평플러스': + case '구매평': + default: { + return '기타'; + } + } + } + + private async upsertCampaign(detail: 
ISeouloubaParser.GetDetailByIdResult) { /* creates or updates the campaign row keyed by duplicateId SEOUL_OUBA_ plus the post id */ + const { originUrl, title, category, thumbnail, address, recruitCount, applyCount, startedAt, endedAt, drawAt } = detail; + await this.prismaService.campaign.upsert({ + where: { duplicateId: `SEOUL_OUBA_${detail.id}` }, + create: { + originUrl, + title, + category, + thumbnail, + address, + recruitCount, + applyCount, + startedAt, + endedAt, + drawAt, + duplicateId: `SEOUL_OUBA_${detail.id}`, + resourceProvider: 'SEOUL_OUBA', + targetPlatforms: 'blog', + }, + update: { + originUrl, + title, + category, + thumbnail, + address, + recruitCount, + applyCount, + startedAt, + endedAt, + drawAt, + duplicateId: `SEOUL_OUBA_${detail.id}`, + resourceProvider: 'SEOUL_OUBA', + targetPlatforms: 'blog', + }, + }); + } +} diff --git a/src/library/parsing-event/parser/type/seoulouba.parser.interface.ts b/src/library/parsing-event/parser/type/seoulouba.parser.interface.ts new file mode 100644 index 0000000..eb6828f --- /dev/null +++ b/src/library/parsing-event/parser/type/seoulouba.parser.interface.ts @@ -0,0 +1,61 @@ +export const SEOULOUBA_PARSER = Symbol('SEOULOUBA_PARSER'); + +export namespace ISeouloubaParser { + export interface Base { + /** + * Fetches campaign information and stores it + */ + runWorker(options: RunWorkerOptions): Promise; + + /** + * Gets the id list of all campaign posts currently in progress + */ + getAllIdList(url: string): Promise; + + /** + * Gets the detail of a campaign + */ + getDetailById(id: string): Promise; + } + + export interface RunWorkerOptions { + /** Set when only specific posts should have their details updated */ + postIdList?: string[]; + /** Event id */ + eventId: string; + } + + export interface RunWorkerResult { + /** Total number of tasks */ + total: number; + /** Number of succeeded tasks */ + successCount: number; + /** Number of failed tasks */ + failedCount: number; + } + + export type GetAllIdListResult = string[]; + + export interface GetDetailByIdResult { + /** Seoulouba post id */ + id: string; + title: string; + thumbnail: string; + /** Campaign visit address */ + address: string; + /** Campaign type ('방문', '배송', '기자단', '기타') */ + category: string; + /** Campaign recruit count 
*/ + recruitCount: number; + /** Campaign applicant count */ + applyCount: number; + /** Campaign page address (URL) */ + originUrl: string; + /** Campaign application start date */ + startedAt: Date; + /** Campaign application end date */ + endedAt: Date; + /** Winner announcement date */ + drawAt: Date; + } +} diff --git a/src/library/parsing-event/parsing-event.batch.ts b/src/library/parsing-event/parsing-event.batch.ts index 4bccf16..b367898 100644 --- a/src/library/parsing-event/parsing-event.batch.ts +++ b/src/library/parsing-event/parsing-event.batch.ts @@ -3,6 +3,7 @@ import { Cron, CronExpression } from '@nestjs/schedule'; import { ParseEventLogger } from '../custom-logger/parse-event-logger/parse-event.logger'; import { PARSING_EVENT_SERVICE, IParsingEventService } from './type/parsing-event.interface'; import { DINNER_QUEEN_PARSER, IDinnerQueenParser } from './parser/type/dinner-queen.parser.interface'; +import { ISeouloubaParser, SEOULOUBA_PARSER } from './parser/type/seoulouba.parser.interface'; @Injectable() export class ParsingEventWorkerBatch { @@ -13,6 +14,8 @@ export class ParsingEventWorkerBatch { private readonly parsingEventService: IParsingEventService.Base, @Inject(DINNER_QUEEN_PARSER) private readonly dinnerQueenParser: IDinnerQueenParser.Base, + @Inject(SEOULOUBA_PARSER) + private readonly seouloubaParser: ISeouloubaParser.Base, private readonly logger: ParseEventLogger, ) {} @@ -54,6 +57,13 @@ export class ParsingEventWorkerBatch { }); break; } + case IParsingEventService.EventType.SEOUL_OUBA: { + eventResult = await this.seouloubaParser.runWorker({ + eventId: event.id, + ...(event?.eventMessage?.targetId && { postIdList: [event.eventMessage.targetId] }), + }); + break; + } default: { this.logger.error(`지원하지 않는 이벤트 타입입니다`, event, 'parseEventWorker', event.id); eventResult.total = 1; diff --git a/src/library/parsing-event/parsing-event.module.ts b/src/library/parsing-event/parsing-event.module.ts index 1adaa0c..cabf03c 100644 --- a/src/library/parsing-event/parsing-event.module.ts +++ 
b/src/library/parsing-event/parsing-event.module.ts @@ -7,12 +7,15 @@ import { DinnerQueenParser } from './parser/dinner-queen.parser'; import { ParsingEventWorkerBatch } from './parsing-event.batch'; import { DINNER_QUEEN_PARSER } from './parser/type/dinner-queen.parser.interface'; import { CustomLoggerModule } from '../custom-logger/custom-logger.module'; +import { SEOULOUBA_PARSER } from './parser/type/seoulouba.parser.interface'; +import { SeouloubaParser } from './parser/seoulouba.parser'; @Module({ imports: [HttpModule, PrismaModule, CustomLoggerModule], providers: [ { provide: PARSING_EVENT_SERVICE, useClass: ParsingEventService }, { provide: DINNER_QUEEN_PARSER, useClass: DinnerQueenParser }, + { provide: SEOULOUBA_PARSER, useClass: SeouloubaParser }, ParsingEventWorkerBatch, ], exports: [PARSING_EVENT_SERVICE], diff --git a/src/library/parsing-event/type/parsing-event.interface.ts b/src/library/parsing-event/type/parsing-event.interface.ts index 39899e0..ceb8a83 100644 --- a/src/library/parsing-event/type/parsing-event.interface.ts +++ b/src/library/parsing-event/type/parsing-event.interface.ts @@ -67,6 +67,8 @@ export namespace IParsingEventService { export enum EventType { /** 디너의 여왕 */ DINNER_QUEEN = 'DINNER_QUEEN', + /** Seoulouba */ + SEOUL_OUBA = 'SEOUL_OUBA', } export interface EventPayload {