Skip to content

Commit

Permalink
#59 서울오빠 파싱 기능추가 (#60)
Browse files Browse the repository at this point in the history
* feat: 서울오빠 파싱기능 추가

* feat: 서울오빠 파싱 처리 이벤트 등록

* feat: 캠페인 조회 시 서울오빠 검색 가능하도록 추가

* fix: 최신순 게시글 파싱하도록 URL 수정
  • Loading branch information
argon1025 authored Feb 21, 2024
1 parent e6fdec4 commit f2ba070
Show file tree
Hide file tree
Showing 6 changed files with 353 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/campaign/type/campaign.service.interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,7 @@ export namespace ICampaignService {
/**
 * Identifiers of the external sites that campaigns are collected from.
 * Each member's value is the same string as its name.
 */
export enum ResourceProvider {
/** Dinner Queen (디너의 여왕) */
DINNER_QUEEN = 'DINNER_QUEEN',
/** Seoul Oppa (서울 오빠) */
SEOUL_OUBA = 'SEOUL_OUBA',
}
}
275 changes: 275 additions & 0 deletions src/library/parsing-event/parser/seoulouba.parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
/* eslint-disable @typescript-eslint/no-loop-func */
import { HttpService } from '@nestjs/axios';
import { Injectable } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import { lastValueFrom } from 'rxjs';
import * as cheerio from 'cheerio';
import { DateTime } from 'luxon';
import puppeteer from 'puppeteer';
import { setTimeout } from 'timers/promises';
import { ParseEventLogger } from '../../custom-logger/parse-event-logger/parse-event.logger';
import { PrismaService } from '../../prisma/prisma.service';
import { ISeouloubaParser } from './type/seoulouba.parser.interface';
import { chunkArray } from '../../utils/chunkArray';

/**
 * Parser for campaign posts published on seoulouba.co.kr.
 *
 * The worker collects campaign ids (either the ids it was given, or every id
 * listed in the site's category pages), scrapes each campaign detail page and
 * upserts the result into the `campaign` table through Prisma.
 */
@Injectable()
export class SeouloubaParser implements ISeouloubaParser.Base {
  /** Category listing pages (sorted by latest) that are crawled when no ids are requested. */
  private static readonly CATEGORY_LIST_URLS: readonly string[] = [
    'https://www.seoulouba.co.kr/campaign/?cat=377&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest',
    'https://www.seoulouba.co.kr/campaign/?cat=383&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest',
    'https://www.seoulouba.co.kr/campaign/?cat=448&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest',
    'https://www.seoulouba.co.kr/campaign/?cat=449&qq=&q=&q1=&q2=&ar1=&ar2=&&sort=latest',
  ];

  private readonly NODE_ENV = this.configService.getOrThrow<string>('NODE_ENV');

  constructor(
    private readonly httpService: HttpService,
    private readonly prismaService: PrismaService,
    private readonly logger: ParseEventLogger,
    private readonly configService: ConfigService,
  ) {}

  /**
   * Fetches campaign details and stores them.
   *
   * @param options.postIdList ids to refresh; when omitted or empty, every id found in the category listings is used
   * @param options.eventId id of the parsing event, attached to every log line
   * @returns total number of tasks plus how many succeeded and failed; processing stops at the first failed chunk
   */
  async runWorker(options: ISeouloubaParser.RunWorkerOptions): Promise<ISeouloubaParser.RunWorkerResult> {
    const { postIdList, eventId } = options;
    const chunkSize = 1;
    const requestDelay = 1000; // pause after each detail request (ms)
    this.logger.log(`작업 시작`, 'SeouloubaParser', eventId);

    // SECTION: when no ids were requested, load the full Seoulouba listing.
    // Copy the input so the caller's array is never mutated by the push below.
    const taskList: string[] = [...(postIdList ?? [])];
    if (taskList.length === 0) {
      try {
        // Categories are crawled one after another: each crawl launches a browser,
        // so running them concurrently would overload the host.
        const collectedIds = new Set<string>();
        for (const categoryUrl of SeouloubaParser.CATEGORY_LIST_URLS) {
          const categoryIds = await this.getAllIdList(categoryUrl);
          categoryIds.forEach((id) => collectedIds.add(id));
        }
        // The Set removes ids that appear in more than one category.
        taskList.push(...Array.from(collectedIds));
      } catch (error) {
        this.logger.error(`전체 리스트를 가져오는데 실패했습니다`, error, 'SeouloubaParser', eventId);
      }
      this.logger.log(`갱신 대기 리스트 ${taskList.length}개`, 'SeouloubaParser', eventId);
    }

    // SECTION: split the list into chunks of `chunkSize`, then fetch and store each chunk.
    let successCount = 0;
    let failedCount = 0;
    const chunkedTaskList = chunkArray(chunkSize, taskList);
    for (const ids of chunkedTaskList) {
      const taskResult = await Promise.allSettled(
        ids.map(async (id) => {
          const detail = await this.getDetailById(id);
          await this.upsertCampaign(detail);
        }),
      );

      const failedTask = taskResult.filter((result): result is PromiseRejectedResult => result.status === 'rejected');
      successCount += taskResult.length - failedTask.length;

      // Any failure in a chunk aborts the remaining work; the rest is reported as "held".
      if (failedTask.length > 0) {
        failedCount += failedTask.length;
        failedTask.forEach((failTask) => {
          this.logger.error(`처리 실패`, failTask.reason, 'SeouloubaParser', eventId);
        });
        break;
      }

      // Throttle requests to the target site.
      await setTimeout(requestDelay);
    }

    this.logger.log(
      `작업 종료 총 ${taskList.length}건, 완료 ${successCount}건, 실패 ${failedCount}건, 보류 ${taskList.length - successCount - failedCount}건`,
      'SeouloubaParser',
      eventId,
    );

    return { total: taskList.length, successCount, failedCount };
  }

  /**
   * Collects the ids of the campaigns listed on a category page.
   *
   * The page is opened in a headless browser and the "more" button is clicked
   * until a closed-recruitment post shows up or no further page can be loaded.
   * The browser is always closed, even when navigation or scraping throws.
   *
   * @param url category listing url
   * @returns campaign ids extracted from the `c=<digits>` part of each post link
   */
  async getAllIdList(url: string): Promise<ISeouloubaParser.GetAllIdListResult> {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: ['--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox'],
      ...(this.NODE_ENV === 'prod' ? { executablePath: '/usr/bin/chromium-browser' } : {}),
    });

    try {
      const page = await browser.newPage();
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: 1200000,
      });

      // Load every campaign post.
      const scrollDelay = 300;
      while (true) {
        // Stop as soon as a post marked as "recruitment closed" is rendered.
        const isRecruitmentClosed = await page.evaluate(() => {
          const labels = Array.from(document.querySelectorAll('.load_blind_box strong'));
          return labels.some((content) => (content.textContent ?? '').includes('모집마감'));
        });
        if (isRecruitmentClosed) break;

        // Request the next page. The response listener is registered before the
        // click so a fast response cannot slip past it.
        try {
          await Promise.all([
            page.waitForResponse((response) => {
              return response.url().includes('/campaign/ajax/list.ajax.php') && response.status() === 200;
            }),
            page.click('#list_more_btn'),
          ]);
        } catch (error) {
          // Best effort: a missing button or a timed-out response ends the paging.
          break;
        }
        await setTimeout(scrollDelay); // give the page time to render the new posts
      }

      // Once loading is done, extract the campaign ids; links without a `c=` id are skipped.
      return await page.evaluate(() => {
        const ids: string[] = [];
        document.querySelectorAll('.load_campaign a.tum_img').forEach((link) => {
          const matched = (link.getAttribute('href') ?? '').match(/c=(\d+)/);
          if (matched) ids.push(matched[1]);
        });
        return ids;
      });
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetches and parses the detail page of a single campaign.
   *
   * @param id Seoulouba post id
   * @returns the parsed campaign fields together with the requested url as `originUrl`
   */
  async getDetailById(id: string): Promise<ISeouloubaParser.GetDetailByIdResult> {
    const requestUrl = `https://www.seoulouba.co.kr/campaign/?c=${id}`;
    const detailResponse = await lastValueFrom(this.httpService.get(requestUrl));

    const $ = cheerio.load(detailResponse.data);

    const title = this.getTitle($);
    const thumbnail = this.getThumbnail($);
    const address = this.getAddress($);
    const category = this.getCategory($);
    const { startedAt, endedAt, drawAt } = this.getDateTime($);
    const { recruitCount, applyCount } = this.getRecruitment($);

    return {
      id,
      title,
      thumbnail,
      address,
      category,
      recruitCount,
      applyCount,
      originUrl: requestUrl,
      startedAt,
      endedAt,
      drawAt,
    };
  }

  /**
   * Reads the campaign title.
   */
  private getTitle($: cheerio.CheerioAPI) {
    return $('h2.tit_v2').text().trim();
  }

  /**
   * Reads the application start date, application end date and draw date.
   *
   * @throws Error when any of the three dates is missing or not in `yy-MM-dd` form
   */
  private getDateTime($: cheerio.CheerioAPI) {
    // The first period holds "<start> ~ <end>", the second one the draw date.
    const [startedAt, endedAt] = $('.campaign_guide_li .period:eq(0)').text().trim().split(' ~ ');
    const drawAt = $('.campaign_guide_li .period:eq(1)').text().trim();

    return {
      startedAt: this.parseDate(startedAt),
      endedAt: this.parseDate(endedAt),
      drawAt: this.parseDate(drawAt),
    };
  }

  /**
   * Converts a `yy-MM-dd` string into a Date shifted back by nine hours.
   *
   * NOTE(review): the text is parsed in the process's default zone, so the
   * nine-hour shift yields Korean midnight only when that zone is UTC — confirm
   * the deployment time zone.
   *
   * @throws Error when the text cannot be parsed, instead of returning an Invalid Date
   */
  private parseDate(text: string | undefined): Date {
    const parsed = DateTime.fromFormat(text ?? '', 'yy-MM-dd');
    if (!parsed.isValid) {
      throw new Error(`날짜 형식이 올바르지 않습니다: "${text}"`);
    }
    return parsed.minus({ hours: 9 }).toJSDate();
  }

  /**
   * Reads the campaign thumbnail url from the `background` style of the image box.
   *
   * Quoted and unquoted `url(...)` values are supported and anything following
   * the url (position, repeat, ...) is dropped. A value without `url(...)` is
   * returned as is.
   *
   * @throws Error when the element or its `background` style is missing
   */
  private getThumbnail($: cheerio.CheerioAPI) {
    const background = $('.thumb.cam_image').css('background');
    if (background === undefined) {
      throw new Error('썸네일 정보를 찾을 수 없습니다');
    }

    const matched = background.match(/url\(\s*(?:'([^']*)'|"([^"]*)"|([^'"\s)]+))\s*\)/);
    if (!matched) return background;
    return matched[1] ?? matched[2] ?? matched[3] ?? background;
  }

  /**
   * Reads the number of applicants and the number of recruited creators.
   *
   * @throws Error when the counter text does not contain both numbers
   */
  private getRecruitment($: cheerio.CheerioAPI) {
    // Text of the "creator" tab; the first number is the applicant count, the second the recruit count.
    const creatorText = $('li#cam_apply button.tap_menu_btn span').text();

    const numbers = creatorText.match(/\d+/g) ?? [];
    if (numbers.length < 2) {
      throw new Error(`모집 인원 정보를 찾을 수 없습니다: "${creatorText}"`);
    }
    const [applyCount, recruitCount] = numbers;

    return {
      recruitCount: Number(recruitCount),
      applyCount: Number(applyCount),
    };
  }

  /**
   * Reads the visit address of the campaign (empty when the page has none).
   */
  private getAddress($: cheerio.CheerioAPI): string {
    return $('div.map_adress span.txt_short').text();
  }

  /**
   * Maps the campaign type tag of the page onto the stored category name.
   * Surrounding whitespace is ignored; unknown tags fall back to '기타'.
   */
  private getCategory($: cheerio.CheerioAPI) {
    const chTagText = $('.ch_tag').text().trim();
    switch (chTagText) {
      case '방문형': {
        return '방문';
      }
      case '배송형': {
        return '배송';
      }
      case '기자단(배포형)': {
        return '기자단';
      }
      case '서비스형':
      case '구매평플러스':
      case '구매평':
      default: {
        return '기타';
      }
    }
  }

  /**
   * Creates the campaign, or overwrites it when a row with the same
   * `duplicateId` (`SEOUL_OUBA_<id>`) already exists.
   */
  private async upsertCampaign(detail: ISeouloubaParser.GetDetailByIdResult) {
    const { originUrl, title, category, thumbnail, address, recruitCount, applyCount, startedAt, endedAt, drawAt } = detail;
    const duplicateId = `SEOUL_OUBA_${detail.id}`;

    // Create and update write exactly the same columns, so the payload is built once.
    const payload = {
      originUrl,
      title,
      category,
      thumbnail,
      address,
      recruitCount,
      applyCount,
      startedAt,
      endedAt,
      drawAt,
      duplicateId,
      resourceProvider: 'SEOUL_OUBA',
      targetPlatforms: 'blog',
    } as const;

    await this.prismaService.campaign.upsert({
      where: { duplicateId },
      create: payload,
      update: payload,
    });
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/** Injection token under which the Seoulouba parser implementation is provided. */
export const SEOULOUBA_PARSER = Symbol('SEOULOUBA_PARSER');

/** Contract and data shapes of the Seoulouba campaign parser. */
export namespace ISeouloubaParser {
export interface Base {
/**
 * Fetches campaign information and stores it.
 */
runWorker(options: RunWorkerOptions): Promise<RunWorkerResult>;

/**
 * Returns the ids of all campaign posts that are currently in progress
 * on the given listing page.
 */
getAllIdList(url: string): Promise<GetAllIdListResult>;

/**
 * Fetches the detail information of a single campaign.
 */
getDetailById(id: string): Promise<GetDetailByIdResult>;
}

export interface RunWorkerOptions {
/** Set when only specific posts should have their details refreshed */
postIdList?: string[];
/** Event id */
eventId: string;
}

export interface RunWorkerResult {
/** Total number of tasks */
total: number;
/** Number of tasks that succeeded */
successCount: number;
/** Number of tasks that failed */
failedCount: number;
}

/** List of Seoulouba post ids */
export type GetAllIdListResult = string[];

export interface GetDetailByIdResult {
/** Seoulouba post id */
id: string;
/** Campaign title */
title: string;
/** Campaign thumbnail image url */
thumbnail: string;
/** Address to visit for the campaign */
address: string;
/** Campaign type (방문, 배송, 기자단, 기타) */
category: string;
/** Number of people the campaign recruits */
recruitCount: number;
/** Number of people who applied to the campaign */
applyCount: number;
/** Url of the campaign page */
originUrl: string;
/** Date the application period starts */
startedAt: Date;
/** Date the application period ends */
endedAt: Date;
/** Date the winners are announced */
drawAt: Date;
}
}
10 changes: 10 additions & 0 deletions src/library/parsing-event/parsing-event.batch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { Cron, CronExpression } from '@nestjs/schedule';
import { ParseEventLogger } from '../custom-logger/parse-event-logger/parse-event.logger';
import { PARSING_EVENT_SERVICE, IParsingEventService } from './type/parsing-event.interface';
import { DINNER_QUEEN_PARSER, IDinnerQueenParser } from './parser/type/dinner-queen.parser.interface';
import { ISeouloubaParser, SEOULOUBA_PARSER } from './parser/type/seoulouba.parser.interface';

@Injectable()
export class ParsingEventWorkerBatch {
Expand All @@ -13,6 +14,8 @@ export class ParsingEventWorkerBatch {
private readonly parsingEventService: IParsingEventService.Base,
@Inject(DINNER_QUEEN_PARSER)
private readonly dinnerQueenParser: IDinnerQueenParser.Base,
@Inject(SEOULOUBA_PARSER)
private readonly seouloubaParser: ISeouloubaParser.Base,
private readonly logger: ParseEventLogger,
) {}

Expand Down Expand Up @@ -54,6 +57,13 @@ export class ParsingEventWorkerBatch {
});
break;
}
case IParsingEventService.EventType.SEOUL_OUBA: {
eventResult = await this.seouloubaParser.runWorker({
eventId: event.id,
...(event?.eventMessage?.targetId && { postIdList: [event.eventMessage.targetId] }),
});
break;
}
default: {
this.logger.error(`지원하지 않는 이벤트 타입입니다`, event, 'parseEventWorker', event.id);
eventResult.total = 1;
Expand Down
3 changes: 3 additions & 0 deletions src/library/parsing-event/parsing-event.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@ import { DinnerQueenParser } from './parser/dinner-queen.parser';
import { ParsingEventWorkerBatch } from './parsing-event.batch';
import { DINNER_QUEEN_PARSER } from './parser/type/dinner-queen.parser.interface';
import { CustomLoggerModule } from '../custom-logger/custom-logger.module';
import { SEOULOUBA_PARSER } from './parser/type/seoulouba.parser.interface';
import { SeouloubaParser } from './parser/seoulouba.parser';

@Module({
imports: [HttpModule, PrismaModule, CustomLoggerModule],
providers: [
{ provide: PARSING_EVENT_SERVICE, useClass: ParsingEventService },
{ provide: DINNER_QUEEN_PARSER, useClass: DinnerQueenParser },
{ provide: SEOULOUBA_PARSER, useClass: SeouloubaParser },
ParsingEventWorkerBatch,
],
exports: [PARSING_EVENT_SERVICE],
Expand Down
Loading

0 comments on commit f2ba070

Please sign in to comment.