diff --git a/backend/src/batch/batch.config.ts b/backend/src/batch/batch.config.ts new file mode 100644 index 0000000..39eeadd --- /dev/null +++ b/backend/src/batch/batch.config.ts @@ -0,0 +1,7 @@ +export const TIME_INTERVAL = 3 * 1000; +export const MAX_RETRY = 3; +export const MAX_DEPTH = 1; +export const SEARCH_BATCH_SIZE = 10; +export const DOI_BATCH_SIZE = 40; +export const DOI_REGEXP = new RegExp(/^[\d]{2}\.[\d]{1,}\/.*/); +export const ALLOW_UPDATE = process.env.ALLOW_UPDATE ? (eval(process.env.ALLOW_UPDATE) as boolean) : false; diff --git a/backend/src/batch/batch.queue.ts b/backend/src/batch/batch.queue.ts new file mode 100644 index 0000000..739a133 --- /dev/null +++ b/backend/src/batch/batch.queue.ts @@ -0,0 +1,20 @@ +import Redis from 'ioredis'; + +export class RedisQueue { + constructor(private redis: Redis, public name: string) {} + async push(value: string, pushLeft = false) { + if (pushLeft) { + await this.redis.lpush(this.name, value); + } else { + await this.redis.rpush(this.name, value); + } + + return value; + } + async pop(count = 1) { + return await this.redis.lpop(this.name, count); + } + async size() { + return await this.redis.llen(this.name); + } +} diff --git a/backend/src/batch/batch.service.ts b/backend/src/batch/batch.service.ts new file mode 100644 index 0000000..6c7239a --- /dev/null +++ b/backend/src/batch/batch.service.ts @@ -0,0 +1,60 @@ +import { InjectRedis } from '@liaoliaots/nestjs-redis'; +import { HttpService } from '@nestjs/axios'; +import { Interval } from '@nestjs/schedule'; +import Redis from 'ioredis'; +import { SearchService } from 'src/search/search.service'; +import { TIME_INTERVAL, SEARCH_BATCH_SIZE, DOI_BATCH_SIZE } from './batch.config'; +import { DoiBatcher } from './batcher.doi'; +import { SearchBatcher } from './batcher.search'; + +export class BatchService { + searchBatcher: SearchBatcher; + doiBatcher: DoiBatcher; + constructor( + @InjectRedis() private readonly redis: Redis, + private readonly httpService: HttpService, + private readonly searchService: SearchService, + ) { + this.searchBatcher = new SearchBatcher( + this.redis, + this.httpService.axiosRef, + this.searchService, + 'url batch queue', + ); + this.doiBatcher = new DoiBatcher(this.redis, this.httpService.axiosRef, this.searchService, 'paper batch queue'); + } + keywordToRedisKey(keyword: string) { + return `s:${keyword}`; + } + async keywordExist(keyword: string) { + const key = this.keywordToRedisKey(keyword); + return (await this.redis.ttl(key)) >= 0; + } + async setKeyword(keyword: string) { + if (await this.keywordExist(keyword)) return false; + const key = this.keywordToRedisKey(keyword); + this.redis.set(key, 1); + this.redis.expire(key, 60 * 60 * 24); + this.searchBatcher.pushToQueue(0, 0, -1, true, keyword); + return true; + } + async setDoi(doi: string) { + this.doiBatcher.pushToQueue(0, 0, -1, true, doi); + } + + @Interval(TIME_INTERVAL) + async batchSearchQueue(batchSize = SEARCH_BATCH_SIZE) { + const referencesDoiWithDepth = await this.searchBatcher.runBatch(batchSize); + referencesDoiWithDepth?.forEach((v) => { + this.doiBatcher.pushToQueue(0, v.depth + 1, -1, false, v.doi); + }); + } + + @Interval(TIME_INTERVAL) + async batchDoiQueue(batchSize = DOI_BATCH_SIZE) { + const referencesDoiWithDepth = await this.doiBatcher.runBatch(batchSize); + referencesDoiWithDepth?.forEach((v) => { + this.doiBatcher.pushToQueue(0, v.depth + 1, -1, false, v.doi); + }); + } +} diff --git a/backend/src/batch/batcher.abstract.ts b/backend/src/batch/batcher.abstract.ts new file mode 100644 index 0000000..298de5f --- /dev/null +++ b/backend/src/batch/batcher.abstract.ts @@ -0,0 +1,171 @@ +import { GetGetResult } from '@elastic/elasticsearch/lib/api/types'; +import { Injectable } from '@nestjs/common'; +import { AxiosError, AxiosInstance, AxiosResponse } from 'axios'; +import Redis from 'ioredis'; +import { + CrossRefItem, + CrossRefPaperResponse, + CrossRefResponse, + PaperInfoDetail, + ReferenceInfo, +} from 'src/search/entities/crossRef.entity'; +import { SearchService } from 'src/search/search.service'; +import { ALLOW_UPDATE, MAX_DEPTH } from './batch.config'; +import { RedisQueue } from './batch.queue'; + +export interface QueueItemParsed { + retries: number; + depth: number; + pagesLeft: number; + url: string; +} +export interface UrlParams { + doi?: string; + keyword?: string; + cursor?: string; +} + +type CrossRefAble = CrossRefResponse | CrossRefPaperResponse; + +@Injectable() +export abstract class Batcher { + queue: RedisQueue; + // failedQueue: RedisQueue; + constructor( + private readonly redis: Redis, + private readonly axios: AxiosInstance, + readonly searchService: SearchService, + readonly name: string, + ) { + this.queue = new RedisQueue(this.redis, name); + // this.failedQueue = new RedisQueue(this.redis, name); + } + abstract makeUrl(...params: string[]): string; + abstract getParamsFromUrl(url: string): UrlParams; + abstract onFulfilled( + item: QueueItemParsed, + params: UrlParams, + res: AxiosResponse, + i?: number, + ): { papers: PaperInfoDetail[]; referenceDOIs: string[] }; // paper들의 reference들에 대한 doi 목록 + abstract onRejected(item: QueueItemParsed, params: UrlParams, res?: PromiseRejectedResult, i?: number): any; + abstract validateBatchItem(item: QueueItemParsed): boolean; + + parseQueueItem(value: string) { + const splits = value.split(':'); + return { + retries: parseInt(splits[0]), + depth: parseInt(splits[1]), + pagesLeft: parseInt(splits[2]), + url: splits.slice(3).join(':'), + } as QueueItemParsed; + } + + pushToQueue(retries = 0, depth = 0, page = -1, pushLeft = false, ...params: string[]) { + const url = this.makeUrl(...params); + this.queue.push(`${retries}:${depth}:${page}:${url}`, pushLeft); + } + async batchLog(queue: RedisQueue, batched: string[]) { + const urlQueueSize = await queue.size(); + const batchedSize = batched?.length || 0; + (urlQueueSize || batchedSize) && console.log(`${queue.name} size`, urlQueueSize, ', batch size ', batchedSize); + } + fetchCrossRef(url: string) { + return this.axios.get(url); + } + + async runBatch(batchSize: number) { + const queue = this.queue; + // const failedQueue = this.failedQueue; + const batched = await queue.pop(batchSize); + + // await this.batchLog(queue, batched); + if (!batched) return; + const items = batched.map((item) => this.parseQueueItem(item)).filter((item) => this.validateBatchItem(item)); + const responses = await this.batchRequest(items); + const { papers, doiWithDepth } = this.responsesParser(items, responses); + const bulkPapers = await this.makeBulkIndex(papers); + this.doBulkIndex(bulkPapers); + return doiWithDepth; + } + batchRequest(items: QueueItemParsed[]) { + return Promise.allSettled(items.map((item) => this.fetchCrossRef(item.url))); + } + + responsesParser(items: QueueItemParsed[], responses: PromiseSettledResult>[]) { + return responses + .map((res, i) => { + const params = this.getParamsFromUrl(items[i].url); + if (res.status === 'fulfilled') { + return this.onFulfilled(items[i], params, res.value, i); + } else { + const error = res.reason as AxiosError; + // Resource not found. + if (error.response?.status === 404) return; + + // timeout of 20000ms exceeded + this.onRejected(items[i], params, res, i); + } + }) + .reduce( + (acc, cur, i) => { + if (cur?.papers) Array.prototype.push.apply(acc.papers, cur.papers); + if (cur?.referenceDOIs) { + const doiWithDepth = cur.referenceDOIs.map((doi) => { + return { doi, depth: items[i].depth }; + }); + Array.prototype.push.apply(acc.doiWithDepth, doiWithDepth); + } + return acc; + }, + { papers: [], doiWithDepth: [] as { doi: string; depth: number }[] }, + ); + } + + async makeBulkIndex(papers: PaperInfoDetail[]): Promise { + if (ALLOW_UPDATE) return papers; + const dois = papers.map((paper) => { + return paper.doi; + }); + const { docs } = await this.searchService.multiGet(dois); + const indexes = docs + .map((doc, i) => { + if ((doc as GetGetResult).found) return; + return papers[i]; + }) + .filter(Boolean); + + // console.log(`${this.queue.name} skipped papers:`, papers.length - indexes.length); + return indexes; + } + doBulkIndex(papers: PaperInfoDetail[]) { + return this.searchService.bulkInsert(papers); + } + + getPapersToRequest(item: CrossRefItem, depth: number) { + const hasHope = this.paperHasInformation(item); + const papers: string[] = []; + if (hasHope && depth < MAX_DEPTH) { + if (item.DOI) { + papers.push(item.DOI); + } + item.reference?.forEach((ref) => { + // doi가 있는 reference에 대해 paperQueue에 집어넣는다. + if (this.referenceHasInformation(ref)) { + // return [ref.DOI, 0, depth + 1]; + papers.push(ref.DOI); + } + }); + } + return papers; + } + + paperHasInformation(paper: CrossRefItem) { + // DOI와 제목이 있는 논문만 db에 저장한다. + return paper.DOI && paper.title; + } + + referenceHasInformation(reference: ReferenceInfo) { + return reference['DOI']; + } +} diff --git a/backend/src/batch/batcher.doi.ts b/backend/src/batch/batcher.doi.ts new file mode 100644 index 0000000..2a71f73 --- /dev/null +++ b/backend/src/batch/batcher.doi.ts @@ -0,0 +1,50 @@ +import { AxiosInstance, AxiosResponse } from 'axios'; +import Redis from 'ioredis'; +import { CrossRefPaperResponse, PaperInfoDetail } from 'src/search/entities/crossRef.entity'; +import { SearchService } from 'src/search/search.service'; +import { CROSSREF_API_PAPER_URL } from 'src/util'; +import { DOI_REGEXP, MAX_RETRY } from './batch.config'; +import { Batcher, UrlParams, QueueItemParsed } from './batcher.abstract'; + +export class DoiBatcher extends Batcher { + constructor(redis: Redis, axios: AxiosInstance, searchService: SearchService, name: string) { + super(redis, axios, searchService, name); + } + makeUrl(doi: string) { + return CROSSREF_API_PAPER_URL(doi); + } + getParamsFromUrl(url: string): UrlParams { + const u = new URL(url); + const doi = u.pathname.replace(/\/works\//, ''); + return { doi }; + } + validateBatchItem(item: QueueItemParsed): boolean { + const { doi } = this.getParamsFromUrl(item.url); + // DOI 대문자일 경우 검색 안 되는 경우 발생 + item.url = item.url.toLowerCase(); + return DOI_REGEXP.test(doi); + } + onFulfilled( + item: QueueItemParsed, + params: UrlParams, + res: AxiosResponse, + ): { papers: PaperInfoDetail[]; referenceDOIs: string[] } { + const paper = res.data.message; + const { depth } = item; + const referenceDOIs = this.getPapersToRequest(paper, depth); + const p = this.searchService.parsePaperInfoDetail(paper); + return { papers: [p], referenceDOIs }; + } + + onRejected(item: QueueItemParsed, params: UrlParams) { + const { doi } = params; + if (item.retries + 1 > MAX_RETRY) { + // this.failedQueue.push(item.url); + return; + } + console.log('error', item.url); + item.retries++; + this.pushToQueue(item.retries + 1, item.depth, item.pagesLeft - 1, false, doi); + return; + } +} diff --git a/backend/src/batch/batcher.search.ts b/backend/src/batch/batcher.search.ts new file mode 100644 index 0000000..e30835d --- /dev/null +++ b/backend/src/batch/batcher.search.ts @@ -0,0 +1,57 @@ +import { AxiosInstance, AxiosResponse } from 'axios'; +import Redis from 'ioredis'; +import { CrossRefResponse } from 'src/search/entities/crossRef.entity'; +import { SearchService } from 'src/search/search.service'; +import { CROSSREF_API_URL_CURSOR, MAX_ROWS } from 'src/util'; +import { MAX_RETRY } from './batch.config'; +import { Batcher, QueueItemParsed, UrlParams } from './batcher.abstract'; + +export class SearchBatcher extends Batcher { + constructor(redis: Redis, axios: AxiosInstance, searchService: SearchService, name: string) { + super(redis, axios, searchService, name); + } + + makeUrl(keyword: string, cursor: string) { + return CROSSREF_API_URL_CURSOR(keyword, cursor); + } + + getParamsFromUrl(url: string): UrlParams { + const u = new URL(url); + const params = new URLSearchParams(u.search); + const keyword = params.get('query'); + const cursor = params.get('cursor') || '*'; + return { keyword, cursor }; + } + validateBatchItem(item: QueueItemParsed): boolean { + return true; + } + onFulfilled(item: QueueItemParsed, params: UrlParams, res: AxiosResponse) { + const { cursor: presentCursor, keyword } = params; + if (presentCursor === '*') { + const cursor = res.data.message['next-cursor']; + const maxPage = Math.floor(res.data.message['total-results'] / MAX_ROWS); + this.pushToQueue(0, item.depth + 1, maxPage, true, keyword, cursor); + } else if (item.pagesLeft > 0) { + this.pushToQueue(0, item.depth, item.pagesLeft - 1, false, keyword, presentCursor); + } + const referenceDOIs = res.data.message.items.flatMap((paper) => { + return this.getPapersToRequest(paper, item.depth); + }); + const papers = res.data.message.items.map((paper) => { + return this.searchService.parsePaperInfoDetail(paper); + }); + return { papers, referenceDOIs }; + } + + onRejected(item: QueueItemParsed, params: UrlParams) { + const { keyword, cursor } = params; + if (item.retries + 1 > MAX_RETRY) { + // this.failedQueue.push(item.url); + return; + } + console.log('error', item.url); + item.retries++; + this.pushToQueue(item.retries + 1, item.depth, item.pagesLeft - 1, false, keyword, cursor); + return; + } +} diff --git a/backend/src/search/entities/crossRef.entity.ts b/backend/src/search/entities/crossRef.entity.ts index cadf38d..090c575 100644 --- a/backend/src/search/entities/crossRef.entity.ts +++ b/backend/src/search/entities/crossRef.entity.ts @@ -9,6 +9,9 @@ export class PaperInfo { @ApiProperty() doi?: string; + @ApiProperty() + key?: string; + constructor(body: PaperInfo) { this.title = body.title; this.authors = body.authors; @@ -33,15 +36,15 @@ export class PaperInfoExtended extends PaperInfo { } } export class PaperInfoDetail extends PaperInfoExtended { - @ApiPropertyOptional() - referenceList?: ReferenceInfo[]; + @ApiPropertyOptional({ type: PaperInfoExtended, isArray: true }) + referenceList?: PaperInfoExtended[]; constructor(body: PaperInfoDetail) { super(body); this.referenceList = body.referenceList; } } -export interface ReferenceInfo { +export class ReferenceInfo { issn?: string; 'standards-body'?: string; issue?: string; @@ -66,6 +69,7 @@ export interface ReferenceInfo { } export interface CrossRefResponse { message: { + 'next-cursor'?: string; 'total-results': number; items: CrossRefItem[]; }; @@ -87,5 +91,5 @@ export interface CrossRefItem { reference?: ReferenceInfo[]; } export interface CrossRefPaperResponse { - message: CrossRefItem; + message: CrossRefItem & { 'next-cursor'?: string; 'total-results'?: number }; } diff --git a/backend/src/search/entities/search.dto.ts b/backend/src/search/entities/search.dto.ts index 1579d79..3a402fd 100644 --- a/backend/src/search/entities/search.dto.ts +++ b/backend/src/search/entities/search.dto.ts @@ -1,7 +1,8 @@ /* eslint-disable @typescript-eslint/no-inferrable-types */ import { Transform } from 'class-transformer'; -import { IsOptional, IsPositive, IsString, MinLength } from 'class-validator'; +import { IsOptional, IsPositive, IsString, Matches, MinLength } from 'class-validator'; import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; +import { DOI_REGEXP } from 'src/batch/batch.config'; export class SearchDto { @ApiProperty({ @@ -48,6 +49,7 @@ export class GetPaperDto { example: '10.1234/qwer.asdf', description: '논문의 DOI', }) - @IsString() + @Matches(DOI_REGEXP, { message: 'DOI 형식이 올바르지 않습니다.' }) + @Transform((params) => params.value.toLowerCase()) doi: string; } diff --git a/backend/src/search/search.controller.ts b/backend/src/search/search.controller.ts index f475660..b8a56d9 100644 --- a/backend/src/search/search.controller.ts +++ b/backend/src/search/search.controller.ts @@ -1,52 +1,53 @@ import { Controller, Get, NotFoundException, Query, UsePipes, ValidationPipe } from '@nestjs/common'; import { SearchService } from './search.service'; import { AutoCompleteDto, GetPaperDto, SearchDto } from './entities/search.dto'; -import { SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'; -import { CROSSREF_CACHE_QUEUE } from 'src/util'; -import { Interval } from '@nestjs/schedule'; +import { GetGetResult, SearchTotalHits } from '@elastic/elasticsearch/lib/api/types'; import { RankingService } from 'src/ranking/ranking.service'; -import { ApiResponse, ApiRequestTimeoutResponse, ApiBadRequestResponse } from '@nestjs/swagger'; +import { BatchService } from 'src/batch/batch.service'; +import { ApiResponse, ApiRequestTimeoutResponse, ApiBadRequestResponse, ApiNotFoundResponse } from '@nestjs/swagger'; import { PaperInfo, PaperInfoDetail, PaperInfoExtended } from './entities/crossRef.entity'; @Controller('search') export class SearchController { - constructor(private readonly searchService: SearchService, private readonly rankingService: RankingService) {} + constructor( + private readonly searchService: SearchService, + private readonly rankingService: RankingService, + private readonly batchService: BatchService, + ) {} @ApiResponse({ status: 200, description: '자동검색 성공', type: PaperInfo, isArray: true }) @ApiRequestTimeoutResponse({ description: '검색 timeout' }) - @ApiBadRequestResponse({ status: 400, description: '유효하지 않은 키워드' }) + @ApiBadRequestResponse({ description: '유효하지 않은 키워드' }) + @ApiNotFoundResponse({ description: '검색 결과가 존재하지 않습니다. 정보를 수집중입니다.' }) @Get('auto-complete') @UsePipes(new ValidationPipe({ transform: true })) async getAutoCompletePapers(@Query() query: AutoCompleteDto) { const { keyword } = query; - const elastic = await this.searchService.getElasticSearch(keyword); - const elasticDataCount = (elastic.hits.total as SearchTotalHits).value; - if (elasticDataCount > 0) { - return elastic.hits.hits.map((paper) => paper._source); - } - const selects = ['title', 'author', 'DOI']; - const { items, totalItems } = await this.searchService.getCrossRefData(keyword, 5, 1, selects); - const papers = this.searchService.parseCrossRefData(items, this.searchService.parsePaperInfo); - this.searchService.crawlAllCrossRefData(keyword, totalItems, 1000); + const data = await this.searchService.getElasticSearch(keyword); + const papers = data.hits.hits.map((paper) => new PaperInfo(paper._source)); + if (papers.length === 0) throw new NotFoundException('검색 결과가 존재하지 않습니다. 정보를 수집중입니다.'); return papers; } @ApiResponse({ status: 200, description: '검색 결과', type: PaperInfoExtended, isArray: true }) @ApiRequestTimeoutResponse({ description: '검색 timeout' }) - @ApiBadRequestResponse({ status: 400, description: '유효하지 않은 keyword | rows | page' }) + @ApiBadRequestResponse({ description: '유효하지 않은 keyword | rows | page' }) + @ApiNotFoundResponse({ description: '검색 결과가 존재하지 않습니다. 정보를 수집중입니다.' }) @Get() @UsePipes(new ValidationPipe({ transform: true })) async getPapers(@Query() query: SearchDto) { const { keyword, rows, page } = query; - const selects = ['title', 'author', 'created', 'is-referenced-by-count', 'references-count', 'DOI']; - const { items, totalItems } = await this.searchService.getCrossRefData(keyword, rows, page, selects); + const data = await this.searchService.getElasticSearch(keyword, rows, rows * (page - 1)); + const totalItems = (data.hits.total as SearchTotalHits).value; const totalPages = Math.ceil(totalItems / rows); if (page > totalPages) { throw new NotFoundException(`page(${page})는 ${totalPages} 보다 클 수 없습니다.`); } - const papers = this.searchService.parseCrossRefData(items, this.searchService.parsePaperInfoExtended); this.rankingService.insertRedis(keyword); + this.batchService.setKeyword(keyword); + const papers = data.hits.hits.map((paper) => new PaperInfoExtended(paper._source)); + if (papers.length === 0) throw new NotFoundException('검색 결과가 존재하지 않습니다. 정보를 수집중입니다.'); return { papers, pageInfo: { @@ -55,24 +56,32 @@ export class SearchController { }, }; } - @Interval('notifications', 1000) - handleInterval() { - //ToDo 진짜 queue로 변경해야함 리스트에서 shift를 쓰면 최악의 경우 O(n)발생 우선 pop으로 지정 - if (CROSSREF_CACHE_QUEUE.isEmpty()) return; - else { - const url = CROSSREF_CACHE_QUEUE.pop(); - this.searchService.getCacheFromCrossRef(url); - } - console.log(new Array(...CROSSREF_CACHE_QUEUE.data)); - } @ApiResponse({ status: 200, description: '논문 상세정보 검색 결과', type: PaperInfoDetail }) @ApiRequestTimeoutResponse({ description: '검색 timeout' }) @ApiBadRequestResponse({ description: '유효하지 않은 doi' }) + @ApiNotFoundResponse({ description: '해당 doi는 존재하지 않습니다. 정보를 수집중입니다.' }) @Get('paper') - @UsePipes(new ValidationPipe()) + @UsePipes(new ValidationPipe({ transform: true })) async getPaper(@Query() query: GetPaperDto) { const { doi } = query; - return await this.searchService.getPaper(doi); + + const paper = await this.searchService.getPaper(doi); + if (paper) { + const origin = new PaperInfoDetail(paper._source); + const references = await this.searchService.multiGet(origin.referenceList.map((ref) => ref.key).filter(Boolean)); + references.docs + .filter((doc) => !(doc as GetGetResult).found) + .forEach((doc) => { + this.batchService.doiBatcher.pushToQueue(0, 1, -1, true, doc._id); + }); + const referenceList = references.docs.map((doc) => { + const _source = (doc as GetGetResult)._source; + return { key: doc._id, ..._source }; + }); + return { ...origin, referenceList }; + } + this.batchService.doiBatcher.pushToQueue(0, 1, -1, true, doi); + throw new NotFoundException('해당 doi는 존재하지 않습니다. 정보를 수집중입니다.'); } } diff --git a/backend/src/search/search.module.ts b/backend/src/search/search.module.ts index 3361bcb..c5ac5f4 100644 --- a/backend/src/search/search.module.ts +++ b/backend/src/search/search.module.ts @@ -6,6 +6,7 @@ import { ElasticsearchModule } from '@nestjs/elasticsearch'; import { HttpConnection } from '@elastic/elasticsearch'; import { ScheduleModule } from '@nestjs/schedule'; import { RankingService } from 'src/ranking/ranking.service'; +import { BatchService } from 'src/batch/batch.service'; @Module({ imports: [ HttpModule.register({ @@ -31,6 +32,6 @@ import { RankingService } from 'src/ranking/ranking.service'; ScheduleModule.forRoot(), ], controllers: [SearchController], - providers: [SearchService, RankingService], + providers: [SearchService, RankingService, BatchService], }) export class SearchModule {} diff --git a/backend/src/search/search.service.ts b/backend/src/search/search.service.ts index c7c3847..07cbfc1 100644 --- a/backend/src/search/search.service.ts +++ b/backend/src/search/search.service.ts @@ -1,52 +1,12 @@ -import { Injectable, RequestTimeoutException } from '@nestjs/common'; -import { HttpService } from '@nestjs/axios'; -import { - CrossRefResponse, - CrossRefItem, - PaperInfoExtended, - PaperInfo, - CrossRefPaperResponse, - PaperInfoDetail, -} from './entities/crossRef.entity'; -import { CROSSREF_API_PAPER_URL, CROSSREF_API_URL, CROSSREF_CACHE_QUEUE } from '../util'; +import { Injectable, NotFoundException } from '@nestjs/common'; +import { CrossRefItem, PaperInfoExtended, PaperInfo, PaperInfoDetail } from './entities/crossRef.entity'; import { ElasticsearchService } from '@nestjs/elasticsearch'; -import { SearchHit } from '@elastic/elasticsearch/lib/api/types'; +import { MgetOperation, SearchHit } from '@elastic/elasticsearch/lib/api/types'; @Injectable() export class SearchService { - constructor(private readonly httpService: HttpService, private readonly esService: ElasticsearchService) {} - async getCrossRefData(keyword: string, rows: number, page: number, selects?: string[]) { - const crossRefdata = await this.httpService.axiosRef - .get(CROSSREF_API_URL(keyword, rows, page, selects)) - .catch((err) => { - throw new RequestTimeoutException(err.message); - }); - const items = crossRefdata.data.message.items; - const totalItems = crossRefdata.data.message['total-results']; - return { items, totalItems }; - } - async crawlAllCrossRefData(keyword: string, totalItems: number, rows: number) { - if (totalItems >= 10000) totalItems = 10000; - await Promise.all( - Array(Math.ceil(totalItems / rows)) - .fill(0) - .map((v, i) => { - CROSSREF_CACHE_QUEUE.push( - CROSSREF_API_URL(keyword, rows, i + 1, [ - 'title', - 'author', - 'created', - 'is-referenced-by-count', - 'references-count', - 'DOI', - ]), - ); - }), - ); - } - parseCrossRefData(items: CrossRefItem[], parser: (item: CrossRefItem) => T) { - return items.map(parser).filter((info) => info.title); - } + constructor(private readonly esService: ElasticsearchService) {} + parsePaperInfo = (item: CrossRefItem) => { const data = { title: item.title?.[0], @@ -56,6 +16,7 @@ export class SearchService { return acc; }, []), doi: item.DOI, + key: item.DOI, }; return new PaperInfo(data); @@ -82,11 +43,6 @@ export class SearchService { reference['volume-title'] || reference.unstructured, doi: reference['DOI'], - // TODO: 현재 원하는 정보를 얻기 위해서는 해당 reference에 대한 정보를 crossref에 다시 요청해야함 - author: reference['author'], - publishedAt: reference['year'], - citations: 0, - references: 0, }; }) || []; const data = { @@ -98,8 +54,12 @@ export class SearchService { }; async getPaper(doi: string) { - const item = await this.httpService.axiosRef.get(CROSSREF_API_PAPER_URL(doi)); - return this.parsePaperInfoDetail(item.data.message); + try { + const paper = await this.esService.get({ index: process.env.ELASTIC_INDEX, id: doi }); + return paper; + } catch (_) { + return false; + } } async putElasticSearch(paper: PaperInfoExtended) { return await this.esService.index({ @@ -110,7 +70,7 @@ export class SearchService { }, }); } - async getElasticSearch(keyword: string, size = 5) { + async getElasticSearch(keyword: string, size = 5, from = 0) { const query = { bool: { should: [ @@ -132,23 +92,35 @@ export class SearchService { }, }; return await this.esService - .search({ + .search({ index: process.env.ELASTIC_INDEX, + from, size, query, }) .catch(() => { - return { hits: { hits: [] as SearchHit[], total: 0 } }; + return { hits: { hits: [] as SearchHit[], total: 0 } }; }); } - async getCacheFromCrossRef(url: string) { - try { - const crossRefdata = await this.httpService.axiosRef.get(url); - const items = crossRefdata.data.message.items; - const papers = this.parseCrossRefData(items, this.parsePaperInfoExtended); - papers.map((paper) => { - this.putElasticSearch(paper); - }); - } catch (error) {} + + async bulkInsert(papers: PaperInfoDetail[]) { + const dataset = papers.map((paper) => { + return { id: paper.doi, ...paper }; + }); + if (dataset.length <= 0) return; + const operations = dataset.flatMap((doc) => [{ index: { _index: process.env.ELASTIC_INDEX, _id: doc.id } }, doc]); + const bulkResponse = await this.esService.bulk({ refresh: true, operations }); + // console.log(`bulk insert response : ${bulkResponse.items.length}`); + } + async multiGet(ids: string[]) { + if (ids.length === 0) return { docs: [] }; + const docs: MgetOperation[] = ids.map((id) => { + return { + _index: process.env.ELASTIC_INDEX, + _id: id, + _source: { include: ['key', 'title', 'authors', 'doi', 'publishedAt', 'citations', 'references'] }, + }; + }); + return await this.esService.mget({ docs }); } } diff --git a/backend/src/search/tests/search.controller.spec.ts b/backend/src/search/tests/search.controller.spec.ts index d08f477..63947b4 100644 --- a/backend/src/search/tests/search.controller.spec.ts +++ b/backend/src/search/tests/search.controller.spec.ts @@ -4,10 +4,11 @@ import { HttpService } from '@nestjs/axios'; import { PaperInfo, PaperInfoDetail, PaperInfoExtended } from '../entities/crossRef.entity'; import { Test, TestingModule } from '@nestjs/testing'; import { ElasticsearchService } from '@nestjs/elasticsearch'; -import { mockElasticService, mockHttpService, mockRankingService } from './search.service.mock'; +import { mockBatchService, mockElasticService, mockHttpService, mockRankingService } from './search.service.mock'; import { RankingService } from '../../ranking/ranking.service'; import { INestApplication, ValidationPipe } from '@nestjs/common'; import * as request from 'supertest'; +import { BatchService } from 'src/batch/batch.service'; describe('SearchController', () => { let controller: SearchController; @@ -15,13 +16,14 @@ describe('SearchController', () => { let app: INestApplication; let spyGetElasticSearch: jest.SpyInstance; - let spyGetCrossRefData: jest.SpyInstance; const keyword = 'coffee'; beforeEach(async () => { const httpService = mockHttpService(); const rankingService = mockRankingService(); const elasticService = mockElasticService(); + const batchService = mockBatchService(); + const module: TestingModule = await Test.createTestingModule({ controllers: [SearchController], providers: [ @@ -38,12 +40,16 @@ describe('SearchController', () => { provide: ElasticsearchService, useValue: elasticService, }, + { + provide: BatchService, + useValue: batchService, + }, ], }).compile(); controller = module.get(SearchController); service = module.get(SearchService); spyGetElasticSearch = jest.spyOn(service, 'getElasticSearch'); - spyGetCrossRefData = jest.spyOn(service, 'getCrossRefData'); + // spyGetCrossRefData = jest.spyOn(service, 'getCrossRefData'); app = module.createNestApplication(); app.useGlobalPipes(new ValidationPipe()); @@ -68,7 +74,6 @@ describe('SearchController', () => { }); expect(spyGetElasticSearch).toBeCalledTimes(2); - expect(spyGetCrossRefData).toBeCalledTimes(1); }); it('keyword 미포함시 error - GET /search/auto-complete?keyword=', () => { const url = (keyword: string) => `/search/auto-complete?keyword=${keyword}`; @@ -86,9 +91,7 @@ describe('SearchController', () => { items.forEach((item) => { expect(item).toBeInstanceOf(PaperInfoExtended); }); - // TODO: elasticsearch로 검색? - expect(spyGetElasticSearch).toBeCalledTimes(0); - expect(spyGetCrossRefData).toBeCalledTimes(1); + expect(spyGetElasticSearch).toBeCalledTimes(1); }); it('keyword 미포함시 error - GET /search?keyword=', () => { const url = (keyword: string) => `/search?keyword=${keyword}`; @@ -121,9 +124,9 @@ describe('SearchController', () => { it(`getPaper - doi=10.1234/some_doi 일 때 PaperInfoDetail을 return`, async () => { const doi = '10.1234/some_doi'; const paper = await controller.getPaper({ doi }); - expect(paper.references).toBe(5); - expect(paper.referenceList.length).toBe(5); - expect(paper).toBeInstanceOf(PaperInfoDetail); + expect(paper.references).toBe(10); + expect(paper.referenceList.length).toBe(10); + expect(() => new PaperInfoDetail(paper)).not.toThrow(); }); it('doi가 입력되지 않을 경우 error - GET /search/paper?doi=', () => { const url = (keyword: string) => `/search/paper?doi=${keyword}`; diff --git a/backend/src/search/tests/search.service.mock.ts b/backend/src/search/tests/search.service.mock.ts index 356617b..667d838 100644 --- a/backend/src/search/tests/search.service.mock.ts +++ b/backend/src/search/tests/search.service.mock.ts @@ -1,7 +1,8 @@ import mockCrossRefData from './crossref.mock'; import mockSearchData from './searchdata.mock'; import { HttpService } from '@nestjs/axios'; -import { CrossRefPaperResponse, CrossRefResponse, PaperInfo } from '../entities/crossRef.entity'; +import { CrossRefPaperResponse, CrossRefResponse, PaperInfo, PaperInfoDetail } from '../entities/crossRef.entity'; +import { DoiBatcher } from 'src/batch/batcher.doi'; export function mockHttpService() { const httpService = new HttpService(); @@ -36,30 +37,37 @@ export function mockElasticService() { const index = jest.fn().mockResolvedValue(() => { return true; }); - const search = jest.fn(); - search - .mockResolvedValueOnce({ + const search = jest.fn().mockImplementation(({ size }) => { + return Promise.resolve({ hits: { total: { - value: 0, - }, - }, - }) - .mockResolvedValue({ - hits: { - total: { - value: 222, + value: 28810, }, hits: mockSearchData - .map((data) => { + .map((data, key) => { return { - _source: new PaperInfo(data), + _source: { ...new PaperInfo(data), key }, }; }) - .slice(0, 5), + .slice(0, size), }, }); - const elasticService = { index, search }; + }); + const get = jest.fn().mockResolvedValue({ + _source: { + ...mockSearchData[0], + referenceList: Array.from({ length: mockSearchData[0].references || 0 }, (_, i) => { + return { key: mockSearchData[i].doi }; + }), + }, + }); + const mget = jest.fn().mockResolvedValue({ + docs: Array.from({ length: mockSearchData[0].references || 0 }, (_, i) => { + const data = mockSearchData[i]; + return { _id: data.doi, found: Math.random() > 0.5, _source: data }; + }), + }); + const elasticService = { index, search, get, mget }; return elasticService; } @@ -70,3 +78,14 @@ export function mockRankingService() { const rankingService = { insertRedis }; return rankingService; } + +export function mockBatchService() { + const setKeyword = jest.fn().mockResolvedValue(() => { + return true; + }); + const doiBatcher = { + pushToQueue: jest.fn(), + }; + const batchService = { setKeyword, doiBatcher }; + return batchService; +} diff --git a/backend/src/search/tests/searchdata.mock.ts b/backend/src/search/tests/searchdata.mock.ts index 8a38a6c..ecf9953 100644 --- a/backend/src/search/tests/searchdata.mock.ts +++ b/backend/src/search/tests/searchdata.mock.ts @@ -7,7 +7,7 @@ export default [ doi: '10.1021/acs.iecr.9b00714.s001', publishedAt: '2020-04-09T15:47:35Z', citations: 0, - references: 0, + references: 10, }, { title: diff --git a/backend/src/util.ts b/backend/src/util.ts index 084b99e..5237335 100644 --- a/backend/src/util.ts +++ b/backend/src/util.ts @@ -1,18 +1,29 @@ const BASE_URL = 'https://api.crossref.org/works'; export const CROSSREF_API_URL = (keyword: string, rows = 5, page = 1, selects: string[] = ['author', 'title', 'DOI']) => - `${BASE_URL}?query=${keyword}&rows=${rows}&select=${selects.join(',')}&offset=${rows * (page - 1)}`; + `${BASE_URL}?query=${keyword}&rows=${rows}&select=${selects.join(',')}&offset=${ + rows * (page - 1) + }&sort=is-referenced-by-count`; + +export const MAX_ROWS = 1000; +export const CROSSREF_API_URL_CURSOR = ( + keyword: string, + cursor = '*', + rows = MAX_ROWS, + selects: string[] = ['title', 'author', 'created', 'is-referenced-by-count', 'references-count', 'DOI', 'reference'], +) => + `${BASE_URL}?query=${keyword}&rows=${rows}&select=${selects.join(',')}&sort=is-referenced-by-count&cursor=${cursor}`; export const CROSSREF_API_PAPER_URL = (doi: string) => `${BASE_URL}/${doi}`; -class Queue { - data: Set; +export class Queue { + data: Set; constructor() { this.data = new Set(); } pop() { - const firstValue = this.data[Symbol.iterator]().next().value; + const firstValue = this.data[Symbol.iterator]().next().value as T; this.data.delete(firstValue); return firstValue; } - push(value: string) { + push(value: T) { if (!this.data.has(value)) this.data.add(value); } isEmpty() {