refactor: modify the crawling logic to limit the number of articles crawled per category
yeonjy committed Apr 2, 2024
1 parent e826bce commit 73fa419
Showing 1 changed file with 22 additions and 5 deletions.
@@ -10,6 +10,7 @@
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
@@ -25,6 +26,9 @@ public class NewsCrawlingService {
     private static final String CRON = "0 0 6,12 * * *";
     private static final String ZONE = "Asia/Seoul";
 
+    @Value("${crawling.quantity}")
+    private int crawlingQuantity;
+
     private final NewsService newsService;
 
     @Transactional
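
Note: the @Value binding added above requires a crawling.quantity entry in the application's external configuration (application.yml or application.properties); if the key is missing, the Spring context will fail to start. A minimal sketch of the same field with a fallback default, assuming a cap of 20 (the actual configured value is not shown in this commit):

    // Sketch only: identical to the field added in the diff, but with a fallback
    // default applied when crawling.quantity is not defined in configuration.
    // The default value of 20 is an assumption, not taken from the repository.
    @Value("${crawling.quantity:20}")
    private int crawlingQuantity;
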
@@ -34,19 +38,32 @@ public void scrap() throws IOException {
             String categoryUrl = MAIN_URL + category.getNum();
             String categoryName = category.getName();
 
-            scrapNewsUrls(categoryUrl);
+            scrapCategoryNews(categoryUrl);
             for (final News news : newsService.getNotCrawled()) {
-                scrapNewsContentsAndUpdate(categoryName, news);
+                Document doc = Jsoup.connect(news.getUrl()).get();
+                String title = scrapTitle(doc);
+                String content = scrapContent(doc);
+                String postDate = scrapPostDate(doc);
+
+                news.addNewsBody(title, content, categoryName, postDate);
             }
         }
         newsService.summarizeNewsContent();
     }
 
-    private void scrapNewsUrls(String categoryUrl) throws IOException {
+    private void scrapCategoryNews(String categoryUrl) throws IOException {
         Document doc = Jsoup.connect(categoryUrl).get();
-        Elements newsList = doc.select(".sa_list");
+        Elements newsList = doc.select(".sa_list").select("li");
+        if (newsList.size() < crawlingQuantity) {
+            scrapNewsUrl(newsList.size(), newsList);
+            return;
+        }
+        scrapNewsUrl(crawlingQuantity, newsList);
+    }
 
-        for (Element news : newsList.select("li")) {
+    private void scrapNewsUrl(int quantity, Elements newsList) {
+        for (int i = 0; i < quantity; i++) {
+            Element news = newsList.get(i);
             String thumbnailUrl = scrapThumbnailUrl(news);
             String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href");
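
For reference, the new size check reads as "crawl at most crawlingQuantity items per category, or fewer if the category page lists fewer". A minimal sketch of the same flow, assuming the surrounding NewsCrawlingService class, fields, selectors, and helper names shown in the diff; the Math.min form is an equivalent alternative to the committed if/return branch, not the code actually committed:

    // Sketch only: caps the per-category crawl at crawlingQuantity items.
    private void scrapCategoryNews(String categoryUrl) throws IOException {
        Document doc = Jsoup.connect(categoryUrl).get();
        Elements newsList = doc.select(".sa_list").select("li");
        // Never read past the end of the list, never exceed the configured cap.
        scrapNewsUrl(Math.min(newsList.size(), crawlingQuantity), newsList);
    }

    private void scrapNewsUrl(int quantity, Elements newsList) {
        for (int i = 0; i < quantity; i++) {
            Element news = newsList.get(i);
            String thumbnailUrl = scrapThumbnailUrl(news);
            String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href");
            // ...remaining per-article handling as in the original method
        }
    }
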

