Skip to content

Commit

Permalink
Merge pull request #3 from OUCC/feat/#1-2
Browse files Browse the repository at this point in the history
#1-2 ScraperSelectorを追加
  • Loading branch information
miyaji255 authored Mar 2, 2024
2 parents c9dbc54 + 7320070 commit d2b3d40
Show file tree
Hide file tree
Showing 12 changed files with 178 additions and 128 deletions.
6 changes: 6 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/IFileExtensionService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Resolves the media (MIME) type of an image from its file name.
/// </summary>
public interface IFileExtensionService
{
/// <summary>
/// Returns the image media type that corresponds to <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">File name (or path) whose image media type is requested.</param>
/// <returns>
/// The media type string, e.g. <c>image/png</c>.
/// NOTE(review): the FileExtensionService implementation returns <see cref="string.Empty"/>
/// for extensions it does not recognize; confirm whether that is part of this contract.
/// </returns>
public string GetImagesMediaType(string fileName);
}
11 changes: 11 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/IScraperSelectorService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
using KoeBook.Epub.Models;

namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Performs scraping and creates an <see cref="EpubDocument"/>.
/// </summary>
public interface IScraperSelectorService
{
/// <summary>
/// Scrapes the page at <paramref name="url"/> and builds an <see cref="EpubDocument"/> from it.
/// </summary>
/// <param name="url">URL of the page to scrape.</param>
/// <param name="coverFillePath">Path of the cover file (presumably the cover image; confirm with callers).</param>
/// <param name="tempDirectory">Working directory handed to the scraper (the scraping services name this parameter imageDirectory).</param>
/// <param name="id">Identifier passed through to the scraper (presumably the identifier of the resulting document; confirm).</param>
/// <param name="ct">Token used to cancel the operation.</param>
/// <returns>The document produced by scraping.</returns>
public ValueTask<EpubDocument> ScrapingAsync(string url, string coverFillePath, string tempDirectory, Guid id, CancellationToken ct);
}
4 changes: 2 additions & 2 deletions Epub/KoeBook.Epub/Contracts/Services/IScrapingService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

namespace KoeBook.Epub.Contracts.Services;

public interface IScrapingService
public interface IScrapingService : IScraperSelectorService
{
public Task<EpubDocument> ScrapingAsync(string url, string coverFillePath, string imageDirectory, Guid id, CancellationToken ct);
public bool IsMatchSite(Uri url);
}
27 changes: 27 additions & 0 deletions Epub/KoeBook.Epub/Models/EpubDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,33 @@ public class EpubDocument(string title, string author, string coverFilePath, Gui
];
public List<Chapter> Chapters { get; set; } = [];

/// <summary>
/// Guarantees that <see cref="Chapters"/> holds at least one chapter,
/// appending an untitled chapter when the list is empty.
/// </summary>
internal void EnsureChapter()
{
    if (Chapters.Count != 0)
    {
        return;
    }

    // An untitled placeholder chapter; its Title stays null.
    Chapters.Add(new Chapter { Title = null });
}

/// <summary>
/// Guarantees that the chapter at <paramref name="chapterIndex"/> has at least one section.
/// </summary>
/// <remarks>
/// Only the existence of the first chapter is ensured (via <see cref="EnsureChapter"/>);
/// any other index must already be valid or the list indexer will throw.
/// </remarks>
/// <param name="chapterIndex">Zero-based index into <see cref="Chapters"/>.</param>
internal void EnsureSection(int chapterIndex)
{
    EnsureChapter();

    var chapter = Chapters[chapterIndex];
    if (chapter.Sections.Count > 0)
    {
        return;
    }

    // The new section is titled after the chapter, or after the document when the chapter is untitled.
    chapter.Sections.Add(new Section(chapter.Title ?? Title));
}

/// <summary>
/// Guarantees that the addressed section contains at least one element,
/// appending an empty <see cref="Paragraph"/> when it has none.
/// </summary>
/// <remarks>
/// <see cref="EnsureSection"/> is invoked first, so the chapter is given a section if it had none;
/// <paramref name="sectionIndex"/> itself is not validated here.
/// </remarks>
/// <param name="chapterIndex">Zero-based index into <see cref="Chapters"/>.</param>
/// <param name="sectionIndex">Zero-based index of the section within that chapter.</param>
internal void EnsureParagraph(int chapterIndex, int sectionIndex)
{
    EnsureSection(chapterIndex);

    var elements = Chapters[chapterIndex].Sections[sectionIndex].Elements;
    if (elements.Count != 0)
    {
        return;
    }

    elements.Add(new Paragraph());
}

public string CreateNavXhtml()
{
var builder = new StringBuilder($"""
Expand Down
90 changes: 0 additions & 90 deletions Epub/KoeBook.Epub/ScrapingHelper.cs

This file was deleted.

19 changes: 19 additions & 0 deletions Epub/KoeBook.Epub/Services/FileExtensionService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using KoeBook.Epub.Contracts.Services;

namespace KoeBook.Epub.Services;

/// <summary>
/// <see cref="IFileExtensionService"/> implementation that derives an image media type
/// from the extension of a file name.
/// </summary>
public class FileExtensionService : IFileExtensionService
{
    /// <summary>
    /// Returns the image media type matching the extension of <paramref name="fileName"/>.
    /// </summary>
    /// <param name="fileName">File name or path; only its extension is inspected.</param>
    /// <returns>
    /// One of <c>image/gif</c>, <c>image/jpeg</c>, <c>image/png</c>, <c>image/svg+xml</c> or
    /// <c>image/webp</c>; <see cref="string.Empty"/> when the extension is missing or not recognized.
    /// </returns>
    public string GetImagesMediaType(string fileName)
    {
        // Extensions are matched case-insensitively so that "cover.PNG" resolves like "cover.png".
        // The null-conditional keeps the previous null-in -> empty-out behavior
        // (Path.GetExtension returns null for a null path).
        return Path.GetExtension(fileName)?.ToLowerInvariant() switch
        {
            ".gif" => "image/gif",
            ".jpg" or ".jpeg" => "image/jpeg",
            ".png" => "image/png",
            ".svg" => "image/svg+xml",
            ".webp" => "image/webp",
            _ => string.Empty,
        };
    }
}
23 changes: 23 additions & 0 deletions Epub/KoeBook.Epub/Services/ScraperSelectorService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using System.Collections.Immutable;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;

namespace KoeBook.Epub.Services;

/// <summary>
/// Dispatches a scraping request to the first registered <see cref="IScrapingService"/>
/// that reports it can handle the requested URL.
/// </summary>
public class ScraperSelectorService(IEnumerable<IScrapingService> scrapingServices) : IScraperSelectorService
{
    // Snapshot of the injected services, taken once at construction.
    private readonly ImmutableArray<IScrapingService> _scrapingServices = scrapingServices.ToImmutableArray();

    /// <summary>
    /// Scrapes <paramref name="url"/> with the first service whose <c>IsMatchSite</c> accepts it.
    /// </summary>
    /// <param name="url">URL of the page to scrape; parsed with <see cref="Uri"/> before matching.</param>
    /// <param name="coverFillePath">Cover file path, forwarded unchanged to the selected service.</param>
    /// <param name="tempDirectory">Working directory, forwarded unchanged to the selected service.</param>
    /// <param name="id">Identifier, forwarded unchanged to the selected service.</param>
    /// <param name="ct">Cancellation token, forwarded unchanged to the selected service.</param>
    /// <returns>The document produced by the selected service.</returns>
    /// <exception cref="ArgumentException">No registered service matches the URL.</exception>
    public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFillePath, string tempDirectory, Guid id, CancellationToken ct)
    {
        var target = new Uri(url);

        // Services are probed in registration order; the first match wins.
        for (var i = 0; i < _scrapingServices.Length; i++)
        {
            var candidate = _scrapingServices[i];
            if (!candidate.IsMatchSite(target))
            {
                continue;
            }

            return await candidate.ScrapingAsync(url, coverFillePath, tempDirectory, id, ct);
        }

        throw new ArgumentException("対応するURLではありません");
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,25 @@
using AngleSharp.Io;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using static KoeBook.Epub.ScrapingHelper;
using static KoeBook.Epub.Utility.ScrapingHelper;


namespace KoeBook.Epub.Services
{
public partial class ScrapingAozora : IScrapingService
public partial class ScrapingAozoraService : IScrapingService
{
private int chapterNum;
private int sectionNum;
private bool chapterExist = false;
private bool sectionExist = false;

public bool IsMatchSite(Uri uri)
{
return uri.Host == "www.aozora.gr.jp";
}

public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct)
public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct)
{
var chapterNum = 0;
var sectionNum = 0;
var chapterExist = false;
var sectionExist = false;

var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);
Expand Down Expand Up @@ -61,7 +65,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
}
if ((MidashiId - previousMidashiId) == 10)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Add(new Section(TextProcess(midashi)));
sectionExist = true;
}
Expand Down Expand Up @@ -97,7 +101,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (previous == true)
{
checkSection(document, chapterNum);
document.EnsureSection(chapterNum);
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
}
Expand Down Expand Up @@ -156,12 +160,12 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
}
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
paragraph.Text += TextProcess(midashi);
Expand All @@ -183,7 +187,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
if (element.ClassName == "caption")
{
// https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
Expand Down Expand Up @@ -216,12 +220,12 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
}
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
Expand Down Expand Up @@ -253,7 +257,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
Expand All @@ -274,7 +278,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false);
var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1"));
File.WriteAllBytes(filePass, ms.ToArray());
checkSection(document, chapterNum);
document.EnsureSection(chapterNum);
if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1)
{
document.Chapters[chapterNum].Sections[sectionNum].Elements.Insert(document.Chapters[chapterNum].Sections[sectionNum].Elements.Count - 1, new Picture(filePass));
Expand All @@ -283,7 +287,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
}
if (img.AlternativeText != null)
{
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
paragraph.Text += TextReplace(img.AlternativeText);
Expand Down Expand Up @@ -332,7 +336,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
case "[#ページの左右中央]":
break;
default:
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
Expand Down Expand Up @@ -362,12 +366,12 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
}
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextProcess(element));
Expand Down Expand Up @@ -402,12 +406,12 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
}
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
paragraph.Text += TextProcess(element);
Expand Down Expand Up @@ -450,12 +454,12 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
{
if (sectionExist)
{
checkChapter(document);
document.EnsureChapter();
document.Chapters[^1].Sections.Insert(0, new Section("___"));
}
sectionNum++;
}
checkParagraph(document, chapterNum, sectionNum);
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
var split = SplitBrace(TextReplace(nextNode.Text()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
using AngleSharp.Io;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using static KoeBook.Epub.ScrapingHelper;
using static KoeBook.Epub.Utility.ScrapingHelper;

namespace KoeBook.Epub.Services
{
public partial class ScrapingNarouService : IScrapingService
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory) : IScrapingService
{
public ScrapingNarouService(IHttpClientFactory httpClientFactory)
private readonly IHttpClientFactory _httpCliantFactory = httpClientFactory;

public bool IsMatchSite(Uri uri)
{
_httpCliantFactory = httpClientFactory;
return uri.Host == "ncode.syosetu.com";
}

private readonly IHttpClientFactory _httpCliantFactory;

public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct)
public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct)
{
var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
Expand Down
Loading

0 comments on commit d2b3d40

Please sign in to comment.