Skip to content

Commit

Permalink
Merge pull request #21 from OUCC/feat/#13
Browse files Browse the repository at this point in the history
splitBraceに渡すテキストが不適切に分割される問題の修正
  • Loading branch information
miyaji255 authored Mar 30, 2024
2 parents f84e751 + c5ed389 commit ec1e0a7
Show file tree
Hide file tree
Showing 11 changed files with 287 additions and 78 deletions.
8 changes: 8 additions & 0 deletions Epub/KoeBook.Epub/Contracts/Services/ISplitBraceService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
namespace KoeBook.Epub.Contracts.Services;

/// <summary>
/// Service that splits text into a sequence of string segments.
/// </summary>
/// <remarks>
/// NOTE(review): the SplitBraceService implementation cuts segments at the
/// Japanese quotation brackets 「」 and 『』; confirm whether that is part of
/// this contract or only an implementation detail.
/// </remarks>
public interface ISplitBraceService
{
/// <summary>
/// Splits a single string into segments.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>The segments produced from <paramref name="text"/>.</returns>
IEnumerable<string> SplitBrace(string text);

/// <summary>
/// Splits several strings into segments.
/// </summary>
/// <param name="texts">The texts to split.</param>
/// <returns>
/// The segments produced from <paramref name="texts"/>.
/// NOTE(review): the SplitBraceService implementation flattens the per-string
/// results into one sequence; confirm that other implementations must do the same.
/// </returns>
IEnumerable<string> SplitBrace(IEnumerable<string> texts);
}
95 changes: 48 additions & 47 deletions Epub/KoeBook.Epub/Services/ScrapingAozoraService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;


namespace KoeBook.Epub.Services
{
public partial class ScrapingAozoraService([FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingAozoraService(ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingAozoraService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;


public bool IsMatchSite(Uri uri)
{
return uri.Host == "www.aozora.gr.jp";
Expand Down Expand Up @@ -175,7 +176,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
paragraph.Text += TextProcess(midashi);
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());

foreach (var splitText in SplitBrace(TextProcess(midashi)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(midashi)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand All @@ -192,20 +193,21 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
{
// https://www.aozora.gr.jp/annotation/graphics.html#:~:text=%3Cdiv%20class%3D%22caption%22%3E を処理するための部分
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;

foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph() { Text = text });
}
}
}
Expand All @@ -232,7 +234,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -343,7 +345,7 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
{
foreach (var splitText in SplitBrace(TextProcess(element)))
foreach (var splitText in _splitBraceService.SplitBrace(TextProcess(element)))
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
{
Expand Down Expand Up @@ -375,21 +377,22 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
}
sectionNum++;
}

document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
// 想定していない構造が見つかったことをログに出力した方が良い?
Expand All @@ -416,22 +419,20 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
sectionNum++;
}
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
paragraph.Text += TextProcess(element);

var split = SplitBrace(TextProcess(element));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextProcess(element));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
// 想定していない構造が見つかったことをログに出力した方が良い?
Expand Down Expand Up @@ -464,20 +465,20 @@ public async ValueTask<EpubDocument> ScrapingAsync(string url, string coverFileP
sectionNum++;
}
document.EnsureParagraph(chapterNum, sectionNum);
if ((document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph))
var focusElements = document.Chapters[chapterNum].Sections[sectionNum].Elements;
if (focusElements[^1] is Paragraph paragraph)
{
var split = SplitBrace(TextReplace(nextNode.Text()));
for (int i = 0; i < split.Count - 1; i++)
var splitted = _splitBraceService.SplitBrace(TextReplace(nextNode.Text()));
var first = true;
foreach (var text in splitted)
{
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph1)
if (first)
{
paragraph1.Text += split[i];
paragraph.Text += text;
first = false;
}
document.Chapters[chapterNum].Sections[sectionNum].Elements.Add(new Paragraph());
}
if (document.Chapters[chapterNum].Sections[sectionNum].Elements[^1] is Paragraph paragraph2)
{
paragraph2.Text += split[^1];
else
focusElements.Add(new Paragraph { Text = text });
}
}
}
Expand Down
39 changes: 22 additions & 17 deletions Epub/KoeBook.Epub/Services/ScrapingNaroService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
using AngleSharp.Html.Dom;
using AngleSharp.Io;
using KoeBook.Core;
using KoeBook.Core.Utilities;
using KoeBook.Epub.Contracts.Services;
using KoeBook.Epub.Models;
using Microsoft.Extensions.DependencyInjection;
using static KoeBook.Epub.Utility.ScrapingHelper;

namespace KoeBook.Epub.Services
{
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
public partial class ScrapingNaroService(IHttpClientFactory httpClientFactory, ISplitBraceService splitBraceService, [FromKeyedServices(nameof(ScrapingNaroService))] IScrapingClientService scrapingClientService) : IScrapingService
{
private readonly IHttpClientFactory _httpCliantFactory = httpClientFactory;
private readonly ISplitBraceService _splitBraceService = splitBraceService;
private readonly IScrapingClientService _scrapingClientService = scrapingClientService;

public bool IsMatchSite(Uri uri)
Expand Down Expand Up @@ -135,8 +136,10 @@ public record BookInfo(int? allcount, int? noveltype, int? general_all_no);

private record SectionWithChapterTitle(string? title, Section section);

private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
private async ValueTask<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
{
var lineBuilder = new SplittedLineBuilder();

var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);
Expand Down Expand Up @@ -171,7 +174,6 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

var section = new Section(sectionTitleElement.InnerHtml);


var main_text = doc.QuerySelector("#novel_honbun")
?? throw new EbookException(ExceptionType.WebScrapingFailed, "本文がありません");

Expand All @@ -184,10 +186,7 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
else if (item.ChildElementCount == 1)
Expand Down Expand Up @@ -221,13 +220,17 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
else if (item.Children[0] is not IHtmlBreakRowElement)
else if (item.Children[0] is IHtmlBreakRowElement)
{
foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
else
throw new EbookException(ExceptionType.UnexpectedStructure);
}
else
Expand All @@ -247,16 +250,18 @@ private static async Task<SectionWithChapterTitle> ReadPageAsync(string url, boo

if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
lineBuilder.Append(item.InnerHtml);
}
}
foreach (var split in _splitBraceService.SplitBrace(lineBuilder.ToLinesAndClear()))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
return new SectionWithChapterTitle(chapterTitle, section);
}


[System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")]
private static partial System.Text.RegularExpressions.Regex UrlToNcode();

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
namespace KoeBook.Epub.Utility;
using KoeBook.Epub.Contracts.Services;

public static class ScrapingHelper
namespace KoeBook.Epub.Services;

public class SplitBraceService : ISplitBraceService
{
public static List<string> SplitBrace(string text)
public IEnumerable<string> SplitBrace(string text)
{
if (text.Length == 1 && (text == "「" || text == "『" || text == "」" || text == "』"))
return [text];
// textが空白だった時 paragraph を挿入する処理をスキップ
if (string.IsNullOrWhiteSpace(text))
yield break;

if (text.Length == 1)
{
yield return text;
yield break;
}

var bracket = 0;
var brackets = new int[text.Length];
Expand All @@ -17,28 +26,33 @@ public static List<string> SplitBrace(string text)
brackets[i] = bracket;
}

var result = new List<string>();
var mn = Math.Min(0, brackets.Min());
var startIdx = 0;
for (var i = 0; i < brackets.Length; i++)
{
brackets[i] -= mn;
if ((text[i] == '「' || text[i] == '『') && brackets[i] == 1 && i != 0 && startIdx != i)
{
result.Add(text[startIdx..i]);
yield return text[startIdx..i];
startIdx = i;
}
if ((text[i] == '」' || text[i] == '』') && brackets[i] == 0)
{
result.Add(text[startIdx..(i + 1)]);
yield return text[startIdx..(i + 1)];
startIdx = i + 1;
}
}
if (startIdx != text.Length)
{
result.Add(text[startIdx..]);
yield return text[startIdx..];
}
}

return result;
/// <summary>
/// Splits every string in <paramref name="texts"/> with the single-string
/// overload and flattens the per-string results into one sequence.
/// </summary>
/// <param name="texts">The strings to split.</param>
/// <returns>All segments of all input strings as a single flat sequence.</returns>
public IEnumerable<string> SplitBrace(IEnumerable<string> texts)
    => texts.SelectMany(text => SplitBrace(text));
}
23 changes: 23 additions & 0 deletions KoeBook.Core/Utilities/EnumerableEx.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
namespace KoeBook.Core.Utilities;

/// <summary>
/// Extension methods for <see cref="IEnumerable{T}"/> sequences.
/// </summary>
public static class EnumerableEx
{
    /// <summary>
    /// Pairs every element of <paramref name="source"/> with two flags telling
    /// whether it is the first and/or the last element of the sequence.
    /// </summary>
    /// <typeparam name="TSource">Element type of the sequence.</typeparam>
    /// <param name="source">The sequence to annotate. It is enumerated once, lazily.</param>
    /// <returns>
    /// A lazily evaluated sequence of <c>(value, isFirst, isLast)</c> tuples in source order.
    /// An empty source yields nothing; a single element yields both flags set to <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
    public static IEnumerable<(TSource value, bool isFirst, bool isLast)> WithPosition<TSource>(this IEnumerable<TSource> source)
    {
        // Validate eagerly: an iterator body does not run until the first MoveNext,
        // so a check placed inside it would only fire far away from the faulty call.
        ArgumentNullException.ThrowIfNull(source);
        return Iterate(source);

        static IEnumerable<(TSource value, bool isFirst, bool isLast)> Iterate(IEnumerable<TSource> items)
        {
            using var enumerator = items.GetEnumerator();

            if (!enumerator.MoveNext())
            {
                yield break;
            }

            // Read one element ahead: an element is known to be the last one only
            // after the following MoveNext has returned false.
            var isFirst = true;
            bool hasNext;
            do
            {
                var current = enumerator.Current;
                hasNext = enumerator.MoveNext();
                yield return (current, isFirst, !hasNext);
                isFirst = false;
            }
            while (hasNext);
        }
    }
}
Loading

0 comments on commit ec1e0a7

Please sign in to comment.