-
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #54 from ernado-x/content-loaders
Version 2.0.2
- Loading branch information
Showing
29 changed files
with
716 additions
and
360 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,85 +1,24 @@ | ||
using System; | ||
using System.Collections.Concurrent; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
|
||
namespace X.Web.MetaExtractor.Example; | ||
|
||
/// <summary>
/// Console sample that extracts metadata from a fixed list of pages concurrently
/// and prints the title and description of every page that was extracted successfully.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point: builds the list of links, starts one extraction per link,
    /// waits for all of them, prints the collected results and waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static async Task Main(string[] args)
    {
        Console.Clear();

        IReadOnlyCollection<Uri> linksTemplate = new List<Uri>
        {
            new Uri("https://diepresse.com/home/wirtschaft/unternehmen/5399476/TeslaChef-Elon-Musk_Das-AutoGeschaeft-ist-die-Hoelle"),
            new Uri("https://andrew.gubskiy.com/"),
            new Uri("https://devdigest.today/post/458"),
            new Uri("https://blogs.msdn.microsoft.com/dotnet/2018/04/11/announcing-net-core-2-1-preview-2/"),
            new Uri("https://github.com/dotnet/corefx/milestone/12"),
            new Uri("https://stackoverflow.com/questions/49790807/can-net-core-1-1-4-run-net-standard-2"),
            new Uri("https://dotnetcoretutorials.com"),
            new Uri("https://softwareengineering.stackexchange.com/questions/305933/json-api-specification-when-do-i-need-to-return-a-404-not-found"),
            new Uri("https://devdigest.today/post/469"),
            new Uri("https://diepresse.com/home/panorama/wien/5386805/Polizist-attackiert_Parlament-verstaerkt-Bewachung"),
            new Uri("https://www.diepresse.com/5748483/thiem-unterliegt-bei-atp-cup-gegen-den-polen-hurkacz")
        };

        var links = new List<Uri>();

        // The template is expanded once; raise the bound to repeat the whole set.
        for (var i = 0; i < 1; i++)
        {
            links.AddRange(Generate(linksTemplate));
        }

        var extractor = new Extractor();

        // BlockingCollection<T> is IDisposable, so release it when Main exits.
        using var collection = new BlockingCollection<Metadata>();

        await ForEach(links, async uri =>
        {
            Console.WriteLine($"Start extracting {uri}");

            try
            {
                var metadata = await extractor.ExtractAsync(uri);
                collection.Add(metadata);
            }
            catch (Exception ex)
            {
                // Best effort: report the failing link and continue with the others.
                Console.WriteLine($"Url: {uri}. Message: {ex.Message}");
            }

            return true;
        });

        foreach (var m in collection)
        {
            Console.WriteLine($"{m.Title}, {m.Description}");
        }

        Console.Write("OK");
        Console.ReadKey();
    }

    /// <summary>
    /// Creates a copy of every link with a unique <c>cb</c> query parameter appended.
    /// </summary>
    /// <param name="links">Links to copy.</param>
    /// <returns>A new collection with one generated link per input link.</returns>
    private static IReadOnlyCollection<Uri> Generate(IReadOnlyCollection<Uri> links)
    {
        var result = new List<Uri>(links.Count);

        foreach (var link in links)
        {
            var cacheBuster = $"cb={Guid.NewGuid()}";

            // UriBuilder keeps an existing query string or fragment intact instead of
            // blindly concatenating "?cb=..." onto the end of the link text.
            var builder = new UriBuilder(link);
            var existingQuery = builder.Query.TrimStart('?');

            builder.Query = existingQuery.Length == 0
                ? cacheBuster
                : $"{existingQuery}&{cacheBuster}";

            result.Add(builder.Uri);
        }

        return result;
    }

    /// <summary>
    /// Invokes <paramref name="action"/> for every item and returns a task that
    /// completes when all of the resulting tasks have completed.
    /// </summary>
    /// <param name="items">Items to process.</param>
    /// <param name="action">Asynchronous action to run for each item.</param>
    private static Task ForEach<T>(IEnumerable<T> items, Func<T, Task<bool>> action)
    {
        // ToList forces the Select to run now, so every action is started before awaiting.
        var tasks = items.Select(action).ToList();
        return Task.WhenAll(tasks);
    }
}
using X.Web.MetaExtractor; | ||
using X.Web.MetaExtractor.ContentLoaders.Flurl; | ||
using X.Web.MetaExtractor.LanguageDetectors; | ||
|
||
// Collaborators required by the extractor: a page loader, a language detector,
// and the image address passed in as the default.
IPageContentLoader pageLoader = new FlurlPageContentLoader();
ILanguageDetector detector = new LanguageDetector();
const string fallbackImage = "https://example.com/example.jpg";

// Build the extractor from the pieces above.
IExtractor metaExtractor = new Extractor(fallbackImage, pageLoader, detector);

// Fetch and parse the meta information of a single page.
Uri pageAddress = new("https://andrew.gubskiy.com/content/item/about");
var page = await metaExtractor.ExtractAsync(pageAddress);

Console.Clear();

// Print every extracted field on its own line.
Console.WriteLine($"Url: {page.Url}");
Console.WriteLine($"Title: {page.Title}");
Console.WriteLine($"Description: {page.Description}");
Console.WriteLine($"Keywords: {string.Join(", ", page.Keywords)}");
Console.WriteLine($"Image: {page.Images.FirstOrDefault()}");
Console.WriteLine($"Language: {page.Language}");
24 changes: 15 additions & 9 deletions
24
example/X.Web.MetaExtractor.Example/X.Web.MetaExtractor.Example.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,16 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
<PropertyGroup> | ||
<OutputType>Exe</OutputType> | ||
<TargetFramework>net6.0</TargetFramework> | ||
<LangVersion>default</LangVersion> | ||
</PropertyGroup> | ||
<ItemGroup> | ||
<ProjectReference Include="..\..\src\X.Web.MetaExtractor\X.Web.MetaExtractor.csproj" /> | ||
</ItemGroup> | ||
</Project> | ||
|
||
<PropertyGroup> | ||
<OutputType>Exe</OutputType> | ||
<TargetFramework>net8.0</TargetFramework> | ||
<ImplicitUsings>enable</ImplicitUsings> | ||
<Nullable>enable</Nullable> | ||
<LangVersion>default</LangVersion> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\src\X.Web.MetaExtractor.ContentLoaders.Flurl\X.Web.MetaExtractor.ContentLoaders.Flurl.csproj" /> | ||
<ProjectReference Include="..\..\src\X.Web.MetaExtractor\X.Web.MetaExtractor.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
17 changes: 17 additions & 0 deletions
17
src/X.Web.MetaExtractor.ContentLoaders.Flurl/FlurlPageContentLoader.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
using System; | ||
using System.Threading.Tasks; | ||
using Flurl.Http; | ||
using JetBrains.Annotations; | ||
|
||
namespace X.Web.MetaExtractor.ContentLoaders.Flurl; | ||
|
||
/// <summary>
/// <see cref="IPageContentLoader"/> implementation that downloads page content
/// with Flurl's HTTP extensions.
/// </summary>
[PublicAPI]
public class FlurlPageContentLoader : IPageContentLoader
{
    /// <summary>
    /// Downloads the content at <paramref name="uri"/> and returns it as a string.
    /// </summary>
    /// <param name="uri">Absolute address of the page to load.</param>
    /// <returns>The response body as text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
    public async Task<string> LoadPageContentAsync(Uri uri)
    {
        if (uri is null)
        {
            throw new ArgumentNullException(nameof(uri));
        }

        // AbsoluteUri is the escaped form intended for requests; Uri.ToString()
        // returns an unescaped canonical form that can differ from the original address.
        var html = await uri.AbsoluteUri.GetStringAsync().ConfigureAwait(false);

        return html;
    }
}
Oops, something went wrong.