DotnetSpider, a .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight, efficient, and fast high-level web crawling & scraping framework for .NET.
Visual Studio 2017(15.3 or later)
Store data in MySQL. Download MySQL
grant all on *.* to 'root'@'localhost' IDENTIFIED BY '' with grant option; flush privileges;
- Distributed crawler. Download Redis for Windows
- SqlServer.
- PostgreSQL.
- MongoDb
Please see the project DotnetSpider.Sample in the solution.
public class EntityModelSpider
public static void Run()
Spider spider = new Spider();
private class Spider : EntitySpider
protected override void OnInit(params string[] arguments)
var word = "可乐|雪碧";
AddRequest(string.Format("{0}&tn=news&from=news&cl=2&pn=0&rn=20&ct=1", word), new Dictionary<string, dynamic> { { "Keyword", word } });
AddPipeline(new ConsoleEntityPipeline());
[TableInfo("baidu", "baidu_search_entity_model")]
[EntitySelector(Expression = ".//div[@class='result']", Type = SelectorType.XPath)]
class BaiduSearchEntry : BaseEntity
[Field(Expression = "Keyword", Type = SelectorType.Enviroment)]
public string Keyword { get; set; }
[Field(Expression = ".//h3[@class='c-title']/a")]
[ReplaceFormatter(NewValue = "", OldValue = "<em>")]
[ReplaceFormatter(NewValue = "", OldValue = "</em>")]
public string Title { get; set; }
[Field(Expression = ".//h3[@class='c-title']/a/@href")]
public string Url { get; set; }
[Field(Expression = ".//div/p[@class='c-author']/text()")]
[ReplaceFormatter(NewValue = "-", OldValue = " ")]
public string Website { get; set; }
[Field(Expression = ".//div/span/a[@class='c-cache']/@href")]
public string Snapshot { get; set; }
[Field(Expression = ".//div[@class='c-summary c-row ']", Option = FieldOptions.InnerText)]
[ReplaceFormatter(NewValue = "", OldValue = "<em>")]
[ReplaceFormatter(NewValue = "", OldValue = "</em>")]
[ReplaceFormatter(NewValue = " ", OldValue = " ")]
public string Details { get; set; }
[Field(Expression = ".", Option = FieldOptions.InnerText)]
[ReplaceFormatter(NewValue = "", OldValue = "<em>")]
[ReplaceFormatter(NewValue = "", OldValue = "</em>")]
[ReplaceFormatter(NewValue = " ", OldValue = " ")]
public string PlainText { get; set; }
public static void Main()
Command: -s:[spider type name | TaskName attribute] -i:[identity] -a:[arg1,arg2...] --tid:[taskId] -n:[name] -c:[configuration file path or name]
- -s: Type name of the spider or its TaskNameAttribute, for example: DotnetSpider.Sample.BaiduSearchSpider
- -i: Set identity.
- -a: Pass arguments to spider's Run method.
- --tid: Set task id.
- -n: Set name.
- -c: Set the config file path, for example when you want to run with a customized config:
When you want to collect a page whose content is loaded by JavaScript, there is only one thing to do: set the downloader to WebDriverDownloader.
Downloader=new WebDriverDownloader(Browser.Chrome);
- Make sure there is a ChromeDriver.exe in the bin folder when you try to use Chrome. You can add it to your project via the NuGet package manager: Chromium.ChromeDriver
- Make sure you have already added a *.webdriver Firefox profile when you try to use Firefox:
- Make sure there is a PhantomJS.exe in the bin folder when you try to use PhantomJS. You can add it to your project via the NuGet package manager: PhantomJS
- Depends on a CI platform; for example, I am using TeamCity right now.
- Depends on Scheduler.NET
- More documentation to come...
timeout 0
tcp-keepalive 60
QQ Group: 477731655 Email: