-
Notifications
You must be signed in to change notification settings - Fork 5
/
agent.cpp
138 lines (126 loc) · 3.38 KB
/
agent.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#include <algorithm>
#include <iomanip>
#include <sstream>
#include "url.h"
#include "agent.h"
#include "directive.h"
namespace
{
// Normalizes `url` for directive matching and returns the resulting path.
//
// Calls defrag() and then escape() on the URL, and returns fullpath() of the
// result. The URL is taken by non-const reference because those calls are
// made on the caller's object -- presumably they modify it in place (confirm
// against url.h).
std::string escape_url(Url::Url& url)
{
    auto&& without_fragment = url.defrag();
    auto&& escaped = without_fragment.escape();
    return escaped.fullpath();
}
// Returns a copy of `str` with every leading occurrence of `chr` removed.
//
// Characters after the first non-`chr` character are kept untouched, even if
// they equal `chr`. If `str` is empty or consists only of `chr`, the result
// is an empty string.
std::string trim_front(const std::string& str, const char chr)
{
    const std::string::size_type first_kept = str.find_first_not_of(chr);
    if (first_kept == std::string::npos)
    {
        // Nothing but `chr` (or nothing at all): everything is trimmed.
        return std::string();
    }
    return str.substr(first_kept);
}
}
namespace Rep
{
// Registers an "allow" directive for `query` and returns *this so calls can
// be chained.
//
// - Queries that resolve to an external URL (see is_external) are ignored and
//   leave the agent unchanged.
// - If `query` begins with '*', an extra directive is added for the query
//   with all leading '*' characters removed, in addition to the directive for
//   the query as given.
// - Adding a directive clears the sorted flag so that directives() re-sorts
//   on next access.
Agent& Agent::allow(const std::string& query)
{
    Url::Url url(query);
    // ignore directives for external URLs
    if (is_external(url))
    {
        return *this;
    }
    // leading wildcard?
    // The emptiness check is required: std::string::front() on an empty
    // string is undefined behaviour, and an empty query can reach this point.
    if (!query.empty() && query.front() == '*')
    {
        Url::Url trimmed(trim_front(query, '*'));
        directives_.push_back(Directive(escape_url(trimmed), true));
    }
    directives_.push_back(Directive(escape_url(url), true));
    sorted_ = false;
    return *this;
}
// Registers a "disallow" directive for `query` and returns *this so calls can
// be chained.
//
// - An empty query is the special case "Disallow:", which means "Allow: /";
//   it is stored as an allowing directive with the empty query as-is.
// - Queries that resolve to an external URL (see is_external) are ignored;
//   in that case the sorted flag is left as it was.
// - If `query` begins with '*', an extra directive is added for the query
//   with all leading '*' characters removed, in addition to the directive for
//   the query as given.
Agent& Agent::disallow(const std::string& query)
{
    if (query.empty())
    {
        // Special case: "Disallow:" means "Allow: /"
        directives_.push_back(Directive(query, true));
        sorted_ = false;
        return *this;
    }

    Url::Url target(query);
    if (is_external(target))
    {
        // Directives for external URLs are dropped without touching state.
        return *this;
    }

    const bool has_leading_wildcard = (query.front() == '*');
    if (has_leading_wildcard)
    {
        Url::Url without_wildcard(trim_front(query, '*'));
        directives_.push_back(Directive(escape_url(without_wildcard), false));
    }
    directives_.push_back(Directive(escape_url(target), false));

    sorted_ = false;
    return *this;
}
// Returns the agent's directives ordered from highest to lowest priority().
//
// Sorting is done lazily: the list is only re-sorted when the sorted flag has
// been cleared (allow()/disallow() clear it after adding a directive).
// Because this is a const method that reorders directives_ and sets sorted_,
// both members are expected to be declared mutable in the class definition
// (not visible in this file).
const std::vector<Directive>& Agent::directives() const
{
    if (sorted_)
    {
        return directives_;
    }

    // Strict "comes before" relation: lhs precedes rhs when its priority is
    // greater, which yields descending priority order.
    const auto higher_priority_first = [](const Directive& lhs, const Directive& rhs) {
        return rhs.priority() < lhs.priority();
    };
    std::sort(directives_.begin(), directives_.end(), higher_priority_first);
    sorted_ = true;
    return directives_;
}
bool Agent::allowed(const std::string& query) const
{
Url::Url url(query);
if (is_external(url))
{
return false;
}
std::string path(escape_url(url));
if (path.compare("/robots.txt") == 0)
{
return true;
}
for (const auto& directive : directives())
{
if (directive.match(path))
{
return directive.allowed();
}
}
return true;
}
std::string Agent::str() const
{
std::stringstream out;
if (delay_ > 0)
{
out << "Crawl-Delay: " << std::setprecision(3) << delay_ << ' ';
}
out << '[';
const auto& d = directives();
auto begin = d.begin();
auto end = d.end();
if (begin != end)
{
out << "Directive(" << begin->str() << ')';
++begin;
}
for (; begin != end; ++begin)
{
out << ", Directive(" << begin->str() << ')';
}
out << ']';
return out.str();
}
// Reports whether `url` points at a different host than this agent.
//
// A URL is only treated as external when both hosts are known: if the agent
// has no host_ set, or the URL carries no host, the result is false.
// Otherwise the result is true exactly when the two hosts differ.
bool Agent::is_external(const Url::Url& url) const
{
    if (host_.empty())
    {
        return false;
    }

    const auto& url_host = url.host();
    if (url_host.empty())
    {
        return false;
    }
    return url_host != host_;
}
}