Skip to content

Commit

Permalink
feat: strip prefixes and suffixes in links (#5486)
Browse files Browse the repository at this point in the history
  • Loading branch information
Nerixyz authored Jul 14, 2024
1 parent 973b7a3 commit b9f669d
Show file tree
Hide file tree
Showing 10 changed files with 281 additions and 89 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Minor: Improve appearance of reply button. (#5491)
- Minor: Introduce HTTP API for plugins. (#5383, #5492, #5494)
- Minor: Support more Firefox variants for incognito link opening. (#5503)
- Minor: Links can now have prefixes and suffixes such as parentheses. (#5486)
- Bugfix: Fixed tab move animation occasionally failing to start after closing a tab. (#5426)
- Bugfix: If a network request errors with 200 OK, Qt's error code is now reported instead of the HTTP status. (#5378)
- Bugfix: Fixed restricted users usernames not being clickable. (#5405)
Expand Down
15 changes: 7 additions & 8 deletions benchmarks/src/LinkParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,16 @@
#include <QString>
#include <QStringList>

#include <optional>

using namespace chatterino;

const QString INPUT = QStringLiteral(
"If your Chatterino isn't loading FFZ emotes, update to the latest nightly "
"(or 2.4.2 if its out) "
"https://github.com/Chatterino/chatterino2/releases/tag/nightly-build "
"AlienPls https://www.youtube.com/watch?v=ELBBiBDcWc0 "
"127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. https://.be "
"https://a http://a.b https://a.be ftp://xdd.com "
"127.0.3 aaaa xd 256.256.256.256 AsdQwe xd 127.0.0.1 https://. "
"*https://.be "
"https://a: http://a.b (https://a.be) ftp://xdd.com "
"this is a text lol . ://foo.com //aa.de :/foo.de xd.XDDDDDD ");

static void BM_LinkParsing(benchmark::State &state)
Expand All @@ -24,15 +23,15 @@ static void BM_LinkParsing(benchmark::State &state)

// Make sure the TLDs are loaded
{
benchmark::DoNotOptimize(LinkParser("xd.com").result());
benchmark::DoNotOptimize(linkparser::parse("xd.com"));
}

for (auto _ : state)
{
for (auto word : words)
for (const auto &word : words)
{
LinkParser parser(word);
benchmark::DoNotOptimize(parser.result());
auto parsed = linkparser::parse(word);
benchmark::DoNotOptimize(parsed);
}
}
}
Expand Down
78 changes: 58 additions & 20 deletions src/common/LinkParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,19 +113,58 @@ bool startsWithPort(QStringView string)
return true;
}

/// @brief Removes ignorable leading and trailing characters from @a source
///
/// The view is narrowed in place; the underlying string is not modified.
/// The character sets follow the GFM autolinks extension
/// (https://github.github.com/gfm/#autolinks-extension-):
///
/// - leading:  '<', '*', '_', '~', and '('
/// - trailing: '>', '?', '!', '.', ',', ':', '*', '~', and ')'
///
/// Unlike GFM, parentheses are not balanced against the rest of the source
/// and '_' is not treated as a suffix.
void strip(QStringView &source)
{
    const auto isPrefixChar = [](auto ch) {
        return ch == u'<' || ch == u'*' || ch == u'_' || ch == u'~' ||
               ch == u'(';
    };
    const auto isSuffixChar = [](auto ch) {
        return ch == u'>' || ch == u'?' || ch == u'!' || ch == u'.' ||
               ch == u',' || ch == u':' || ch == u'*' || ch == u'~' ||
               ch == u')';
    };

    // Drop ignorable characters from the front, one at a time.
    while (!source.isEmpty() && isPrefixChar(source.first()))
    {
        source = source.mid(1);
    }

    // Then drop ignorable characters from the back.
    while (!source.isEmpty() && isSuffixChar(source.last()))
    {
        source.chop(1);
    }
}

} // namespace

namespace chatterino {
namespace chatterino::linkparser {

LinkParser::LinkParser(const QString &unparsedString)
std::optional<Parsed> parse(const QString &source) noexcept
{
ParsedLink result;
std::optional<Parsed> result;
// This is not implemented with a regex to increase performance.
QStringView remaining(unparsedString);
QStringView protocol(remaining);

QStringView link{source};
strip(link);

QStringView remaining = link;
QStringView protocol;

// Check protocol for https?://
if (remaining.startsWith(QStringLiteral("http"), Qt::CaseInsensitive) &&
if (remaining.startsWith(u"http", Qt::CaseInsensitive) &&
remaining.length() >= 4 + 3 + 1) // 'http' + '://' + [any]
{
// optimistic view assuming there's a protocol (http or https)
Expand All @@ -136,11 +175,11 @@ LinkParser::LinkParser(const QString &unparsedString)
withProto = withProto.mid(1);
}

if (withProto.startsWith(QStringLiteral("://")))
if (withProto.startsWith(u"://"))
{
// there's really a protocol => consume it
remaining = withProto.mid(3);
result.protocol = {protocol.begin(), remaining.begin()};
protocol = {link.begin(), remaining.begin()};
}
}

Expand All @@ -161,7 +200,7 @@ LinkParser::LinkParser(const QString &unparsedString)
{
if (lastWasDot) // no double dots ..
{
return;
return result;
}
lastDotPos = i;
lastWasDot = true;
Expand All @@ -181,7 +220,7 @@ LinkParser::LinkParser(const QString &unparsedString)

if (!startsWithPort(remaining))
{
return;
return result;
}

break;
Expand All @@ -198,23 +237,22 @@ LinkParser::LinkParser(const QString &unparsedString)

if (lastWasDot || lastDotPos <= 0)
{
return;
return result;
}

// check host/tld
if ((nDots == 3 && isValidIpv4(host)) ||
isValidTld(host.mid(lastDotPos + 1)))
{
result.host = host;
result.rest = rest;
result.source = unparsedString;
this->result_ = std::move(result);
result = Parsed{
.protocol = protocol,
.host = host,
.rest = rest,
.link = link,
};
}
}

const std::optional<ParsedLink> &LinkParser::result() const
{
return this->result_;
return result;
}

} // namespace chatterino
} // namespace chatterino::linkparser
119 changes: 101 additions & 18 deletions src/common/LinkParser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,43 +4,126 @@

#include <optional>

namespace chatterino {
namespace chatterino::linkparser {

struct ParsedLink {
/// @brief Represents a parsed link
///
/// A parsed link is represented as views over the source string for its
/// different segments. In this simplified model, a link consists of an optional
/// @a protocol, a mandatory @a host and an optional @a rest. These segments are
/// always next to each other in the input string; however, together they don't
/// span the whole input as it could contain prefixes or suffixes.
///
/// Prefixes and suffixes are almost identical to the ones in GitHub Flavored
/// Markdown (GFM - https://github.github.com/gfm/#autolinks-extension-).
/// The main differences are that '_' isn't a valid suffix and parentheses
/// aren't counted (e.g. "(a.com/(foo)!" would result in "a.com/(foo").
/// Matching is case-insensitive (e.g. "HTTp://a.com" would be valid).
///
/// A @a protocol can either be empty, "http://", or "https://".
/// A @a host can either be an IPv4 address or a hostname. The hostname must end
/// in a valid top level domain. Otherwise, there are no restrictions on it.
/// The @a rest can start with an optional port followed by either a '/', '?',
/// or '#'.
///
/// @b Example
///
/// ```text
/// (https://wiki.chatterino.com/Help/#overview)
/// ▏▏proto ▕ host ▏ rest ▏▏
/// ▏▏ link ▏▏
/// ▏ source ▏
/// ```
struct Parsed {
/// The parsed protocol of the link. Can be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^------^
/// ▏╌╌╌╌╌╌▕
///
/// www.forsen.tv/commands
/// (empty)
/// ```
QStringView protocol;

/// The parsed host of the link. Cannot be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^-----------^
/// ▏╌╌╌╌╌╌╌╌╌╌╌▕
/// ```
QStringView host;

/// The remainder of the link. Can be empty.
///
/// ```text
/// https://www.forsen.tv/commands
/// ^-------^
/// ▏╌╌╌╌╌╌╌▕
///
/// https://www.forsen.tv
/// (empty)
/// ```
QStringView rest;

/// The original unparsed link.
/// The matched link. Can not be empty.
///
/// https://www.forsen.tv/commands
/// ^----------------------------^
QString source;
};
/// ```text
/// (https://www.forsen.tv/commands)
/// ▏╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌▕
/// ```
QStringView link;

/// Checks if the parsed link contains a prefix
bool hasPrefix(const QString &source) const noexcept
{
return this->link.begin() != source.begin();
}

class LinkParser
{
public:
explicit LinkParser(const QString &unparsedString);
/// The prefix before the parsed link inside @a source. May be empty.
///
/// ```text
/// (https://www.forsen.tv/commands)
/// ^
///
/// https://www.forsen.tv/commands
/// (empty)
/// ```
QStringView prefix(const QString &source) const noexcept
{
return {source.data(), this->link.begin()};
}

const std::optional<ParsedLink> &result() const;
/// Checks if the parsed link contains a suffix
bool hasSuffix(const QString &source) const noexcept
{
return this->link.end() != source.end();
}

private:
std::optional<ParsedLink> result_{};
/// The suffix after the parsed link inside @a source. May be empty.
///
/// ```text
/// (https://www.forsen.tv/commands)
/// ^
///
/// https://www.forsen.tv/commands
/// (empty)
/// ```
QStringView suffix(const QString &source) const noexcept
{
return {
this->link.begin() + this->link.size(),
source.data() + source.length(),
};
}
};

} // namespace chatterino
/// @brief Parses a link from @a source into its segments
///
/// If no link is contained in @a source, `std::nullopt` will be returned.
/// The returned value is valid as long as @a source exists, as it contains
/// views into @a source.
///
/// For the accepted links, see Parsed.
std::optional<Parsed> parse(const QString &source) noexcept;

} // namespace chatterino::linkparser
6 changes: 3 additions & 3 deletions src/controllers/commands/builtin/twitch/SendWhisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,10 @@ bool appendWhisperMessageWordsLocally(const QStringList &words)
void operator()(const QString &string,
MessageBuilder &b) const
{
LinkParser parser(string);
if (parser.result())
auto link = linkparser::parse(string);
if (link)
{
b.addLink(*parser.result());
b.addLink(*link, string);
}
else
{
Expand Down
Loading

0 comments on commit b9f669d

Please sign in to comment.