Skip to content
This repository has been archived by the owner on May 12, 2020. It is now read-only.

Commit

Permalink
Add HTML Filtering parsing
Browse files Browse the repository at this point in the history
Fix #45
Fix #61

Auditors: @SergeyZhukovsky
  • Loading branch information
bbondy committed Apr 19, 2017
1 parent f612227 commit cb335e7
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 73 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Changelog

## 3.0.0
- Added support for parsing HTML filtering rules
135 changes: 92 additions & 43 deletions ad_block_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,17 @@ void parseFilter(const char *input, const char *end, Filter *f,
break;

case '$':
if (*(p+1) == '$') {
if (i != 0) {
f->domainList = new char[i + 1];
memcpy(f->domainList, data, i + 1);
i = 0;
}
parseState = FPDataOnly;
f->filterType = FTHTMLFiltering;
p+=2;
continue;
}
f->parseOptions(p + 1);
earlyBreak = true;
continue;
Expand Down Expand Up @@ -360,12 +371,14 @@ void parseFilter(const char *input, const char *end, Filter *f,


AdBlockClient::AdBlockClient() : filters(nullptr),
htmlRuleFilters(nullptr),
cosmeticFilters(nullptr),
htmlFilters(nullptr),
exceptionFilters(nullptr),
noFingerprintFilters(nullptr),
noFingerprintExceptionFilters(nullptr),
numFilters(0),
numHtmlRuleFilters(0),
numCosmeticFilters(0),
numHtmlFilters(0),
numExceptionFilters(0),
numNoFingerprintFilters(0),
numNoFingerprintExceptionFilters(0),
Expand Down Expand Up @@ -393,9 +406,13 @@ void AdBlockClient::clear() {
delete[] filters;
filters = nullptr;
}
if (htmlRuleFilters) {
delete[] htmlRuleFilters;
htmlRuleFilters = nullptr;
if (cosmeticFilters) {
delete[] cosmeticFilters;
cosmeticFilters = nullptr;
}
if (htmlFilters) {
delete[] htmlFilters;
htmlFilters = nullptr;
}
if (exceptionFilters) {
delete[] exceptionFilters;
Expand Down Expand Up @@ -431,7 +448,8 @@ void AdBlockClient::clear() {
}

numFilters = 0;
numHtmlRuleFilters = 0;
numCosmeticFilters = 0;
numHtmlFilters = 0;
numExceptionFilters = 0;
numNoFingerprintFilters = 0;
numNoFingerprintExceptionFilters = 0;
Expand Down Expand Up @@ -735,7 +753,8 @@ bool AdBlockClient::parse(const char *input) {
const char *lineStart = p;

int newNumFilters = 0;
int newNumHtmlRuleFilters = 0;
int newNumCosmeticFilters = 0;
int newNumHtmlFilters = 0;
int newNumExceptionFilters = 0;
int newNumNoFingerprintFilters = 0;
int newNumNoFingerprintExceptionFilters = 0;
Expand Down Expand Up @@ -765,10 +784,13 @@ bool AdBlockClient::parse(const char *input) {
}
break;
case FTElementHiding:
newNumHtmlRuleFilters++;
newNumCosmeticFilters++;
break;
case FTElementHidingException:
newNumHtmlRuleFilters++;
newNumCosmeticFilters++;
break;
case FTHTMLFiltering:
newNumHtmlFilters++;
break;
case FTEmpty:
case FTComment:
Expand Down Expand Up @@ -798,7 +820,8 @@ bool AdBlockClient::parse(const char *input) {
#ifdef PERF_STATS
cout << "Fingerprint size: " << kFingerprintSize << endl;
cout << "Num new filters: " << newNumFilters << endl;
cout << "Num new HTML rule filters: " << newNumHtmlRuleFilters << endl;
cout << "Num new cosmetic filters: " << newNumCosmeticFilters << endl;
cout << "Num new HTML filters: " << newNumHtmlFilters << endl;
cout << "Num new exception filters: " << newNumExceptionFilters << endl;
cout << "Num new no fingerprint filters: "
<< newNumNoFingerprintFilters << endl;
Expand All @@ -811,8 +834,10 @@ bool AdBlockClient::parse(const char *input) {
#endif

Filter *newFilters = new Filter[newNumFilters + numFilters];
Filter *newHtmlRuleFilters =
new Filter[newNumHtmlRuleFilters + numHtmlRuleFilters];
Filter *newCosmeticFilters =
new Filter[newNumCosmeticFilters + numCosmeticFilters];
Filter *newHtmlFilters =
new Filter[newNumHtmlFilters + numHtmlFilters];
Filter *newExceptionFilters =
new Filter[newNumExceptionFilters + numExceptionFilters];
Filter *newNoFingerprintFilters =
Expand All @@ -823,8 +848,10 @@ bool AdBlockClient::parse(const char *input) {

memset(newFilters, 0,
sizeof(Filter) * (newNumFilters + numFilters));
memset(newHtmlRuleFilters, 0,
sizeof(Filter) * (newNumHtmlRuleFilters + numHtmlRuleFilters));
memset(newCosmeticFilters, 0,
sizeof(Filter) * (newNumCosmeticFilters + numCosmeticFilters));
memset(newHtmlFilters, 0,
sizeof(Filter) * (newNumHtmlFilters + numHtmlFilters));
memset(newExceptionFilters, 0,
sizeof(Filter) * (newNumExceptionFilters + numExceptionFilters));
memset(newNoFingerprintFilters, 0,
Expand All @@ -834,19 +861,22 @@ bool AdBlockClient::parse(const char *input) {
+ numNoFingerprintExceptionFilters));

Filter *curFilters = newFilters;
Filter *curHtmlRuleFilters = newHtmlRuleFilters;
Filter *curCosmeticFilters = newCosmeticFilters;
Filter *curHtmlFilters = newHtmlFilters;
Filter *curExceptionFilters = newExceptionFilters;
Filter *curNoFingerprintFilters = newNoFingerprintFilters;
Filter *curNoFingerprintExceptionFilters = newNoFingerprintExceptionFilters;

// If we've had a parse before, copy the old data into the new data structure
if (filters || htmlRuleFilters || exceptionFilters || noFingerprintFilters
|| noFingerprintExceptionFilters
if (filters || cosmeticFilters || htmlFilters || exceptionFilters ||
noFingerprintFilters || noFingerprintExceptionFilters
/*|| hostAnchoredFilters || hostAnchoredExceptionFilters */) {
// Copy the old data in
memcpy(newFilters, filters, sizeof(Filter) * numFilters);
memcpy(newHtmlRuleFilters, htmlRuleFilters,
sizeof(Filter) * numHtmlRuleFilters);
memcpy(newCosmeticFilters, cosmeticFilters,
sizeof(Filter) * numCosmeticFilters);
memcpy(newHtmlFilters, htmlFilters,
sizeof(Filter) * numHtmlFilters);
memcpy(newExceptionFilters, exceptionFilters,
sizeof(Filter) * numExceptionFilters);
memcpy(newNoFingerprintFilters, noFingerprintFilters,
Expand All @@ -858,28 +888,32 @@ bool AdBlockClient::parse(const char *input) {
// Set the old filter lists' borrowedMemory to true since it'll be taken by
// the new filters.
setFilterBorrowedMemory(filters, numFilters);
setFilterBorrowedMemory(htmlRuleFilters, numHtmlRuleFilters);
setFilterBorrowedMemory(cosmeticFilters, numCosmeticFilters);
setFilterBorrowedMemory(htmlFilters, numHtmlFilters);
setFilterBorrowedMemory(exceptionFilters, numExceptionFilters);
setFilterBorrowedMemory(noFingerprintFilters, numNoFingerprintFilters);
setFilterBorrowedMemory(noFingerprintExceptionFilters,
numNoFingerprintExceptionFilters);
delete[] filters;
delete[] htmlRuleFilters;
delete[] cosmeticFilters;
delete[] htmlFilters;
delete[] exceptionFilters;
delete[] noFingerprintFilters;
delete[] noFingerprintExceptionFilters;

// Adjust the current pointers to be just after the copied in data
curFilters += numFilters;
curHtmlRuleFilters += numHtmlRuleFilters;
curCosmeticFilters += numCosmeticFilters;
curHtmlFilters += numHtmlFilters;
curExceptionFilters += numExceptionFilters;
curNoFingerprintFilters += numNoFingerprintFilters;
curNoFingerprintExceptionFilters += numNoFingerprintExceptionFilters;
}

// And finally update with the new counts
numFilters += newNumFilters;
numHtmlRuleFilters += newNumHtmlRuleFilters;
numCosmeticFilters += newNumCosmeticFilters;
numHtmlFilters += newNumHtmlFilters;
numExceptionFilters += newNumExceptionFilters;
numNoFingerprintFilters += newNumNoFingerprintFilters;
numNoFingerprintExceptionFilters += newNumNoFingerprintExceptionFilters;
Expand All @@ -888,7 +922,8 @@ bool AdBlockClient::parse(const char *input) {

// Adjust the new member list pointers
filters = newFilters;
htmlRuleFilters = newHtmlRuleFilters;
cosmeticFilters = newCosmeticFilters;
htmlFilters = newHtmlFilters;
exceptionFilters = newExceptionFilters;
noFingerprintFilters = newNoFingerprintFilters;
noFingerprintExceptionFilters = newNoFingerprintExceptionFilters;
Expand Down Expand Up @@ -918,8 +953,12 @@ bool AdBlockClient::parse(const char *input) {
break;
case FTElementHiding:
case FTElementHidingException:
(*curHtmlRuleFilters).swapData(&f);
curHtmlRuleFilters++;
(*curCosmeticFilters).swapData(&f);
curCosmeticFilters++;
break;
case FTHTMLFiltering:
(*curHtmlFilters).swapData(&f);
curHtmlFilters++;
break;
case FTEmpty:
case FTComment:
Expand Down Expand Up @@ -1003,9 +1042,13 @@ int serializeFilters(char * buffer, size_t bufferSizeAvail,
}

// Returns a newly allocated buffer, caller must manually delete[] the buffer
char * AdBlockClient::serialize(int *totalSize, bool ignoreHTMLFilters) {
char * AdBlockClient::serialize(int *totalSize,
bool ignoreCosmeticFilters,
bool ignoreHtmlFilters) {
*totalSize = 0;
int adjustedNumHTMLFilters = ignoreHTMLFilters ? 0 : numHtmlRuleFilters;
int adjustedNumCosmeticFilters =
ignoreCosmeticFilters ? 0 : numCosmeticFilters;
int adjustedNumHtmlFilters = ignoreHtmlFilters ? 0 : numHtmlFilters;

uint32_t hostAnchoredHashSetSize = 0;
char *hostAnchoredHashSetBuffer = nullptr;
Expand All @@ -1025,16 +1068,17 @@ char * AdBlockClient::serialize(int *totalSize, bool ignoreHTMLFilters) {
// Get the number of bytes that we'll need
char sz[512];
*totalSize += 1 + snprintf(sz, sizeof(sz),
"%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x", numFilters,
numExceptionFilters, adjustedNumHTMLFilters, numNoFingerprintFilters,
numNoFingerprintExceptionFilters, numHostAnchoredFilters,
numHostAnchoredExceptionFilters,
"%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x", numFilters,
numExceptionFilters, adjustedNumCosmeticFilters, adjustedNumHtmlFilters,
numNoFingerprintFilters, numNoFingerprintExceptionFilters,
numHostAnchoredFilters, numHostAnchoredExceptionFilters,
bloomFilter ? bloomFilter->getByteBufferSize() : 0, exceptionBloomFilter
? exceptionBloomFilter->getByteBufferSize() : 0,
hostAnchoredHashSetSize, hostAnchoredExceptionHashSetSize);
*totalSize += serializeFilters(nullptr, 0, filters, numFilters) +
serializeFilters(nullptr, 0, exceptionFilters, numExceptionFilters) +
serializeFilters(nullptr, 0, htmlRuleFilters, adjustedNumHTMLFilters) +
serializeFilters(nullptr, 0, cosmeticFilters, adjustedNumCosmeticFilters) +
serializeFilters(nullptr, 0, htmlFilters, adjustedNumHtmlFilters) +
serializeFilters(nullptr, 0,
noFingerprintFilters, numNoFingerprintFilters) +
serializeFilters(nullptr, 0, noFingerprintExceptionFilters,
Expand All @@ -1056,8 +1100,10 @@ char * AdBlockClient::serialize(int *totalSize, bool ignoreHTMLFilters) {
pos += serializeFilters(buffer + pos, *totalSize - pos, filters, numFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos,
exceptionFilters, numExceptionFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos, htmlRuleFilters,
adjustedNumHTMLFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos, cosmeticFilters,
adjustedNumCosmeticFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos, htmlFilters,
adjustedNumHtmlFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos, noFingerprintFilters,
numNoFingerprintFilters);
pos += serializeFilters(buffer + pos, *totalSize - pos,
Expand Down Expand Up @@ -1131,25 +1177,28 @@ bool AdBlockClient::deserialize(char *buffer) {
int bloomFilterSize = 0, exceptionBloomFilterSize = 0,
hostAnchoredHashSetSize = 0, hostAnchoredExceptionHashSetSize = 0;
int pos = 0;
sscanf(buffer + pos, "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x", &numFilters,
&numExceptionFilters, &numHtmlRuleFilters, &numNoFingerprintFilters,
&numNoFingerprintExceptionFilters, &numHostAnchoredFilters,
&numHostAnchoredExceptionFilters, &bloomFilterSize,
&exceptionBloomFilterSize, &hostAnchoredHashSetSize,
&hostAnchoredExceptionHashSetSize);
sscanf(buffer + pos, "%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x", &numFilters,
&numExceptionFilters, &numCosmeticFilters, &numHtmlFilters,
&numNoFingerprintFilters, &numNoFingerprintExceptionFilters,
&numHostAnchoredFilters, &numHostAnchoredExceptionFilters,
&bloomFilterSize, &exceptionBloomFilterSize,
&hostAnchoredHashSetSize, &hostAnchoredExceptionHashSetSize);
pos += static_cast<int>(strlen(buffer + pos)) + 1;

filters = new Filter[numFilters];
exceptionFilters = new Filter[numExceptionFilters];
htmlRuleFilters = new Filter[numHtmlRuleFilters];
cosmeticFilters = new Filter[numCosmeticFilters];
htmlFilters = new Filter[numHtmlFilters];
noFingerprintFilters = new Filter[numNoFingerprintFilters];
noFingerprintExceptionFilters = new Filter[numNoFingerprintExceptionFilters];

pos += deserializeFilters(buffer + pos, filters, numFilters);
pos += deserializeFilters(buffer + pos,
exceptionFilters, numExceptionFilters);
pos += deserializeFilters(buffer + pos,
htmlRuleFilters, numHtmlRuleFilters);
cosmeticFilters, numCosmeticFilters);
pos += deserializeFilters(buffer + pos,
htmlFilters, numHtmlFilters);
pos += deserializeFilters(buffer + pos,
noFingerprintFilters, numNoFingerprintFilters);
pos += deserializeFilters(buffer + pos,
Expand Down
10 changes: 7 additions & 3 deletions ad_block_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ class AdBlockClient {
Filter **matchingExceptionFilter);
// Serializes the parsed data and bloom filter data into a single buffer.
// The returned buffer is newly allocated; the caller must delete[] it.
char * serialize(int *size, bool ignoreHTMLFilters = true);
char * serialize(int *size,
bool ignoreCosmeticFilters = true,
bool ignoreHtmlFilters = true);
// Deserializes the buffer. A size is not needed since a serialized
// buffer is self-describing.
bool deserialize(char *buffer);
Expand All @@ -40,13 +42,15 @@ class AdBlockClient {
}

Filter *filters;
Filter *htmlRuleFilters;
Filter *cosmeticFilters;
Filter *htmlFilters;
Filter *exceptionFilters;
Filter *noFingerprintFilters;
Filter *noFingerprintExceptionFilters;

int numFilters;
int numHtmlRuleFilters;
int numCosmeticFilters;
int numHtmlFilters;
int numExceptionFilters;
int numNoFingerprintFilters;
int numNoFingerprintExceptionFilters;
Expand Down
4 changes: 2 additions & 2 deletions ad_block_client_wrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,8 @@ void AdBlockClientWrap::GetParsingStats(
Local<Object> stats = Object::New(isolate);
stats->Set(String::NewFromUtf8(isolate, "numFilters"),
Int32::New(isolate, obj->numFilters));
stats->Set(String::NewFromUtf8(isolate, "numHtmlRuleFilters"),
Int32::New(isolate, obj->numHtmlRuleFilters));
stats->Set(String::NewFromUtf8(isolate, "numCosmeticFilters"),
Int32::New(isolate, obj->numCosmeticFilters));
stats->Set(String::NewFromUtf8(isolate, "numExceptionFilters"),
Int32::New(isolate, obj->numExceptionFilters));
stats->Set(String::NewFromUtf8(isolate, "numNoFingerprintFilters"),
Expand Down
1 change: 1 addition & 0 deletions bad_fingerprints.h
Original file line number Diff line number Diff line change
Expand Up @@ -7140,6 +7140,7 @@ const char *badFingerprints[] = {
"sp.com",
"com/w/",
"om/w/1",
"m/w/1.",
};

// Short table of substrings ("http", "www") defined next to badFingerprints.
// NOTE(review): presumably these are substrings too common in URLs to be
// useful inside a filter fingerprint — confirm against the code that reads it.
const char *badSubstrings[] = {"http", "www" };
4 changes: 3 additions & 1 deletion filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ enum FilterType {
FTException = 0200,
FTEmpty = 0400,
FTHostOnly = 01000,
FTHTMLFiltering = 02000,
FTListTypesMask = FTException|FTElementHiding|
FTElementHidingException|FTEmpty|FTComment,
FTElementHidingException|FTEmpty|FTComment|
FTHTMLFiltering,
};

enum FilterOption {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "ad-block",
"main": "./build/Release/ad-block",
"version": "2.1.0",
"version": "3.0.0",
"description": "Ad block engine used in the Brave browser for ABP filter syntax based lists like EasyList.",
"directories": {
"test": "test"
Expand Down
2 changes: 1 addition & 1 deletion scripts/uploadDataFiles.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ const fs = require('fs')
const s3 = require('s3')
const commander = require('commander')
const path = require('path')
const dataFileVersion = 2
const dataFileVersion = 3

const client = s3.createClient({
maxAsyncS3: 20,
Expand Down
Loading

0 comments on commit cb335e7

Please sign in to comment.