Skip to content
This repository has been archived by the owner on Jul 26, 2024. It is now read-only.

Commit

Permalink
feat: Add initial filtering (#51)
Browse files Browse the repository at this point in the history
* feat: Add initial filtering

This allows strict host filtering based on a config setting 

`--adm_settings` / `CONTILE_ADM_SETTINGS` accepts a JSON formatted hash of values. The argument should be flattened, but it is expanded below for legibility:

```javascript
CONTILE_ADM_SETTINGS='{
"Example": {   /* The partner name (matching is case insensitive, but it should match the 'name' field of the response) */
   "advertiser_hosts": ["example.com", "www.example.com"], /* Allowed host names for `advertiser_url` */
   "position": 0, /* The tile position suggestion */
   "include_regions": ["en", "en-US/TX"], /* valid region codes for this tile. */
},
"Default": { /* A default set of values. These are used if no value is specified for an individual partner */
   "impression_hosts": ["example.org", "www.example.org"], /* valid hosts for the `impression_url` */
   "click_hosts": ["example.net", "click.foo.example.net"], /* valid hosts for `click_url` */
}}
```

This PR also:
* wires up location information
* Tracks anomalous partner info


Closes #18, #50
Issue #36, #22
  • Loading branch information
jrconlin authored May 4, 2021
1 parent 8667bb6 commit 8007794
Show file tree
Hide file tree
Showing 9 changed files with 552 additions and 83 deletions.
275 changes: 253 additions & 22 deletions src/adm.rs
Original file line number Diff line number Diff line change
@@ -1,43 +1,287 @@
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fmt::Debug};
use url::Url;

use crate::error::HandlerError;
use crate::error::{HandlerError, HandlerErrorKind, HandlerResult};
use crate::server::location::LocationResult;
use crate::server::ServerState;
use crate::settings::Settings;
use crate::tags::Tags;
use crate::web::middleware::sentry as l_sentry;
//use crate::server::img_storage;

pub(crate) const DEFAULT: &str = "DEFAULT";

/// A set of ADM tiles.
///
/// `get_tiles` returns this type after running every entry of `tiles`
/// through the configured filter.
#[derive(Debug, Deserialize, Serialize)]
pub struct AdmTileResponse {
/// The tiles carried by this response.
pub tiles: Vec<AdmTile>,
}

#[derive(Debug, Deserialize, Serialize)]
/// Filter criteria for adm Tiles
/// Each "filter" is a set of [AdmAdvertiserFilterSettings] that are
/// specific to a given Advertiser name (the names are matched against
/// the tiles fetch request)
/// In addition there is a special [DEFAULT] value which is a filter
/// that will be applied to all advertisers that do not supply their
/// own values.
#[derive(Default, Clone, Debug)]
pub struct AdmFilter {
pub filter_set: HashMap<String, AdmAdvertiserFilterSettings>,
}

/// The AdmAdvertiserFilterSettings contain the settings for the various
/// ADM provided partners. These are specified as a JSON formatted hash
/// that contains the components. A special "DEFAULT" setting provides
/// information that may be used as a DEFAULT, or commonly appearing set
/// of data.
/// See `impl From<Settings>` for details of the structure.
#[derive(Clone, Debug, Deserialize, Default, Serialize)]
pub struct AdmAdvertiserFilterSettings {
/// Set of valid hosts for the `advertiser_url`
pub(crate) advertiser_hosts: Vec<String>,
/// Set of valid hosts for the `impression_url`
pub(crate) impression_hosts: Vec<String>,
/// Set of valid hosts for the `click_url`
pub(crate) click_hosts: Vec<String>,
/// valid position for the tile
pub(crate) position: Option<u8>,
/// Set of valid regions for the tile (e.g ["en", "en-US/TX"])
pub(crate) include_regions: Vec<String>,
}

pub(crate) type AdmSettings = HashMap<String, AdmAdvertiserFilterSettings>;

/// Build the per-advertiser settings map from the raw `adm_settings` string.
///
/// An empty string yields an empty map; anything else is parsed as JSON.
///
/// # Panics
///
/// Panics with "Invalid ADM Settings" when the string is not valid JSON for
/// this map type.
impl From<&Settings> for AdmSettings {
    fn from(settings: &Settings) -> Self {
        let raw: &str = &settings.adm_settings;
        if raw.is_empty() {
            Self::default()
        } else {
            serde_json::from_str(raw).expect("Invalid ADM Settings")
        }
    }
}

/// Verify that `url` parses, has a host, and that the host is listed in `filter`.
///
/// `species` names the kind of URL being checked ("Advertiser", "Click", ...).
/// On any rejection it is recorded in `tags` as the "type" tag, together with
/// the offending URL as an extra, before the error is returned.
///
/// # Errors
///
/// - `InvalidHost` when `url` cannot be parsed (the parse error is also
///   recorded as an extra),
/// - `MissingHost` when the parsed URL has no host component,
/// - `UnexpectedHost` when the host is not one of the entries in `filter`.
fn check_url(
    url: &str,
    species: &'static str,
    filter: &[String],
    tags: &mut Tags,
) -> HandlerResult<()> {
    let parsed = match url.parse::<Url>() {
        Err(parse_err) => {
            tags.add_tag("type", species);
            tags.add_extra("parse_error", &parse_err.to_string());
            tags.add_extra("url", &url);
            return Err(HandlerErrorKind::InvalidHost(species, url.to_string()).into());
        }
        Ok(parsed) => parsed,
    };

    // Either the host is acceptable (done), or we know which rejection applies.
    let rejection = match parsed.host() {
        None => HandlerErrorKind::MissingHost(species, parsed.to_string()),
        Some(found) => {
            let host = found.to_string();
            if filter.contains(&host) {
                return Ok(());
            }
            HandlerErrorKind::UnexpectedHost(species, host)
        }
    };

    // Both remaining rejections are tagged the same way.
    tags.add_tag("type", species);
    tags.add_extra("url", &url);
    Err(rejection.into())
}

impl AdmFilter {
/// Report the error directly to sentry
fn report(&self, error: &HandlerError, tags: &Tags) {
// dbg!(&error, &tags);
// TODO: if not error.is_reportable, just add to metrics.
l_sentry::report(tags, sentry::event_from_error(error));
}

/// Check the advertiser URL
fn check_advertiser(
&self,
filter: &AdmAdvertiserFilterSettings,
tile: &mut AdmTile,
tags: &mut Tags,
) -> HandlerResult<()> {
check_url(
&tile.advertiser_url,
"Advertiser",
&filter.advertiser_hosts,
tags,
)
}

/// Check the click URL
fn check_click(
&self,
filter: &AdmAdvertiserFilterSettings,
tile: &mut AdmTile,
tags: &mut Tags,
) -> HandlerResult<()> {
check_url(&tile.click_url, "Click", &filter.click_hosts, tags)
}

/// Check the impression URL to see if it's valid.
///
/// This extends `filter_and_process`
fn check_impression(
&self,
filter: &AdmAdvertiserFilterSettings,
tile: &mut AdmTile,
tags: &mut Tags,
) -> HandlerResult<()> {
check_url(
&tile.impression_url,
"Impression",
&filter.impression_hosts,
tags,
)
}

/// Filter and process tiles from ADM:
///
/// - Returns None for tiles that shouldn't be shown to the client
/// - Modifies tiles for output to the client (adding additional fields, etc.)
pub fn filter_and_process(&self, mut tile: AdmTile, tags: &mut Tags) -> Option<AdmTile> {
// Use strict matching for now, eventually, we may want to use backwards expanding domain
// searches, (.e.g "xyz.example.com" would match "example.com")
match self.filter_set.get(&tile.name.to_lowercase()) {
Some(filter) => {
// Apply any additional tile filtering here.
let none = AdmAdvertiserFilterSettings::default();
let default = self
.filter_set
.get(&DEFAULT.to_lowercase())
.unwrap_or(&none);
// if the filter doesn't have anything defined, try using what's in the default.
// Sadly, `vec.or()` doesn't exist, so do this a bit "long hand"
let adv_filter = if filter.advertiser_hosts.is_empty() {
default
} else {
filter
};
let impression_filter = if filter.impression_hosts.is_empty() {
default
} else {
filter
};
let click_filter = if filter.click_hosts.is_empty() {
default
} else {
filter
};
if let Err(e) = self.check_advertiser(adv_filter, &mut tile, tags) {
self.report(&e, tags);
return None;
}
if let Err(e) = self.check_click(click_filter, &mut tile, tags) {
self.report(&e, tags);
return None;
}
if let Err(e) = self.check_impression(impression_filter, &mut tile, tags) {
self.report(&e, tags);
return None;
}
// Use the default.position (Option<u8>) if the filter.position (Option<u8>) isn't
// defined. In either case `None` is a valid return, but we should favor `filter` over
// `default`.
tile.position = filter.position.or(default.position);
Some(tile)
}
None => {
self.report(
&HandlerErrorKind::UnexpectedAdvertiser(tile.name).into(),
tags,
);
None
}
}
}
}

/// Construct the AdmFilter from the provided settings.
/// This uses a JSON construct of settings, e.g.
/// ```javascript
/// /* for the Example Co advertiser... */
/// {"Example": {
///     /* The allowed hosts for URLs */
///     "advertiser_hosts": ["www.example.org", "example.org"],
///     /* Tile position for this advertiser
///        (leave unset to use the position from "DEFAULT", if any) */
///     "position": 1,
///     /* Valid target regions for this advertiser
///        (use "en-US" for "all in english speaking United States") */
///     "include_regions": ["en-US/TX", "en-US/CA"],
///     /* Allowed hosts for impression URLs.
///        Empty means to use the impression URLs in "DEFAULT" */
///     "impression_hosts": [],
///     },
///     ...,
///  "DEFAULT": {
///     /* The default impression URL host to check for. */
///     "impression_hosts": ["example.net"]
///     }
/// }
/// ```
///
/// Advertiser names are stored lowercased, which is what makes the lookup in
/// `AdmFilter::filter_and_process` (which lowercases the tile name) case
/// insensitive.
impl From<&Settings> for HandlerResult<AdmFilter> {
    fn from(settings: &Settings) -> Self {
        let filter_set = AdmSettings::from(settings)
            .into_iter()
            .map(|(adv, setting)| (adv.to_lowercase(), setting))
            .collect();
        Ok(AdmFilter { filter_set })
    }
}

/// A single tile as carried in an [AdmTileResponse].
///
/// The three `*_url` fields that have a matching host list in
/// [AdmAdvertiserFilterSettings] are validated by
/// `AdmFilter::filter_and_process` before the tile is passed on.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct AdmTile {
/// Numeric tile identifier (NOTE(review): presumably assigned by ADM — confirm).
pub id: u64,
/// Advertiser name; lowercased and used as the key for the filter lookup.
pub name: String,
/// Checked against the filter's `advertiser_hosts`.
pub advertiser_url: String,
/// Checked against the filter's `click_hosts`.
pub click_url: String,
/// Image location for the tile; not host-checked by the filter in this file.
pub image_url: String,
/// Checked against the filter's `impression_hosts`.
pub impression_url: String,
/// Tile position; overwritten by `AdmFilter::filter_and_process` with the
/// advertiser's configured position (or the DEFAULT one, or `None`).
pub position: Option<u8>,
}

pub async fn get_tiles(
reqwest_client: &reqwest::Client,
adm_endpoint_url: &str,
fake_ip: &str,
location: &LocationResult,
stripped_ua: &str,
placement: &str,
state: &ServerState,
tags: &mut Tags,
) -> Result<AdmTileResponse, HandlerError> {
// XXX: Assumes adm_endpoint_url includes
// ?partner=<mozilla_partner_name>&sub1=<mozilla_tag_id> (probably should
// validate this on startup)
let settings = &state.settings;
let adm_url = Url::parse_with_params(
adm_endpoint_url,
&[
("ip", fake_ip),
("ua", &stripped_ua),
("sub2", &placement),
("partner", settings.partner_id.as_str()),
("sub1", settings.sub1.as_str()),
("ip", &location.fake_ip), // TODO: remove once ADM API finalized
("ua", &stripped_ua), // TODO: remove once ADM API finalized
("country-code", &location.country()),
("region-code", &location.region()),
// ("dma-code", location.dma),
// ("form-factor", form_factor),
("os-family", stripped_ua),
("sub2", placement),
("v", "1.0"),
// XXX: some value for results seems required, it defaults to 0
// when omitted (despite AdM claiming it would default to 1)
("results", "10"),
("results", &settings.adm_query_tile_count.to_string()),
],
)
.map_err(|e| HandlerError::internal(&e.to_string()))?;
Expand All @@ -57,21 +301,8 @@ pub async fn get_tiles(
response.tiles = response
.tiles
.into_iter()
.filter_map(filter_and_process)
.filter_map(|tile| state.filter.filter_and_process(tile, tags))
.take(settings.adm_max_tiles as usize)
.collect();
Ok(response)
}

/// Filter and process tiles from ADM:
///
/// - Returns None for tiles that shouldn't be shown to the client
/// - Modifies tiles for output to the client (adding additional fields, etc.)
///
/// As written this is a pass-through: every tile is returned unchanged,
/// wrapped in `Some`. No filtering or modification is performed yet.
// The `allow` silences lints triggered by the placeholder body (a `mut`
// binding that is never mutated and an `Option` that is never `None`).
#[allow(clippy::unnecessary_wraps, unused_mut)]
fn filter_and_process(mut tile: AdmTile) -> Option<AdmTile> {
//if !state.valid_tile(tile.name) {
// return None;
//}

// TODO: move images to CDN
Some(tile)
}
22 changes: 19 additions & 3 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ pub enum HandlerErrorKind {
#[error("Validation error: {:?}", _0)]
Validation(String),

#[error("Invalid {} Host: {:?}", _0, _1)]
InvalidHost(&'static str, String),

#[error("Unexpected {} Host: {:?}", _0, _1)]
UnexpectedHost(&'static str, String),

#[error("Unexpected Advertiser: {:?}", _0)]
UnexpectedAdvertiser(String),

#[error("Missing {} Host: {:?}", _0, _1)]
MissingHost(&'static str, String),

#[error("Location error: {:?}", _0)]
Location(String),
}
Expand All @@ -56,15 +68,19 @@ impl HandlerErrorKind {
HandlerErrorKind::Internal(_) => 510,
HandlerErrorKind::Reqwest(_) => 520,
HandlerErrorKind::Validation(_) => 600,
HandlerErrorKind::InvalidHost(_, _) => 601,
HandlerErrorKind::UnexpectedHost(_, _) => 602,
HandlerErrorKind::MissingHost(_, _) => 603,
HandlerErrorKind::UnexpectedAdvertiser(_) => 604,
HandlerErrorKind::Location(_) => 530,
}
}

/*
// Optionally record metric for certain states
pub fn on_response(&self, state: &ServerState) {
if self.is_conflict() {
Metrics::from(state).incr("storage.confict")
pub fn is_reportable(&self) -> bool {
match self {
_ => true
}
}
*/
Expand Down
Loading

0 comments on commit 8007794

Please sign in to comment.