Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Facebook login behavior and refactor script template system #54

Merged
merged 2 commits into from
Mar 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion bin/browse-url
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@ import os
import sys
import logging
import umbra
import json

arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description='browse-url - open urls in chrome/chromium and run behaviors',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('urls', metavar='URL', nargs='+', help='URL(s) to browse')
arg_parser.add_argument('-m', '--meta-data-json', dest='metadata', default=None,
help='json metadata that would appear in the amqp message')
arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60',
help='seconds to wait for browser initialization')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
Expand All @@ -24,7 +27,11 @@ args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

metadata=None
if args.metadata is not None:
metadata = json.loads(args.metadata)
with umbra.Browser(chrome_exe=args.chrome_exe) as browser:
for url in args.urls:
browser.browse_page(url)
browser.browse_page(url,metadata)


Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
Expand Down Expand Up @@ -153,6 +155,15 @@ var umbraIntervalFunc = function() {
}
}

// Fills the Facebook login form with UMBRA_FB_USER_NAME / UMBRA_FB_PASSWORD
// and clicks the login button.
//
// document.querySelector returns null when nothing matches, so if any of the
// three expected elements is missing we bail out instead of throwing a
// TypeError while dereferencing null.
var umbraFacebookLogin = function() {
    var emailInput = document.querySelector("form#login_form input#email");
    var passwordInput = document.querySelector("form#login_form input#pass");
    var loginButton = document.querySelector("form#login_form label#loginbutton > input");

    if (emailInput === null || passwordInput === null || loginButton === null) {
        // The form markup is not what we expect; do not attempt a partial login.
        return;
    }

    emailInput.value = UMBRA_FB_USER_NAME;
    passwordInput.value = UMBRA_FB_PASSWORD;
    loginButton.click();
};

// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
// time, then we consider ourselves finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
Expand All @@ -169,5 +180,10 @@ var umbraBehaviorFinished = function() {
return false;
}

if (document.getElementById("login_form") == null || UMBRA_FB_USER_NAME.indexOf("parameter")>0 || UMBRA_FB_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters
var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
}
else //login
umbraFacebookLogin();

var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
33 changes: 18 additions & 15 deletions umbra/behaviors.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,46 +16,49 @@ class Behavior:
_behaviors = None

@staticmethod
def behaviors():
def behaviors(template_parameters=None):
if Behavior._behaviors is None:
behaviors_yaml = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.yaml'])
with open(behaviors_yaml) as fin:
conf = yaml.load(fin)
Behavior._behaviors = conf['behaviors']

simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + ["simpleclicks.js.in"])
with open(simpleclicks_js_in) as fin:
simpleclicks_js_template = string.Template(fin.read())

for behavior in Behavior._behaviors:
if "behavior_js" in behavior:
behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]])
behavior["script"] = open(behavior_js, encoding="utf-8").read()
elif "click_css_selector" in behavior:
if "click_until_hard_timeout" in behavior:
click_until_hard_timeout_value=behavior["click_until_hard_timeout"]
else:
click_until_hard_timeout_value = False
behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value)
elif "behavior_js_template" in behavior:
behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]])
with open(behavior_js_template) as fin:
template = string.Template(fin.read())
parameters=dict()
if "default_parameters" in behavior:
for key, value in behavior["default_parameters"].items():
parameters[key]=value;
if template_parameters is not None:
parameters.update(template_parameters)
behavior["script"] = template.safe_substitute(parameters)

return Behavior._behaviors

def __init__(self, url, umbra_worker):
    """Set up the per-page behavior state.

    url -- the page url; start() matches it against each behavior's
        url_regex.
    umbra_worker -- object used to talk to the browser; its `metadata`
        attribute is either None or a dict that may carry a
        "templateParameters" entry used to fill in behavior templates.
    """
    self.url = url
    self.umbra_worker = umbra_worker

    # metadata may be None, or a dict that has no "templateParameters" key
    # at all; .get() yields None in the latter case instead of raising
    # KeyError.
    metadata = umbra_worker.metadata
    self.template_parameters = None
    if metadata is not None:
        self.template_parameters = metadata.get("templateParameters")
    self.script_finished = False
    self.waiting_result_msg_ids = []
    self.active_behavior = None
    self.last_activity = time.time()

def start(self):
for behavior in Behavior.behaviors():
for behavior in Behavior.behaviors(self.template_parameters):
if re.match(behavior['url_regex'], self.url):
if "behavior_js" in behavior:
self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url))
elif "click_css_selector" in behavior:
self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url))
elif "behavior_js_template" in behavior:
self.logger.info("using {} template behavior for {}".format(behavior["behavior_js_template"], self.url))

self.active_behavior = behavior
self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
Expand Down
35 changes: 26 additions & 9 deletions umbra/behaviors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
behaviors:
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js: facebook.js
behavior_js_template: facebook.js.template
-default_parameters:
- parameter_username: "username"
- parameter_password: "password"
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
Expand All @@ -26,32 +29,46 @@ behaviors:
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
click_css_selector: img.img-responsive
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
click_css_selector: a[onclick]
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
click_css_selector: a[id='feature-next']
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
click_css_selector: button.sc-button-play, button.playButton
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
click_css_selector: button.playButton.medium
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
click_css_selector: span.load-more-text
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
Expand Down
3 changes: 2 additions & 1 deletion umbra/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,15 @@ def stop(self):
def abort_browse_page(self):
self._abort_browse_page = True

def browse_page(self, url, on_request=None):
def browse_page(self, url, metadata=None, on_request=None):
"""Synchronously browses a page and runs behaviors.

Raises BrowsingException if browsing the page fails in a non-critical
way.
"""
self.url = url
self.on_request = on_request
self.metadata = metadata

self._websock = websocket.WebSocketApp(self._websocket_url,
on_open=self._visit_page, on_message=self._handle_message)
Expand Down
2 changes: 1 addition & 1 deletion umbra/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def on_request(chrome_msg):
def browse_page_sync():
self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url))
try:
browser.browse_page(url, on_request=on_request)
browser.browse_page(url, parent_url_metadata, on_request=on_request)
message.ack()
except BrowsingException as e:
self.logger.warn("browsing did not complete normally, requeuing url {} - {}".format(url, e))
Expand Down