Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

https://github.com/internetarchive/umbra/pull/54 plus more refactoring #55

Merged
merged 5 commits into from
Mar 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions bin/browse-url
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#!/usr/bin/env python
# vim: set sw=4 et:

import argparse
import os
import sys
import logging
import umbra
import json

arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description='browse-url - open urls in chrome/chromium and run behaviors',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('urls', metavar='URL', nargs='+', help='URL(s) to browse')
arg_parser.add_argument('--behavior-parameters', dest='behavior_parameters',
default=None, help='json blob of parameters to use populate the javascript behavior template, e.g. {"parameter_username":"x","parameter_password":"y"}')
arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60',
help='seconds to wait for browser initialization')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
Expand All @@ -24,7 +26,11 @@ args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

# Parse the optional --behavior-parameters value. The help text documents it
# as a json object mapping parameter names to values, so reject anything that
# is not valid json or does not decode to a dict, with a normal argparse usage
# error instead of a traceback.
behavior_parameters = None
if args.behavior_parameters is not None:
    try:
        behavior_parameters = json.loads(args.behavior_parameters)
    except ValueError as e:
        # json decoding errors are ValueError (or a subclass of it)
        arg_parser.error(
                '--behavior-parameters is not valid json: {}'.format(e))
    if not isinstance(behavior_parameters, dict):
        arg_parser.error(
                '--behavior-parameters must be a json object, '
                'e.g. {"parameter_username":"x","parameter_password":"y"}')

with umbra.Browser(chrome_exe=args.chrome_exe) as browser:
for url in args.urls:
browser.browse_page(url)
browser.browse_page(url, behavior_parameters=behavior_parameters)

Original file line number Diff line number Diff line change
Expand Up @@ -22,40 +22,42 @@ var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};

var umbraIntervalFunc = function() {

var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
var everythingScrolled = true;

for (var i = 0; i < thingsToScroll.length; i++) {
var target = thingsToScroll[i];

if (!(target in umbraAlreadyScrolledThing)) {

everythingScrolled = false;

console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
var lastScrollTop = target.scrollTop;
target.scrollTop = target.scrollHeight;

umbraState.idleSince = null;

if (target.scrollTop >= target.scrollHeight) {
umbraAlreadyScrolledThing[target] = true;
}
}
else if (target.scrollTop == lastScrollTop) {
if (umbraScrolledThingFailedScrollAttempts[target]) {
umbraScrolledThingFailedScrollAttempts[target]++;
}
else {
umbraScrolledThingFailedScrollAttempts[target] = 1;
}

if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
umbraAlreadyScrolledThing[target] = true;
}
Expand All @@ -67,24 +69,24 @@ var umbraIntervalFunc = function() {
}
else {
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
}
}

umbraState.expectingSomething = null;
}

if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}

return;
}
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]');

var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"], div.fbPhotoSnowlift.fbxPhoto a._xlt');
for (var i = 0; i < closeButtons.length; i++) {
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
if (closeButtons[i].clientWidth > 0) {
if (umbraState.expectingSomething == 'closeButton') {
if (closeButtons[i].clientWidth > 0) {
if (umbraState.expectingSomething == 'closeButton') {
console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML);
umbraState.expectingSomething = null;
} else {
Expand All @@ -106,7 +108,7 @@ var umbraIntervalFunc = function() {
var missedAbove = 0;

for (var i = 0; i < thingsToClick.length; i++) {
var target = thingsToClick[i];
var target = thingsToClick[i];
if (!(target in umbraAlreadyClicked)) {
var where = umbraAboveBelowOrOnScreen(target);
if (where == 0) { // on screen
Expand All @@ -122,14 +124,14 @@ var umbraIntervalFunc = function() {
clickedSomething = true;
umbraState.idleSince = null;
break;
} else if (where > 0) {
} else if (where > 0) {
somethingLeftBelow = true;
} else if (where < 0) {
somethingLeftAbove = true;
}
}
}

if (window.scrollY > umbraState.bottomReachedScrollY) {
umbraState.bottomReachedScrollY = window.scrollY;
}
Expand All @@ -153,13 +155,22 @@ var umbraIntervalFunc = function() {
}
}

// Fill in the facebook login form with the templated credentials and submit
// it. Returns true if the form was submitted. If any of the expected form
// elements cannot be found, logs a message and returns false instead of
// throwing on a null element.
var umbraFacebookLogin = function() {
    var emailInput = document.querySelector("form#login_form input#email");
    var passwordInput = document.querySelector("form#login_form input#pass");
    var loginButton = document.querySelector("form#login_form label#loginbutton > input");

    if (emailInput == null || passwordInput == null || loginButton == null) {
        console.log("facebook login form is missing an expected element, not logging in");
        return false;
    }

    emailInput.value = UMBRA_FB_USER_NAME;
    passwordInput.value = UMBRA_FB_PASSWORD;
    loginButton.click();
    return true;
};

// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
// time, then we consider ourselves finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Called from outside of this script.
var umbraBehaviorFinished = function() {

if (umbraState.idleSince != null) {
var idleTimeMs = Date.now() - umbraState.idleSince;
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
Expand All @@ -169,5 +180,10 @@ var umbraBehaviorFinished = function() {
return false;
}

// Decide between logging in and running the normal click/scroll behavior.
//
// A template parameter that was never supplied is left in this script as its
// unsubstituted dollar-brace placeholder. Compare against that exact text
// rather than searching for the substring "parameter", which would also match
// a real username or password that happens to contain that word and would
// silently skip the login. The placeholder is assembled from pieces so that
// template substitution does not rewrite it here.
var umbraIsUnsetParameter = function(value, name) {
    return value === "$" + "{" + name + "}";
};

if (document.getElementById("login_form") == null
        || umbraIsUnsetParameter(UMBRA_FB_USER_NAME, "parameter_username")
        || umbraIsUnsetParameter(UMBRA_FB_PASSWORD, "parameter_password")) {
    // no login form on this page, or no credentials were supplied
    var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
} else {
    // login form present and credentials supplied
    umbraFacebookLogin();
}


var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
42 changes: 24 additions & 18 deletions umbra/behaviors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,43 +23,49 @@ def behaviors():
conf = yaml.load(fin)
Behavior._behaviors = conf['behaviors']

simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + ["simpleclicks.js.in"])
with open(simpleclicks_js_in) as fin:
simpleclicks_js_template = string.Template(fin.read())

for behavior in Behavior._behaviors:
if "behavior_js" in behavior:
behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]])
behavior["script"] = open(behavior_js, encoding="utf-8").read()
elif "click_css_selector" in behavior:
if "click_until_hard_timeout" in behavior:
click_until_hard_timeout_value=behavior["click_until_hard_timeout"]
else:
click_until_hard_timeout_value = False
behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value)
with open(behavior_js, encoding="utf-8") as fin:
behavior["script"] = fin.read()
elif "behavior_js_template" in behavior:
behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]])
with open(behavior_js_template, encoding="utf-8") as fin:
behavior["template"] = string.Template(fin.read())

return Behavior._behaviors

# Set up the per-page behavior state.
#   url: url of the page; start() matches it against each behavior's url_regex
#   umbra_worker: object whose send_to_chrome() is used to run the behavior
#                 script in the browser
def __init__(self, url, umbra_worker):
self.url = url
self.umbra_worker = umbra_worker

self.script_finished = False
self.waiting_result_msg_ids = []
self.active_behavior = None  # start() sets this to the behavior matching the url
self.last_activity = time.time()  # initialized to the time of construction

def start(self):
def start(self, template_parameters=None):
for behavior in Behavior.behaviors():
if re.match(behavior['url_regex'], self.url):
if "behavior_js" in behavior:
self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url))
elif "click_css_selector" in behavior:
self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url))
self.logger.info("using %s behavior for %s",
behavior["behavior_js"], self.url)
elif "behavior_js_template" in behavior:
parameters = dict()
if "default_parameters" in behavior:
parameters.update(behavior["default_parameters"])
if template_parameters:
parameters.update(template_parameters)
behavior["script"] = behavior["template"].safe_substitute(parameters)

self.logger.info(
"using template=%s populated with parameters=%s for %s",
repr(behavior["behavior_js_template"]),
parameters, self.url)

self.active_behavior = behavior
self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
suppress_logging=True, params={"expression": behavior["script"]})
self.umbra_worker.send_to_chrome(
method="Runtime.evaluate", suppress_logging=True,
params={"expression": behavior["script"]})
self.notify_of_activity()
return

Expand Down
41 changes: 29 additions & 12 deletions umbra/behaviors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
behaviors:
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js: facebook.js
behavior_js_template: facebook.js.template
# default_parameters:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
Expand All @@ -24,36 +27,50 @@ behaviors:
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js: instagram.js
request_idle_timeout_sec: 10
-
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
click_css_selector: img.img-responsive
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
click_css_selector: a[onclick]
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
click_css_selector: a[id='feature-next']
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
click_css_selector: button.sc-button-play, button.playButton
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
click_css_selector: button.playButton.medium
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
click_css_selector: span.load-more-text
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # default fallback brhavior
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js: default.js
Loading