diff --git a/bin/browse-url b/bin/browse-url index ebfa906..f86712e 100755 --- a/bin/browse-url +++ b/bin/browse-url @@ -1,16 +1,18 @@ #!/usr/bin/env python -# vim: set sw=4 et: import argparse import os import sys import logging import umbra +import json arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), description='browse-url - open urls in chrome/chromium and run behaviors', formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument('urls', metavar='URL', nargs='+', help='URL(s) to browse') +arg_parser.add_argument('--behavior-parameters', dest='behavior_parameters', + default=None, help='json blob of parameters to use populate the javascript behavior template, e.g. {"parameter_username":"x","parameter_password":"y"}') arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60', help='seconds to wait for browser initialization') arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', @@ -24,7 +26,11 @@ args = arg_parser.parse_args(args=sys.argv[1:]) logging.basicConfig(stream=sys.stdout, level=args.log_level, format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') +behavior_parameters = None +if args.behavior_parameters is not None: + behavior_parameters = json.loads(args.behavior_parameters) + with umbra.Browser(chrome_exe=args.chrome_exe) as browser: for url in args.urls: - browser.browse_page(url) + browser.browse_page(url, behavior_parameters=behavior_parameters) diff --git a/umbra/behaviors.d/facebook.js b/umbra/behaviors.d/facebook.js.template similarity index 87% rename from umbra/behaviors.d/facebook.js rename to umbra/behaviors.d/facebook.js.template index d6baf20..e151798 100644 --- a/umbra/behaviors.d/facebook.js +++ b/umbra/behaviors.d/facebook.js.template @@ -22,32 +22,34 @@ var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"] //div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]'; var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5; +var UMBRA_FB_USER_NAME = "${parameter_username}"; +var UMBRA_FB_PASSWORD = "${parameter_password}"; var umbraAlreadyClicked = {}; var umbraAlreadyScrolledThing = {}; var umbraScrolledThingFailedScrollAttempts = {}; var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0}; var umbraIntervalFunc = function() { - + var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); var everythingScrolled = true; - + for (var i = 0; i < thingsToScroll.length; i++) { var target = thingsToScroll[i]; - + if (!(target in umbraAlreadyScrolledThing)) { - + everythingScrolled = false; - + console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id); var lastScrollTop = target.scrollTop; target.scrollTop = target.scrollHeight; - + umbraState.idleSince = null; - + if (target.scrollTop >= target.scrollHeight) { umbraAlreadyScrolledThing[target] = true; - } + } else if (target.scrollTop == lastScrollTop) { if (umbraScrolledThingFailedScrollAttempts[target]) { umbraScrolledThingFailedScrollAttempts[target]++; @@ -55,7 +57,7 @@ var umbraIntervalFunc = function() { else { umbraScrolledThingFailedScrollAttempts[target] = 1; } - + if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) { umbraAlreadyScrolledThing[target] = true; } @@ -67,24 +69,24 @@ var umbraIntervalFunc = function() { } else { console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id) - } - + } + umbraState.expectingSomething = null; } - + if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) { if (umbraState.idleSince == null) { umbraState.idleSince = Date.now(); } - + return; } - - var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"]'); + + var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"], div.fbPhotoSnowlift.fbxPhoto a._xlt'); for (var i = 0; i < closeButtons.length; i++) { // XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible - if (closeButtons[i].clientWidth > 0) { - if (umbraState.expectingSomething == 'closeButton') { + if (closeButtons[i].clientWidth > 0) { + if (umbraState.expectingSomething == 'closeButton') { console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML); umbraState.expectingSomething = null; } else { @@ -106,7 +108,7 @@ var umbraIntervalFunc = function() { var missedAbove = 0; for (var i = 0; i < thingsToClick.length; i++) { - var target = thingsToClick[i]; + var target = thingsToClick[i]; if (!(target in umbraAlreadyClicked)) { var where = umbraAboveBelowOrOnScreen(target); if (where == 0) { // on screen @@ -122,14 +124,14 @@ var umbraIntervalFunc = function() { clickedSomething = true; umbraState.idleSince = null; break; - } else if (where > 0) { + } else if (where > 0) { somethingLeftBelow = true; } else if (where < 0) { somethingLeftAbove = true; } } } - + if (window.scrollY > umbraState.bottomReachedScrollY) { umbraState.bottomReachedScrollY = window.scrollY; } @@ -153,13 +155,22 @@ var umbraIntervalFunc = function() { } } +var umbraFacebookLogin = function() { + var emailInput = document.querySelector("form#login_form input#email"); + var passwordInput = document.querySelector("form#login_form input#pass"); + var loginButton = document.querySelector("form#login_form label#loginbutton > input"); + emailInput.value=UMBRA_FB_USER_NAME; + passwordInput.value=UMBRA_FB_PASSWORD; + loginButton.click(); +} + // If we haven't had anything to do (scrolled, clicked, etc) in this amount of // time, then we consider ourselves finished with the page. var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10; // Called from outside of this script. var umbraBehaviorFinished = function() { - + if (umbraState.idleSince != null) { var idleTimeMs = Date.now() - umbraState.idleSince; if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { @@ -169,5 +180,10 @@ var umbraBehaviorFinished = function() { return false; } +if (document.getElementById("login_form") == null || UMBRA_FB_USER_NAME.indexOf("parameter")>0 || UMBRA_FB_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters + var umbraIntervalId = setInterval(umbraIntervalFunc, 200); +} +else //login + umbraFacebookLogin(); + -var umbraIntervalId = setInterval(umbraIntervalFunc, 200); diff --git a/umbra/behaviors.d/simpleclicks.js.in b/umbra/behaviors.d/simpleclicks.js.template similarity index 100% rename from umbra/behaviors.d/simpleclicks.js.in rename to umbra/behaviors.d/simpleclicks.js.template diff --git a/umbra/behaviors.py b/umbra/behaviors.py index 0be9e4c..46f804e 100644 --- a/umbra/behaviors.py +++ b/umbra/behaviors.py @@ -23,43 +23,49 @@ def behaviors(): conf = yaml.load(fin) Behavior._behaviors = conf['behaviors'] - simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + ["simpleclicks.js.in"]) - with open(simpleclicks_js_in) as fin: - simpleclicks_js_template = string.Template(fin.read()) - for behavior in Behavior._behaviors: if "behavior_js" in behavior: behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]]) - behavior["script"] = open(behavior_js, encoding="utf-8").read() - elif "click_css_selector" in behavior: - if "click_until_hard_timeout" in behavior: - click_until_hard_timeout_value=behavior["click_until_hard_timeout"] - else: - click_until_hard_timeout_value = False - behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value) + with open(behavior_js, encoding="utf-8") as fin: + behavior["script"] = fin.read() + elif "behavior_js_template" in behavior: + behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]]) + with open(behavior_js_template, encoding="utf-8") as fin: + behavior["template"] = string.Template(fin.read()) return Behavior._behaviors def __init__(self, url, umbra_worker): self.url = url self.umbra_worker = umbra_worker - self.script_finished = False self.waiting_result_msg_ids = [] self.active_behavior = None self.last_activity = time.time() - def start(self): + def start(self, template_parameters=None): for behavior in Behavior.behaviors(): if re.match(behavior['url_regex'], self.url): if "behavior_js" in behavior: - self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url)) - elif "click_css_selector" in behavior: - self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url)) + self.logger.info("using %s behavior for %s", + behavior["behavior_js"], self.url) + elif "behavior_js_template" in behavior: + parameters = dict() + if "default_parameters" in behavior: + parameters.update(behavior["default_parameters"]) + if template_parameters: + parameters.update(template_parameters) + behavior["script"] = behavior["template"].safe_substitute(parameters) + + self.logger.info( + "using template=%s populated with parameters=%s for %s", + repr(behavior["behavior_js_template"]), + parameters, self.url) self.active_behavior = behavior - self.umbra_worker.send_to_chrome(method="Runtime.evaluate", - suppress_logging=True, params={"expression": behavior["script"]}) + self.umbra_worker.send_to_chrome( + method="Runtime.evaluate", suppress_logging=True, + params={"expression": behavior["script"]}) self.notify_of_activity() return diff --git a/umbra/behaviors.yaml b/umbra/behaviors.yaml index 2284d88..6297796 100644 --- a/umbra/behaviors.yaml +++ b/umbra/behaviors.yaml @@ -2,7 +2,10 @@ behaviors: - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' - behavior_js: facebook.js + behavior_js_template: facebook.js.template + # default_parameters: + # parameter_username: jdoe@example.com + # parameter_password: abcd1234 request_idle_timeout_sec: 30 - url_regex: '^https?://(?:www\.)?flickr\.com/.*$' @@ -24,36 +27,50 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 - - + - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - click_css_selector: img.img-responsive + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: img.img-responsive request_idle_timeout_sec: 10 - # acalog https://webarchive.jira.com/browse/ARI-3775 url_regex: '^https?://.*[?&]catoid=[^?]*$' - click_css_selector: a[onclick] + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: a[onclick] request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-3956 url_regex: '^https?://(?:www\.)?usask.ca/.*$' - click_css_selector: a[id='feature-next'] + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: a[id='feature-next'] request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/AITFIVE-451 url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - click_css_selector: button.sc-button-play, button.playButton + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button.sc-button-play, button.playButton request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/AITFIVE-463 url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - click_css_selector: button.playButton.medium + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button.playButton.medium request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-4690 url_regex: '^https?://(?:www\.)?youtube.com/.*$' - click_css_selector: span.load-more-text + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: span.load-more-text request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-4725 url_regex: '^https?://(?:www\.)?moma.org/.*$' - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 - - # default fallback brhavior + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button[data-more-results-bottom-button] + click_until_hard_timeout: True + request_idle_timeout_sec: 10 + - # default fallback behavior url_regex: '^.*$' request_idle_timeout_sec: 10 behavior_js: default.js diff --git a/umbra/browser.py b/umbra/browser.py index 3176a97..949c245 100644 --- a/umbra/browser.py +++ b/umbra/browser.py @@ -98,14 +98,16 @@ def stop(self): def abort_browse_page(self): self._abort_browse_page = True - def browse_page(self, url, on_request=None): - """Synchronously browses a page and runs behaviors. + def browse_page(self, url, on_request=None, on_response=None, behavior_parameters=None): + """Synchronously browses a page and runs behaviors. Raises BrowsingException if browsing the page fails in a non-critical way. """ self.url = url self.on_request = on_request + self.on_response = on_response + self.behavior_parameters = behavior_parameters self._websock = websocket.WebSocketApp(self._websocket_url, on_open=self._visit_page, on_message=self._handle_message) @@ -170,24 +172,29 @@ def _visit_page(self, websock): # navigate to the page! self.send_to_chrome(method="Page.navigate", params={"url": self.url}) - def _handle_message(self, websock, message): + def _handle_message(self, websock, json_message): # self.logger.debug("message from {} - {}".format(websock.url, message[:95])) # self.logger.debug("message from {} - {}".format(websock.url, message)) - message = json.loads(message) + message = json.loads(json_message) if "method" in message and message["method"] == "Network.requestWillBeSent": + self.logger.debug("%s %s", message["method"], json_message) if self._behavior: self._behavior.notify_of_activity() - if message["params"]["request"]["url"].lower().startswith("data:"): - self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80])) - elif self.on_request: + if self.on_request: self.on_request(message) + elif "method" in message and message["method"] == "Network.responseReceived": + self.logger.debug("%s %s", message["method"], json_message) + if self.on_response: + self.on_response(message) elif "method" in message and message["method"] == "Page.loadEventFired": if self._behavior is None: - self.logger.info("Page.loadEventFired, starting behaviors url={} message={}".format(self.url, message)) + self.logger.info("Page.loadEventFired, starting behaviors url=%s message=%s", + self.url, json_message) self._behavior = Behavior(self.url, self) - self._behavior.start() + self._behavior.start(self.behavior_parameters) else: - self.logger.warn("Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url={} message={}".format(self.url, message)) + self.logger.warn("Page.loadEventFired again, perhaps original url had a meta refresh, or behaviors accidentally navigated to another page? starting behaviors again url=%s message=%s", + self.url, json_message) self._behavior = Behavior(self.url, self) self._behavior.start() elif "method" in message and message["method"] == "Console.messageAdded": @@ -198,7 +205,7 @@ def _handle_message(self, websock, message): # We hit the breakpoint set in visit_page. Get rid of google # analytics script! - self.logger.debug("debugger paused! message={}".format(message)) + self.logger.debug("debugger paused! message=%s", json_message) scriptId = message['params']['callFrames'][0]['location']['scriptId'] # replace script @@ -212,9 +219,9 @@ def _handle_message(self, websock, message): # elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"): # pass # elif "method" in message: - # self.logger.debug("{} {}".format(message["method"], message)) + # self.logger.debug("{} {}".format(message["method"], json_message)) # else: - # self.logger.debug("[no-method] {}".format(message)) + # self.logger.debug("[no-method] {}".format(json_message)) class Chrome: @@ -246,7 +253,8 @@ def start(self): "--window-size=1100,900", "--no-default-browser-check", "--disable-first-run-ui", "--no-first-run", "--homepage=about:blank", "--disable-direct-npapi-requests", - "--disable-web-security", + "--disable-web-security", "--disable-notifications", + "--disable-save-password-bubble", "about:blank"] self.logger.info("running {}".format(chrome_args)) self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True) diff --git a/umbra/controller.py b/umbra/controller.py index f5b5d41..e80a66f 100644 --- a/umbra/controller.py +++ b/umbra/controller.py @@ -18,6 +18,7 @@ class AmqpBrowserController: { "clientId": "umbra.client.123", "url": "http://example.com/my_fancy_page", + "behaviorParameters": {"some":"parameter","another":"thing"}, "metadata": {"arbitrary":"fields", "etc":4} } @@ -28,6 +29,8 @@ class AmqpBrowserController: It sends this information on the same specified amqp exchange (default: "umbra"). + "behaviorParameters" are used to populate the javascript behavior template. + Each url requested in the browser is published to amqp this way. The outgoing amqp message is a json object: @@ -100,15 +103,20 @@ def _wait_for_and_browse_urls(self, conn, consumer, timeout): def callback(body, message): try: - client_id, url, metadata = body['clientId'], body['url'], body['metadata'] + client_id = body.get('clientId') + url = body['url'] + metadata = body.get('metadata') + behavior_parameters = body.get('behaviorParameters') except: - self.logger.error("unable to decipher message {}".format(message), exc_info=True) + self.logger.error("unable to decipher message %s", + message, exc_info=True) self.logger.error("discarding bad message") message.reject() browser.stop() self._browser_pool.release(browser) return - self._start_browsing_page(browser, message, client_id, url, metadata) + self._start_browsing_page(browser, message, client_id, url, + metadata, behavior_parameters) consumer.callbacks = [callback] @@ -173,20 +181,46 @@ def _consume_amqp(self): time.sleep(0.5) self.logger.error("attempting to reopen amqp connection") - def _start_browsing_page(self, browser, message, client_id, url, parent_url_metadata): - def on_request(chrome_msg): - payload = chrome_msg['params']['request'] - payload['parentUrl'] = url - payload['parentUrlMetadata'] = parent_url_metadata - self.logger.debug('sending to amqp exchange={} routing_key={} payload={}'.format(self.exchange_name, client_id, payload)) + def _start_browsing_page(self, browser, message, client_id, url, parent_url_metadata, behavior_parameters): + def on_response(chrome_msg): + if (chrome_msg['params']['response']['url'].lower().startswith('data:') + or chrome_msg['params']['response']['fromDiskCache'] + or not 'requestHeaders' in chrome_msg['params']['response']): + return + + payload = { + 'url': chrome_msg['params']['response']['url'], + 'headers': chrome_msg['params']['response']['requestHeaders'], + 'parentUrl': url, + 'parentUrlMetadata': parent_url_metadata, + } + + if ':method' in chrome_msg['params']['response']['requestHeaders']: + # happens when http transaction is http 2.0 + payload['method'] = chrome_msg['params']['response']['requestHeaders'][':method'] + elif 'requestHeadersText' in chrome_msg['params']['response']: + req = chrome_msg['params']['response']['requestHeadersText'] + payload['method'] = req[:req.index(' ')] + else: + self.logger.warn('unable to identify http method (assuming GET) chrome_msg=%s', + chrome_msg) + payload['method'] = 'GET' + + self.logger.debug( + 'sending to amqp exchange=%s routing_key=%s payload=%s', + self.exchange_name, client_id, payload) with self._producer_lock: - publish = self._producer_conn.ensure(self._producer, self._producer.publish) + publish = self._producer_conn.ensure(self._producer, + self._producer.publish) publish(payload, exchange=self._exchange, routing_key=client_id) def browse_page_sync(): - self.logger.info('browser={} client_id={} url={}'.format(browser, client_id, url)) + self.logger.info( + 'browser=%s client_id=%s url=%s behavior_parameters=%s', + browser, client_id, url, behavior_parameters) try: - browser.browse_page(url, on_request=on_request) + browser.browse_page(url, on_response=on_response, + behavior_parameters=behavior_parameters) message.ack() except BrowsingException as e: self.logger.warn("browsing did not complete normally, requeuing url {} - {}".format(url, e))