This repository has been archived by the owner on Feb 28, 2019. It is now read-only.

Great performance improvement (about 8x faster in text-like content rewriting)
aploium committed May 11, 2016
1 parent ba5a018 commit 1576283
Showing 3 changed files with 61 additions and 49 deletions.
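The bulk of the claimed speedup comes from response_text_rewrite() in EasyWebsiteMirror.py, where per-external-domain compiled-regex substitutions are replaced with plain str.replace() calls on a precomputed prefix (see the diff below). The following micro-benchmark is only an illustration of that trade-off, not code from this repository: the domain, mirror prefix, and sample body are made up, and the actual ratio depends on the content and the machine.

```python
import re
import timeit

# Illustrative only: made-up domain, mirror prefix, and sample body.
domain = "cdn.example.com"
mirror_prefix = "http://mirror.example.org"
text = '<script src="http://cdn.example.com/a.js"></script> ' * 2000

regex_rewriter = re.compile(r'(https?:)?//' + re.escape(domain), flags=re.IGNORECASE)
buff = mirror_prefix + '/extdomains/' + domain

def with_regex():
    return regex_rewriter.sub(buff, text)

def with_str_replace():
    # Literal passes: the http:// form first, then the scheme-relative form.
    out = text.replace('http://' + domain, buff)
    return out.replace('//' + domain, buff)

print('regex.sub   :', timeit.timeit(with_regex, number=200))
print('str.replace :', timeit.timeit(with_str_replace, number=200))
```

Note that the literal version gives up the old regexes' case-insensitive matching; the commit accepts that trade-off.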
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
### Project Special
/custom_func.py
/config.py
/tests/sample
/ip_whitelist.txt
### Vim template
# swap
86 changes: 37 additions & 49 deletions EasyWebsiteMirror.py
@@ -3,15 +3,16 @@
import os

os.chdir(os.path.dirname(__file__))
import requests
import traceback
from datetime import datetime, timedelta
import re
import base64
import zlib
from time import time
from html import escape as html_escape
import threading
from urllib.parse import urljoin, urlsplit, urlunsplit
import requests
from flask import Flask, request, make_response, Response, redirect
from ColorfulPyPrint import * # TODO: Migrate logging tools to the stdlib

@@ -41,7 +42,7 @@
errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
local_cache_enable = False

__VERSION__ = '0.15.1-dev'
__VERSION__ = '0.16.1-dev'
__author__ = 'Aploium <i@z.codes>'
static_file_extensions_list = set(static_file_extensions_list)
external_domains_set = set(external_domains or [])
@@ -52,6 +53,10 @@
myurl_prefix_escaped = myurl_prefix.replace('/', r'\/')
cdn_domains_number = len(CDN_domains)

# ## thread local var ##
thread_local = threading.local()
thread_local.start_time = None

# ########## Handle dependencies #############
if not enable_static_resource_CDN:
mime_based_static_resource_CDN = False
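The new thread_local object replaces the start_time parameter that used to be passed into request_remote_site_and_parse(): each request handler stores its start time in thread-local storage, and the response code reads it back when setting the X-CP-Time header (see the later hunks). A minimal stand-alone sketch of the pattern, with illustrative names rather than the project's; one caveat worth knowing is that an attribute assigned at import time exists only in the importing thread, so other threads should read it with a default:

```python
import threading
from time import time

_local = threading.local()

def mark_request_start():
    # Call at the top of a request handler; every worker thread gets its own slot.
    _local.start_time = time()

def elapsed_seconds():
    # getattr with a default avoids AttributeError in threads that never
    # called mark_request_start().
    start = getattr(_local, 'start_time', None)
    return None if start is None else time() - start
```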
@@ -110,14 +115,7 @@
)
regex_extract_base64_from_embedded_url = re.compile(
r'_ewm0(?P<gzip>z?)_\.(?P<b64>[a-zA-Z0-9-_]+=*)\._ewm1_\.[a-zA-Z\d]+\b')
# Basic url rewriter for external sites, see function response_text_rewrite()
regex_basic_ext_url_rewriter = {}
regex_basic_ext_url_esc_rewriter = {}
for _domain in external_domains:
regex_basic_ext_url_rewriter[_domain] = re.compile(r'(https?:)?//' + re.escape(_domain), flags=re.IGNORECASE)
# TODO: Combine it together with regex_basic_ext_url_rewriter
regex_basic_ext_url_esc_rewriter[_domain] = re.compile(r'(https?:)?\\/\\/' + re.escape(_domain),
flags=re.IGNORECASE)

# Response Cookies Rewriter, see response_cookie_rewrite()
regex_cookie_rewriter = re.compile(r'\bdomain=(\.?([\w-]+\.)+\w+)\b', flags=re.IGNORECASE)
# Request Domains Rewriter, see rewrite_client_requests_text()
@@ -419,6 +417,7 @@ def put_response_to_local_cache(url, our_resp, req, remote_resp):
def try_get_cached_response(url, client_header):
"""
:param url: real url with query string
:type client_header: dict
"""
# Only use cache when client use GET
@@ -484,10 +483,9 @@ def regex_url_reassemble(match_obj):
# only url(something) and @import are allowed to be unquoted
or ('url' not in prefix and 'import' not in prefix) and (not quote_left or quote_right == ')')
# for "key":"value" type replace, we must have at least one '/' in url path (for the value to be regard as url)
or (':' in prefix and '/' not in path)):
or (':' in prefix and '/' not in path)
):
return whole_match_string
else:
url_rewrite_cache_miss_count += 1

remote_path = request.path
if request.path[:11] == '/extdomains':
@@ -564,6 +562,7 @@ def regex_url_reassemble(match_obj):
# write the adv rewrite cache only if we disable CDN or we known whether this url is CDN-able
if not mime_based_static_resource_CDN or _we_knew_this_url:
url_rewrite_cache[match_obj.group()] = reassembled # write cache
url_rewrite_cache_miss_count += 1

return reassembled
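This hunk also moves the url_rewrite_cache_miss_count increment next to the cache write, so the counter now reflects entries actually added to url_rewrite_cache rather than every rewritten match. A small stand-alone sketch of the memoize-by-match-string idea; the function and the hit counter here are illustrative additions, not the project's exact code:

```python
url_rewrite_cache = {}            # whole regex match -> reassembled URL
url_rewrite_cache_hit_count = 0
url_rewrite_cache_miss_count = 0

def reassemble_with_cache(match_string, reassemble):
    """Return a cached rewrite when possible; otherwise compute, store, and count a miss."""
    global url_rewrite_cache_hit_count, url_rewrite_cache_miss_count
    cached = url_rewrite_cache.get(match_string)
    if cached is not None:
        url_rewrite_cache_hit_count += 1
        return cached
    reassembled = reassemble(match_string)
    url_rewrite_cache[match_string] = reassembled  # write cache
    url_rewrite_cache_miss_count += 1              # counted only on an actual write
    return reassembled
```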

@@ -661,6 +660,7 @@ def copy_response(requests_response_obj, content=b''):
return resp


# noinspection PyProtectedMember
def response_cookies_deep_copy(req_obj):
"""
It's a BAD hack to get RAW cookies headers, but so far, we don't have better way.
@@ -711,8 +711,8 @@ def response_content_rewrite(remote_resp_obj):
if custom_text_rewriter_enable and content_mime == 'text/html':
resp_text2 = custom_response_html_rewriter(resp_text)
resp_text = resp_text2
except Exception as e: # just print err and fallback to normal rewrite
errprint('Custom Rewrite Function "custom_response_html_rewriter(text)" in custom_func.py ERROR', e)
except Exception as _e: # just print err and fallback to normal rewrite
errprint('Custom Rewrite Function "custom_response_html_rewriter(text)" in custom_func.py ERROR', _e)
traceback.print_exc()

# then do the normal rewrites
@@ -751,28 +751,18 @@ def response_text_rewrite(resp_text):
resp_text = resp_text.replace(r'https:\/\/' + domain,  # TODO: Combine it with non-escaped version
                              myurl_prefix_escaped + r'\/extdomains\/' + 'https-' + domain)
# Implicit schemes replace, will be replaced to the same as `my_host_scheme`, unless forced
resp_text = regex_basic_ext_url_rewriter[domain].sub(
    '{0}{1}/extdomains/{2}{3}'.format(
        my_host_scheme,
        my_host_name,
        ('https-' if ('NONE' != force_https_domains)
         and ('ALL' == force_https_domains or domain in force_https_domains)
         else ''),
        domain),
    resp_text
)

resp_text = regex_basic_ext_url_esc_rewriter[domain].sub(  # TODO: Combine it with non-escaped version
    '{0}\\/extdomains\\/{1}{2}'.format(
        myurl_prefix_escaped,
        ('https-' if ('NONE' != force_https_domains)
         and ('ALL' == force_https_domains or domain in force_https_domains)
         else ''),
        domain),
    resp_text
)
buff = '{0}/extdomains/{1}{2}'.format(
    myurl_prefix,
    ('https-' if ('NONE' != force_https_domains)
     and ('ALL' == force_https_domains or domain in force_https_domains)
     else ''),
    domain)
resp_text = resp_text.replace('http://' + domain, buff)
resp_text = resp_text.replace('http:\\/\\/' + domain, buff.replace('/', r'\/'))
resp_text = resp_text.replace('//' + domain, buff)
resp_text = resp_text.replace('\\/\\/' + domain, buff.replace('/', r'\/'))

# rewrite "foo.domain.tld" and 'foo.domain.tld'
resp_text = resp_text.replace('"%s"' % domain, '\"' + my_host_name + '/extdomains/' + domain + '\"')
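This hunk is where the headline number comes from: the two case-insensitive per-domain regexes are dropped in favour of one prefix built per domain (buff) and a handful of literal str.replace() passes covering the plain, JS/JSON-escaped, and scheme-relative spellings of each external URL. A simplified stand-alone sketch of the same idea; the force_https_domains handling and the surrounding per-domain loop are omitted, and the names are placeholders:

```python
def rewrite_one_domain(resp_text, domain, myurl_prefix):
    """Rewrite references to a single external domain into /extdomains/ form
    using literal replacement only (no per-domain regex)."""
    # https:// URLs keep their scheme via the 'https-' marker; handling them first
    # keeps the scheme-relative pass below from touching them again.
    https_buff = myurl_prefix + '/extdomains/https-' + domain
    resp_text = resp_text.replace('https://' + domain, https_buff)
    resp_text = resp_text.replace('https:\\/\\/' + domain, https_buff.replace('/', r'\/'))

    # http:// and scheme-relative URLs share one precomputed prefix.
    buff = myurl_prefix + '/extdomains/' + domain
    buff_escaped = buff.replace('/', r'\/')
    resp_text = resp_text.replace('http://' + domain, buff)              # plain
    resp_text = resp_text.replace('http:\\/\\/' + domain, buff_escaped)  # JS/JSON-escaped
    resp_text = resp_text.replace('//' + domain, buff)                   # scheme-relative
    resp_text = resp_text.replace('\\/\\/' + domain, buff_escaped)
    return resp_text
```

Ordering matters here: the http:// pass has to run before the scheme-relative one, otherwise 'http://' + domain would be split in the middle and rewritten to 'http:' plus the mirror prefix.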
@@ -887,7 +877,7 @@ def send_request(url, method='GET', headers=None, param_get=None, data=None):
return r, req_time


def request_remote_site_and_parse(actual_request_url, start_time=None):
def request_remote_site_and_parse(actual_request_url):
if verbose_level >= 3: dbgprint('actual_request_url:', actual_request_url)

if mime_based_static_resource_CDN:
@@ -909,8 +899,8 @@ def request_remote_site_and_parse(actual_request_url, start_time=None):
resp = try_get_cached_response(actual_request_url, client_header)
if resp is not None:
dbgprint('CacheHit,Return')
if start_time is not None:
resp.headers.set('X-CP-Time', "%.4f" % (time() - start_time))
if thread_local.start_time is not None:
resp.headers.set('X-CP-Time', "%.4f" % (time() - thread_local.start_time))
return resp # If cache hit, just skip next steps

try: # send request to remote server
@@ -950,8 +940,8 @@ def request_remote_site_and_parse(actual_request_url, start_time=None):

if local_cache_enable: # storge entire our server's response (headers included)
put_response_to_local_cache(actual_request_url, resp, request, r)
if start_time is not None:
resp.headers.add('X-CP-Time', "%.4f" % (time() - start_time - req_time))
if thread_local.start_time is not None:
resp.headers.add('X-CP-Time', "%.4f" % (time() - thread_local.start_time - req_time))
return resp


@@ -975,9 +965,8 @@ def filter_client_request():
if verbose_level >= 3: dbgprint('add to ip_whitelist because cookies:', request.remote_addr)
else:
return redirect(
"/ip_ban_verify_page?origin="
+ base64.urlsafe_b64encode(str(request.url).encode(encoding='utf-8')).decode()
, code=302)
"/ip_ban_verify_page?origin=" + base64.urlsafe_b64encode(str(request.url).encode(encoding='utf-8')).decode(),
code=302)

return None

@@ -1003,7 +992,6 @@ def rewrite_client_request():
try:
real_url = extract_real_url_from_embedded_url(request.url)
if real_url is not None:
global request
request.url = real_url
request.path = urlsplit(real_url).path
except:
@@ -1120,7 +1108,7 @@ def ip_ban_verify_page():
@app.route('/extdomains/<path:hostname>', methods=['GET', 'POST'])
@app.route('/extdomains/<path:hostname>/<path:extpath>', methods=['GET', 'POST'])
def get_external_site(hostname, extpath='/'):
start_time = time() # to display compute time
thread_local.start_time = time() # to display compute time
# pre-filter client's request
filter_or_rewrite_result = filter_client_request() or is_client_request_need_redirect()

@@ -1145,13 +1133,13 @@ def get_external_site(hostname, extpath='/'):
if verbose_level >= 3: dbgprint('after extract, url:', request.url, ' path:', request.path)
actual_request_url = urljoin(urljoin(scheme + hostname, extpath), '?' + urlsplit(request.url).query)

return request_remote_site_and_parse(actual_request_url, start_time)
return request_remote_site_and_parse(actual_request_url)


@app.route('/', methods=['GET', 'POST'])
@app.route('/<path:input_path>', methods=['GET', 'POST'])
def get_main_site(input_path='/'):
start_time = time() # to display compute time
thread_local.start_time = time() # to display compute time
# pre-filter client's request
filter_or_rewrite_result = filter_client_request() or is_client_request_need_redirect()
if filter_or_rewrite_result is not None:
@@ -1165,7 +1153,7 @@ def get_main_site(input_path='/'):

actual_request_url = urljoin(target_scheme + target_domain, extract_url_path_and_query(request.url))

return request_remote_site_and_parse(actual_request_url, start_time)
return request_remote_site_and_parse(actual_request_url)


# ################# End Flask #################
23 changes: 23 additions & 0 deletions tests/regex_rewriter_test.py
@@ -1,5 +1,6 @@
# coding=utf-8
import re
from time import time
from config import *
from EasyWebsiteMirror import regex_adv_url_rewriter, regex_url_reassemble, \
static_file_extensions_list, external_domains_set, allowed_domains_set, myurl_prefix, cdn_domains_number, \
@@ -185,3 +186,25 @@ class DbgRequest:
infoprint('All', len(test_cases), 'tests passed')
else:
errprint('Failed in ', fail_count, 'tests')

infoprint('Begin Performance Test')

with open(os.path.join(os.path.dirname(__file__), 'sample', 'google_home.html'), 'r') as fp:
    buff = fp.read()
try:
    regex_adv_url_rewriter.cache_clear()
except:  # the rewriter may not expose a cache; clearing it is best-effort
    pass
start_time = time()
regex_adv_url_rewriter.sub(regex_url_reassemble, buff)
infoprint('google_home.html', time() - start_time)

with open(os.path.join(os.path.dirname(__file__), 'sample', 'google_script.js'), 'r') as fp:
    buff = fp.read()
try:
    regex_adv_url_rewriter.cache_clear()
except:  # the rewriter may not expose a cache; clearing it is best-effort
    pass
start_time = time()
regex_adv_url_rewriter.sub(regex_url_reassemble, buff)
infoprint('google_script.js', time() - start_time)
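This benchmark times one full regex_adv_url_rewriter.sub() pass over saved copies of the Google homepage and one of its scripts, kept in tests/sample (which the .gitignore change above keeps out of the repository). The try/except around cache_clear() simply tolerates a rewriter that has no memoization cache. Below is a generic helper in the same spirit; the helper name, and the assumption that a cache (if any) lives on an object exposing cache_clear(), are illustrative rather than taken from the project:

```python
from time import time

def time_cold_rewrite(rewriter, reassemble, sample_path):
    """Time a single cold rewrite pass over one sample file (illustrative helper)."""
    with open(sample_path, 'r', encoding='utf-8') as fp:
        buff = fp.read()
    # Clear memoization (e.g. a functools.lru_cache wrapper) if present, so the
    # run measures a cold rewrite rather than cache hits.
    for obj in (rewriter, reassemble):
        cache_clear = getattr(obj, 'cache_clear', None)
        if cache_clear is not None:
            cache_clear()
    start_time = time()
    rewriter.sub(reassemble, buff)
    return time() - start_time
```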
