-
Notifications
You must be signed in to change notification settings - Fork 537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Developed 1 new feature, fixed 2 bugs #65
base: master
Are you sure you want to change the base?
Changes from all commits
03d7ba7
8ec66dd
db3b424
9951b41
54c32ac
d4f4a3e
f8f314d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,8 @@ | |
from cola.functions.speed import SpeedControlClient | ||
from cola.functions.counter import CounterClient | ||
|
||
MAX_IDLE_TIMES = 5 | ||
|
||
class Container(object): | ||
def __init__(self, container_id, working_dir, | ||
job_path, job_name, env, mq, | ||
|
@@ -125,9 +127,16 @@ def sync(): | |
|
||
def _init_idle_status_checker(self): | ||
def check(): | ||
idle_times = 0 | ||
while not self.stopped.is_set(): | ||
self.idle_statuses[self.container_id] = \ | ||
all([task.is_idle() for task in self.tasks]) | ||
if self.idle_statuses[self.container_id]: | ||
idle_times += 1 | ||
if self.job_desc.settings.job.size=='auto' and idle_times > MAX_IDLE_TIMES: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. “==” 两边应加空格。 |
||
break | ||
else: | ||
idle_times = 0 | ||
self.stopped.wait(5) | ||
self.check_idle_t = threading.Thread(target=check) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,8 +71,7 @@ def __init__(self, id_, job_desc, mq, | |
is_local=False, env=None, logger=None): | ||
self.id_ = id_ | ||
self.job_desc = job_desc | ||
self.opener = job_desc.opener_cls( | ||
timeout=DEFAULT_OPENER_TIMEOUT) | ||
self.opener = job_desc.opener_cls(timeout=DEFAULT_OPENER_TIMEOUT) | ||
self.mq = mq | ||
self.dir_ = working_dir | ||
self.settings = job_desc.settings | ||
|
@@ -116,8 +115,7 @@ def _configure_proxy(self): | |
for p in proxies: | ||
proxy_type = p.type if p.has('type') else 'all' | ||
if p.has('addr'): | ||
self.opener.add_proxy( | ||
p.addr, | ||
self.opener.add_proxy(p.addr, | ||
proxy_type=proxy_type, | ||
user=p.user if p.has('user') else None, | ||
password=p.password if p.has('password') else None) | ||
|
@@ -210,7 +208,7 @@ def _pack_error(self, url, msg, error, content=None, | |
|
||
msg_filename = os.path.join(path, ERROR_MSG_FILENAME) | ||
with open(msg_filename, 'w') as f: | ||
f.write(msg+'\n') | ||
f.write(msg + '\n') | ||
traceback.print_exc(file=f) | ||
|
||
content_filename = os.path.join(path, | ||
|
@@ -314,7 +312,10 @@ def _parse(self, parser_cls, options, url): | |
counter=ExecutorCounter(self), | ||
settings=ReadOnlySettings(self.settings), | ||
**options).parse() | ||
return list(res) | ||
if res: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 避免 res 返回空值后 list() 函数报错。 |
||
return list(res) | ||
else: | ||
return list() | ||
|
||
def _log_error(self, url, e): | ||
if self.logger: | ||
|
@@ -350,6 +351,7 @@ def _handle_error(self, url, e, pack=True): | |
self._error() | ||
raise UnitRetryFailed | ||
|
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里添加的空行多余了。下同。 |
||
def _clear_error(self, url): | ||
if hasattr(url, 'error_times'): | ||
del url.error_times | ||
|
@@ -364,7 +366,7 @@ def _parse_with_process_exception(self, parser_cls, options, url): | |
kw = {'pages': 1, 'secs': t} | ||
self.counter_client.multi_local_inc(self.ip, self.id_, **kw) | ||
self.counter_client.multi_global_inc(**kw) | ||
|
||
self._clear_error(url) | ||
self._recover_normal() | ||
|
||
|
@@ -383,7 +385,7 @@ def _parse_with_process_exception(self, parser_cls, options, url): | |
except Exception, e: | ||
self._handle_error(url, e) | ||
|
||
return [url, ] | ||
return [url,] | ||
|
||
def execute(self, url, is_inc=False): | ||
failed = False | ||
|
@@ -402,20 +404,22 @@ def execute(self, url, is_inc=False): | |
parser_cls, options = self.job_desc.url_patterns.get_parser(url, options=True) | ||
if parser_cls is not None: | ||
if rates == 0: | ||
rates, span = self.speed_client.require( | ||
DEFAULT_SPEEED_REQUIRE_SIZE) | ||
rates, span = self.speed_client.require(DEFAULT_SPEEED_REQUIRE_SIZE) | ||
if rates == 0: | ||
if self.stopped.wait(5): | ||
return | ||
rates -= 1 | ||
|
||
try: | ||
next_urls = self._parse_with_process_exception( | ||
parser_cls, options, url) | ||
next_urls = self._parse_with_process_exception(parser_cls, options, url) | ||
next_urls = list(self.job_desc.url_patterns.matches(next_urls)) | ||
|
||
if next_urls: | ||
self.mq.put(next_urls) | ||
# inc budget if auto budget enabled | ||
if self.settings.job.size == 'auto': | ||
inc_budgets = len(next_urls) | ||
if inc_budgets > 0: | ||
self.budget_client.inc_budgets(inc_budgets) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里,如果next_urls当中存在已经抓取的,这个len(next_urls)就不等于待抓取的urls了吧。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里确实没有考虑到,似乎需要判断mq里put方法存放的url数量。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个地方会比较棘手一点,因为MessageQueueNodeProxy在put的时候,并没有真正做放进去的操作,而是对每个其他worker,做了一个cache,当cache满了的时候,才会flush出去。这样做的目的是为了减少网络间的传输开销。 所以,现在put方法并不知道真正放进去了多少个,而去重的操作是在mq的每个节点上才会去做的。 这里budgets的数量大于真实抓取的数量,会导致不能立刻结束么?如果不导致,我理解这里应该就不用改了。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 是的,如果budgets的数量大于真实抓取的数量,会导致JOB一直在等待状态。 好像也没有什么好办法,如果遇到有重复的URL,只能按run_local_job的思路做。如果IDLE超时,自动结束,对吗?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 是的,我理解现在只能这样。 |
||
if hasattr(self.opener, 'close'): | ||
self.opener.close() | ||
|
||
|
@@ -458,8 +462,7 @@ def _parse(self, parser_cls, options, bundle, url): | |
|
||
def _log_error(self, bundle, url, e): | ||
if self.logger: | ||
self.logger.error('Error when handle bundle: %s, url: %s' % ( | ||
str(bundle), str(url))) | ||
self.logger.error('Error when handle bundle: %s, url: %s' % (str(bundle), str(url))) | ||
self.logger.exception(e) | ||
if url == getattr(bundle, 'error_url', None): | ||
bundle.error_times = getattr(bundle, 'error_times', 0) + 1 | ||
|
@@ -499,6 +502,9 @@ def _handle_error(self, bundle, url, e, pack=True): | |
|
||
if ignore: | ||
bundle.error_urls.append(url) | ||
# dec budget if auto budget enabled | ||
if self.settings.job.size == 'auto': | ||
self.budget_client.dec_budgets(1) | ||
return | ||
else: | ||
bundle.current_urls.insert(0, url) | ||
|
@@ -525,6 +531,7 @@ def _parse_with_process_exception(self, parser_cls, options, | |
self.counter_client.multi_local_inc(self.ip, self.id_, **kw) | ||
self.counter_client.multi_global_inc(**kw) | ||
|
||
|
||
self._clear_error(bundle) | ||
self._recover_normal() | ||
|
||
|
@@ -543,7 +550,7 @@ def _parse_with_process_exception(self, parser_cls, options, | |
except Exception, e: | ||
self._handle_error(bundle, url, e) | ||
|
||
return [url, ], [] | ||
return [url,], [] | ||
|
||
def execute(self, bundle, max_sec, is_inc=False): | ||
failed = False | ||
|
@@ -565,25 +572,22 @@ def execute(self, bundle, max_sec, is_inc=False): | |
|
||
url = bundle.current_urls.pop(0) | ||
if self.logger: | ||
self.logger.debug('get %s url: %s' % | ||
(bundle.label, url)) | ||
self.logger.debug('get %s url: %s' % (bundle.label, url)) | ||
|
||
rates = 0 | ||
span = 0.0 | ||
parser_cls, options = self.job_desc.url_patterns.get_parser(url, | ||
options=True) | ||
if parser_cls is not None: | ||
if rates == 0: | ||
rates, span = self.speed_client.require( | ||
DEFAULT_SPEEED_REQUIRE_SIZE) | ||
rates, span = self.speed_client.require(DEFAULT_SPEEED_REQUIRE_SIZE) | ||
if rates == 0: | ||
if self.stopped.wait(5): | ||
break | ||
rates -= 1 | ||
|
||
try: | ||
next_urls, bundles = self._parse_with_process_exception( | ||
parser_cls, options, bundle, url) | ||
next_urls, bundles = self._parse_with_process_exception(parser_cls, options, bundle, url) | ||
next_urls = list(self.job_desc.url_patterns.matches(next_urls)) | ||
next_urls.extend(bundle.current_urls) | ||
if self.shuffle_urls: | ||
|
@@ -597,6 +601,12 @@ def execute(self, bundle, max_sec, is_inc=False): | |
|
||
if bundles: | ||
self.mq.put(bundles) | ||
# inc budget if auto budget enabled | ||
if self.settings.job.size == 'auto': | ||
inc_budgets = len(bundles) | ||
if inc_budgets > 0: | ||
self.budget_client.inc_budgets(inc_budgets) | ||
|
||
if hasattr(self.opener, 'close'): | ||
self.opener.close() | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
同样,逗号后加空格。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
同意
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这些确定的改动(比如格式),可以先提一个commit修正下。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这里空格,再提交改动下哈。