This repository has been archived by the owner on May 4, 2018. It is now read-only.

unix: revert recent FSEvent changes
This commit reverts the following commits:

    983fa68 darwin: fix 10.6 build error in fsevents.c
    684e212 fsevents: use shared FSEventStream
    ea4cb77 fsevents: FSEvents is most likely not thread-safe
    9bae606 darwin: create fsevents thread on demand

Several people have reported stability issues on OS X 10.8 and bus
errors on the 10.9 developer preview.

See also nodejs/node-v0.x-archive#6296 and nodejs/node-v0.x-archive#6251.
bnoordhuis committed Oct 5, 2013
1 parent 11d8011 commit 38df93c
Showing 5 changed files with 270 additions and 532 deletions.
include/uv-private/uv-darwin.h (10 changes: 5 additions & 5 deletions)
@@ -36,8 +36,8 @@

#define UV_PLATFORM_LOOP_FIELDS \
  uv_thread_t cf_thread; \
  void* _cf_reserved; \
  void* cf_state; \
  void* cf_cb; \
  void* cf_loop; \
  uv_mutex_t cf_mutex; \
  uv_sem_t cf_sem; \
  ngx_queue_t cf_signals; \
@@ -47,10 +47,10 @@
  char* realpath; \
  int realpath_len; \
  int cf_flags; \
  void* cf_event; \
  void* cf_eventstream; \
  uv_async_t* cf_cb; \
  ngx_queue_t cf_member; \
  uv_sem_t _cf_reserved; \
  ngx_queue_t cf_events; \
  uv_sem_t cf_sem; \
  uv_mutex_t cf_mutex; \

#define UV_STREAM_PRIVATE_PLATFORM_FIELDS \
src/unix/darwin.c (131 changes: 129 additions & 2 deletions)
@@ -28,26 +28,153 @@
#include <ifaddrs.h>
#include <net/if.h>

#include <CoreFoundation/CFRunLoop.h>

#include <mach/mach.h>
#include <mach/mach_time.h>
#include <mach-o/dyld.h> /* _NSGetExecutablePath */
#include <sys/resource.h>
#include <sys/sysctl.h>
#include <unistd.h> /* sysconf */

/* Forward declarations */
static void uv__cf_loop_runner(void* arg);
static void uv__cf_loop_cb(void* arg);

typedef struct uv__cf_loop_signal_s uv__cf_loop_signal_t;
struct uv__cf_loop_signal_s {
  void* arg;
  cf_loop_signal_cb cb;
  ngx_queue_t member;
};


int uv__platform_loop_init(uv_loop_t* loop, int default_loop) {
  loop->cf_state = NULL;
  CFRunLoopSourceContext ctx;
  int r;

  if (uv__kqueue_init(loop))
    return -1;

  loop->cf_loop = NULL;
  if ((r = uv_mutex_init(&loop->cf_mutex)))
    return r;
  if ((r = uv_sem_init(&loop->cf_sem, 0)))
    return r;
  ngx_queue_init(&loop->cf_signals);

  memset(&ctx, 0, sizeof(ctx));
  ctx.info = loop;
  ctx.perform = uv__cf_loop_cb;
  loop->cf_cb = CFRunLoopSourceCreate(NULL, 0, &ctx);

  if ((r = uv_thread_create(&loop->cf_thread, uv__cf_loop_runner, loop)))
    return r;

  /* Synchronize threads */
  uv_sem_wait(&loop->cf_sem);
  assert(ACCESS_ONCE(CFRunLoopRef, loop->cf_loop) != NULL);

  return 0;
}


void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__fsevents_loop_delete(loop);
  ngx_queue_t* item;
  uv__cf_loop_signal_t* s;

  assert(loop->cf_loop != NULL);
  uv__cf_loop_signal(loop, NULL, NULL);
  uv_thread_join(&loop->cf_thread);

  uv_sem_destroy(&loop->cf_sem);
  uv_mutex_destroy(&loop->cf_mutex);

  /* Free any remaining data */
  while (!ngx_queue_empty(&loop->cf_signals)) {
    item = ngx_queue_head(&loop->cf_signals);

    s = ngx_queue_data(item, uv__cf_loop_signal_t, member);

    ngx_queue_remove(item);
    free(s);
  }
}


static void uv__cf_loop_runner(void* arg) {
  uv_loop_t* loop;

  loop = arg;

  /* Get thread's loop */
  ACCESS_ONCE(CFRunLoopRef, loop->cf_loop) = CFRunLoopGetCurrent();

  CFRunLoopAddSource(loop->cf_loop,
                     loop->cf_cb,
                     kCFRunLoopDefaultMode);

  uv_sem_post(&loop->cf_sem);

  CFRunLoopRun();

  CFRunLoopRemoveSource(loop->cf_loop,
                        loop->cf_cb,
                        kCFRunLoopDefaultMode);
}


static void uv__cf_loop_cb(void* arg) {
  uv_loop_t* loop;
  ngx_queue_t* item;
  ngx_queue_t split_head;
  uv__cf_loop_signal_t* s;

  loop = arg;

  uv_mutex_lock(&loop->cf_mutex);
  ngx_queue_init(&split_head);
  if (!ngx_queue_empty(&loop->cf_signals)) {
    ngx_queue_t* split_pos = ngx_queue_next(&loop->cf_signals);
    ngx_queue_split(&loop->cf_signals, split_pos, &split_head);
  }
  uv_mutex_unlock(&loop->cf_mutex);

  while (!ngx_queue_empty(&split_head)) {
    item = ngx_queue_head(&split_head);

    s = ngx_queue_data(item, uv__cf_loop_signal_t, member);

    /* This was a termination signal */
    if (s->cb == NULL)
      CFRunLoopStop(loop->cf_loop);
    else
      s->cb(s->arg);

    ngx_queue_remove(item);
    free(s);
  }
}


void uv__cf_loop_signal(uv_loop_t* loop, cf_loop_signal_cb cb, void* arg) {
  uv__cf_loop_signal_t* item;

  item = malloc(sizeof(*item));
  /* XXX: Fail */
  if (item == NULL)
    abort();

  item->arg = arg;
  item->cb = cb;

  uv_mutex_lock(&loop->cf_mutex);
  ngx_queue_insert_tail(&loop->cf_signals, &item->member);
  uv_mutex_unlock(&loop->cf_mutex);

  assert(loop->cf_loop != NULL);
  CFRunLoopSourceSignal(loop->cf_cb);
  CFRunLoopWakeUp(loop->cf_loop);
}
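For orientation, here is a minimal, hypothetical sketch of how a caller such as src/unix/fsevents.c would hand work to the CF thread through uv__cf_loop_signal(). The function and field names below are illustrative only and are not part of this diff; the one documented fact it relies on is that the queued callback is later invoked on the CF thread from uv__cf_loop_cb() above.

/* Illustrative sketch only, not part of this commit. */
static void uv__sketch_start_stream(void* arg) {
  uv_fs_event_t* handle = arg;  /* executes on the CF run loop thread */
  /* ...create, schedule and start an FSEventStream for `handle` here... */
  (void) handle;
}

static void uv__sketch_watch(uv_fs_event_t* handle) {
  /* Called from the event loop thread: defer the work to the CF thread. */
  uv__cf_loop_signal(handle->loop, uv__sketch_start_stream, handle);
}

Passing cb == NULL is the termination signal: uv__cf_loop_cb() then calls CFRunLoopStop(), which is how uv__platform_loop_delete() shuts the CF thread down.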


[Diffs for the remaining changed files are not shown.]

20 comments on commit 38df93c

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

What stability issues? I mean, please paste unfixed bugs here.

@bnoordhuis (Contributor, Author) commented:

The hang that was reported in nodejs/node-v0.x-archive#6296. I know you wrote a patch for that but it doesn't look like it landed in v0.10.

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

It'll be released, I guess /cc @tjfontaine.

So, besides that already-fixed issue and a failure on a not-yet-released OS X, there are no reported problems that are actually causing trouble for real people.

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

And there's a problem that was fixed by the commits being reverted here, one that appeared to affect quite a notable number of people.

@bnoordhuis (Contributor, Author) commented:

No, what I mean is that it's not there in the v0.10 branch of libuv. For that matter, I don't see it in the master branch either. The commit you link to in the PR is one of mine. Did you actually land it?

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

@bnoordhuis I'm afraid I forgot to push it, but it won't work after reverting this patch anyway.

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

So, should I push it to master, or will you revert this revert?

@bnoordhuis (Contributor, Author) commented:

Let's cook it in master for a little while more.

@indutny (Contributor) commented on 38df93c, Oct 5, 2013:

Done in 429bb80.

@n1mmy commented on 38df93c, Oct 14, 2013:

Hi folks,

Even with this patch, we're still seeing stability issues with FSEvents and fs.watch.

When we released an upgrade using Node 0.10.20, several users started getting the error:

2013-10-10 20:52 node[15111] (CarbonCore.framework) FSEventStreamStart: register_with_server: ERROR: f2d_register_rpc() => (null) (-21)

as in nodejs/node-v0.x-archive#5463, along with the program exiting with "Abort trap 6". Sometimes the exit was immediate, sometimes it would take a few hours.

With Node 0.10.20, they would get one error print of this message. With 0.10.20 built with this patch to libuv, they would get 50 or so prints (one per fs.watch), but it would still crash.

Unfortunately, I have been unable to replicate this on my own machine and do not have an exact recipe for reproducing it. I do know that it occurs on at least OS X 10.8.5 and 10.8.3, and that the errors occur with Node 0.10.10, 0.10.17, 0.10.20, and 0.10.20+38df93cf.

We worked around this by simply no longer using fs.watch, and polling directory contents instead. This workaround is fine for us, but I thought I should give you a heads up that there still may be issues here. I know reports like this without reproductions aren't very actionable, but I'm not sure how to get any more useful information, and figured some info was better than none.

Thanks,
-- Nick

@bnoordhuis (Contributor, Author) commented:

@n1mmy One of the patches that got reverted addresses that but unfortunately introduced other issues. Thanks for the heads up though.

One thing that could help us is a gdb backtrace of the abort; have your users enable core dumps with ulimit -c unlimited and run node until they hit that 'Abort trap 6' error, then open gdb with gdb path/to/node /cores/<corefile> and run thread apply all backtrace full.

@n1mmy commented on 38df93c, Oct 15, 2013:

Hi @bnoordhuis. Thanks for the quick response!

I'll try and get a core and stack trace from an affected user. It's a little tough, as many of them don't have Xcode and gdb installed, but I'll ask around.

If you're referring to 684e212 fixing the FSEventStreamStart: register_with_server error, I think it may be a different problem with the same error message. Like I said, we still see that error message in 0.10.20 which includes 684e212. The difference is that the error only occurs once in 0.10.20, instead of N times both before 684e212 was introduced and after it was reverted. This is not consistent with the error being caused by watching more than 450 files, as described in nodejs/node-v0.x-archive#5463.

@n1mmy commented on 38df93c, Oct 15, 2013:

FYI, here's a link to the issue where I ask for stack traces: meteor/meteor#1483 (comment)

@indutny (Contributor) commented:

Hoya!

Most likely the abort() is happening on this line: https://github.com/joyent/libuv/blob/master/src/unix/fsevents.c#L320 /cc @bnoordhuis.

@n1mmy, could it be that you're running a lot of Node.js instances, each of which is fs.watch()ing directories? Previously, I've observed that on OS X you can only have a certain, limited number of watchers (451 on my machine; the number differed on other people's).
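For context, the failure being pointed at boils down to FSEventStreamStart() returning false (which is where the "FSEventStreamStart: register_with_server ... f2d_register_rpc" message comes from) and the code path then aborting. A rough sketch of that pattern, using the CoreServices FSEvents API; this is not the actual fsevents.c source, and the function name is made up:

/* Rough sketch of the failure pattern under discussion; not src/unix/fsevents.c. */
#include <CoreServices/CoreServices.h>
#include <stdlib.h>

static void sketch_start_stream(FSEventStreamRef stream, CFRunLoopRef cf_loop) {
  FSEventStreamScheduleWithRunLoop(stream, cf_loop, kCFRunLoopDefaultMode);

  if (!FSEventStreamStart(stream)) {
    /* Start() returns false when registration with the FSEvents service
     * fails; the code path discussed above apparently just abort()s here. */
    abort();
  }
}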

@indutny (Contributor) commented:

@bnoordhuis Let's try resurrecting the old directory-polling engine as a fallback to FSEvents. According to the documentation, FSEvents may fail at any time (and for no stated reason), and it recommends falling back to polling the directory in that case.
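For illustration, the core of the stat-based polling idea described here could look roughly like the sketch below. This is only the general idea, with made-up names and no timer or libuv plumbing; it is not libuv's actual polling engine:

/* Hypothetical polling sketch: periodically stat() the watched directory and
 * report a change whenever its modification time moves. */
#include <sys/stat.h>
#include <time.h>

typedef struct {
  const char* path;   /* watched directory */
  time_t last_mtime;  /* mtime seen on the previous poll */
} dir_poll_t;

/* Call from a repeating timer; returns 1 when a change is detected. */
static int dir_poll_check(dir_poll_t* p) {
  struct stat st;

  if (stat(p->path, &st) != 0)
    return 0;  /* treat stat() errors as "no change" in this sketch */

  if (st.st_mtime != p->last_mtime) {
    p->last_mtime = st.st_mtime;
    return 1;
  }
  return 0;
}

Matching FSEvents semantics (per-file events, rename detection, recursion) would of course need more state than a single mtime, which is what the next comment is getting at.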

@bnoordhuis (Contributor, Author) commented:

@n1mmy Thanks, cheers.

@indutny I guess I can live with that. The fallback should have the exact same semantics as FSEvents, though, or as close as humanly possible. Anything else is just confusing for users.

@iwehrman commented:

I'm seeing hangs (but not crashes) in calls to FSWatcher.close() on OS X 10.8, which seem like they might be related to this issue. There is a small test case here: https://gist.github.com/iwehrman/7013888
If I create a very small number of watchers (like 2 or 3), then everything works fine, but if I increase it to 5, the calls to close() start hanging. The hang reproduces on node 0.10.18, 0.10.20, and 0.11.7, but not on 0.8.20.

Do you think this is the same issue? And, in either case, would it be worthwhile to file a separate bug about this?
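For reference, the scenario in the gist translates roughly into the stand-alone libuv program below, written against the v0.10-era C API. The watched path, watcher count, and the immediate close are placeholders, and whether the hang actually reproduces will depend on timing; treat this as a sketch, not the gist itself.

/* Hypothetical repro sketch: start a handful of uv_fs_event_t watchers,
 * then close them all and run the loop until the close callbacks fire. */
#include <stdio.h>
#include <uv.h>

#define NWATCHERS 5

static void on_event(uv_fs_event_t* handle,
                     const char* filename,
                     int events,
                     int status) {
  (void) handle; (void) filename; (void) events; (void) status;
}

static void on_close(uv_handle_t* handle) {
  printf("closed watcher %p\n", (void*) handle);
}

int main(void) {
  uv_fs_event_t watchers[NWATCHERS];
  int i;

  for (i = 0; i < NWATCHERS; i++)
    uv_fs_event_init(uv_default_loop(), &watchers[i], "/tmp", on_event, 0);

  for (i = 0; i < NWATCHERS; i++)
    uv_close((uv_handle_t*) &watchers[i], on_close);

  uv_run(uv_default_loop(), UV_RUN_DEFAULT);
  return 0;
}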

@iwehrman commented:

I also can't reproduce this with node 0.10.17, so the problem likely began with libuv 0.10.15 and node 0.10.18.

@bnoordhuis (Contributor, Author) commented:

@iwehrman The revert hasn't made it into a stable release yet. It'll be part of v0.10.21 which I expect will be released some time next week.

@jagill commented on 38df93c, Nov 2, 2013:

@bnoordhuis I'm a Meteor user who's getting this pretty consistently, but running the gdb command on the core dump pretty much freezes my computer (at least, after an hour it had given me no more dots, and I restarted). I have some core dumps that I'll maybe try to process overnight.

The way I reproduce it is to have a Meteor project running (with some reasonably large number of files) and then do an rm -rf on one of the directories. I'm using Meteor 0.6.6.0 (which uses node v0.10.20) and I'm on OS X 10.8.5, build 12F45. I've posted this info on the Meteor bug as well.

We also have a command-line daemon that watches files and occasionally exits with Abort trap 6 as well. I haven't gotten a core dump of this yet, but I will try to reproduce it and get one. In that case, I'm using node v0.8.21.

Also, I'm in the Bay Area, if one of your crew is in SF and wants to see it in action.

Edit:

jag@coati:~/tmp/closureTest$ uname -a
Darwin coati.local 12.5.0 Darwin Kernel Version 12.5.0: Sun Sep 29 13:33:47 PDT 2013; root:xnu-2050.48.12~1/RELEASE_X86_64 x86_64
