Bug #14181
closedhangs or deadlocks from waitpid, threads, and trapping SIGCHLD
Description
I'm not exactly sure what's going on here, but the end result is basically a thread is getting killed unexpectedly during a waitpid call, when SIGCHLD is being handled. In a more complex scenario, we end up hanging because Thread#join is ends up waiting on a thread that's already dead (presumably because it died in a non-standard way), or in a simpler scenario, the output is:
loop 250
loop 251
/usr/lib/ruby/2.4.0/timeout.rb:97:in `join': No live threads left. Deadlock? (fatal)
1 threads, 1 sleeps current:0x00000000019205e0 main thread:0x00000000019205e0
* #<Thread:0x0000000001955e38 sleep_forever>
rb_thread_t:0x00000000019205e0 native:0x00007f900a082700 int:0
/usr/lib/ruby/2.4.0/timeout.rb:97:in `join'
/usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout'
/usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout'
/usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch'
/usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
/usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
/usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout'
./test.rb:11:in `<main>'
from /usr/lib/ruby/2.4.0/timeout.rb:97:in `ensure in block in timeout'
from /usr/lib/ruby/2.4.0/timeout.rb:97:in `block in timeout'
from /usr/lib/ruby/2.4.0/timeout.rb:33:in `block in catch'
from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
from /usr/lib/ruby/2.4.0/timeout.rb:33:in `catch'
from /usr/lib/ruby/2.4.0/timeout.rb:108:in `timeout'
from ./test.rb:11:in `<main>'
The simpler repro, where I'm obviously not doing anything I shouldn't be doing in the signal handler:
#!/usr/bin/env ruby
require 'timeout'
trap(:CHLD) { }
x = 0
while true
puts "loop #{x += 1}"
pid = Process.spawn('sleep 1')
Timeout.timeout(30) do
Process.waitpid(pid)
end
end
A slightly more complex repro that I'm still pretty sure what I'm doing in the signal handler is okay, but ends up hanging:
#!/usr/bin/env ruby
require 'timeout'
self_pipe = IO.pipe
signal_queue = []
def wake_up(self_pipe)
self_pipe[1].write_nonblock('.', exception: false)
end
trap(:CHLD) { signal_queue << :CHLD; wake_up(self_pipe) }
signal_processor = Thread.new do
loop do
self_pipe[0].read(1)
signal_queue.pop
end
end
x = 0
while true
puts "loop #{x += 1}"
pid = Process.spawn('sleep 1')
Timeout.timeout(30) do
Process.waitpid(pid)
end
end
In either case, it can take many loops before it fails, up to a few hundred. I've reproed on both Ubuntu Xenial, and macOS 10.12.6 (the former with ruby 2.4.2, the latter with ruby 2.4.1).
Updated by normalperson (Eric Wong) about 7 years ago
Thanks. It seems the culprit is Timeout using Thread#join.
Our deadlock checking code might be buggy...
I write similar process management code all the time without
Timeout and never have this problem, so I'll investigate
(but no promises, it's tricky code)
Fwiw, I use waitpid(..., WNOHANG) in a loop until ECHLD, and
use select/IO#wait_*able + self-pipe to timeout). Looping
(with WNOHANG) is important, since SIGCHLD interrupts can be
merged.
Updated by normalperson (Eric Wong) about 7 years ago
cody@cutrer.us wrote:
pid = Process.spawn('sleep 1')
Btw, much faster to reproduce the problem with either
script using "true" instead of "sleep 1"
(And I'm falling asleep and unable to continue working on this tonight :<)
Updated by nobu (Nobuyoshi Nakada) about 7 years ago
It seems the signal trap causes thread switching then Process.waitpid
exits.
That means the target thread status can change during RUBY_VM_CHECK_INTS_BLOCKING
, but sleep_forever
doesn't consider the condition to wait at that moment.
diff --git a/thread.c b/thread.c
index baa50ea388..cc62ea3905 100644
--- a/thread.c
+++ b/thread.c
@@ -883,7 +883,13 @@ thread_join_sleep(VALUE arg)
while (target_th->status != THREAD_KILLED) {
if (forever) {
- sleep_forever(th, TRUE, FALSE);
+ th->status = THREAD_STOPPED_FOREVER;
+ th->vm->sleeper++;
+ rb_check_deadlock(th->vm);
+ native_sleep(th, 0);
+ th->vm->sleeper--;
+ RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
+ th->status = THREAD_RUNNABLE;
}
else {
double now = timeofday();
Updated by normalperson (Eric Wong) about 7 years ago
nobu@ruby-lang.org wrote:
It seems the signal trap causes thread switching then
Process.waitpid
exits.
That means the target thread status can change duringRUBY_VM_CHECK_INTS_BLOCKING
, butsleep_forever
doesn't consider the condition to wait at that moment.
Right, that seems correct. I was considering passing target_th
to sleep_forever last night to check target_th->status inside
sleep_forever; but I think sleep_forever is too complex already.
I guess the sleep_wait_for_interrupt path when !forever has the
same problem and might sleep too long..
Updated by nobu (Nobuyoshi Nakada) about 7 years ago
- Status changed from Open to Closed
Applied in changeset trunk|r61274.
thread.c: fix deadlock
- thread.c (thread_join_sleep): the target thread may exit during
RUBY_VM_CHECK_INTS_BLOCKING
, butsleep_forever
does not
consider the condition change to wait.
[ruby-core:84248] [Bug #14181]
Updated by nobu (Nobuyoshi Nakada) about 7 years ago
normalperson (Eric Wong) wrote:
I guess the sleep_wait_for_interrupt path when !forever has the
same problem and might sleep too long..
How about this patch?
diff --git i/thread.c w/thread.c
index cc62ea3905..138c26cb09 100644
--- i/thread.c
+++ w/thread.c
@@ -90,9 +90,13 @@ static VALUE sym_on_blocking;
static VALUE sym_never;
static ID id_locals;
-static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check);
-static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check);
-static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check);
+typedef int (*wakeup_cond_func)(void *);
+static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check,
+ wakeup_cond_func wake, void *warg);
+static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check,
+ wakeup_cond_func wake, void *warg);
+static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check,
+ wakeup_cond_func wake, void *warg);
static void rb_thread_sleep_deadly_allow_spurious_wakeup(void);
static double timeofday(void);
static int rb_threadptr_dead(rb_thread_t *th);
@@ -873,6 +877,13 @@ remove_from_join_list(VALUE arg)
return Qnil;
}
+static int
+wake_join(void *arg)
+{
+ rb_thread_t *target_th = arg;
+ return target_th->status == THREAD_KILLED;
+}
+
static VALUE
thread_join_sleep(VALUE arg)
{
@@ -883,13 +894,7 @@ thread_join_sleep(VALUE arg)
while (target_th->status != THREAD_KILLED) {
if (forever) {
- th->status = THREAD_STOPPED_FOREVER;
- th->vm->sleeper++;
- rb_check_deadlock(th->vm);
- native_sleep(th, 0);
- th->vm->sleeper--;
- RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
- th->status = THREAD_RUNNABLE;
+ sleep_forever(th, TRUE, FALSE, wake_join, target_th);
}
else {
double now = timeofday();
@@ -898,7 +903,7 @@ thread_join_sleep(VALUE arg)
thread_id_str(target_th));
return Qfalse;
}
- sleep_wait_for_interrupt(th, limit - now, 0);
+ sleep_wait_for_interrupt(th, limit - now, 0, wake_join, target_th);
}
thread_debug("thread_join: interrupted (thid: %"PRI_THREAD_ID", status: %s)\n",
thread_id_str(target_th), thread_status_name(target_th, TRUE));
@@ -1094,14 +1099,15 @@ double2timeval(double d)
}
static void
-sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check)
+sleep_forever(rb_thread_t *th, int deadlockable, int spurious_check,
+ wakeup_cond_func wake, void *warg)
{
enum rb_thread_status prev_status = th->status;
enum rb_thread_status status = deadlockable ? THREAD_STOPPED_FOREVER : THREAD_STOPPED;
th->status = status;
RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
- while (th->status == status) {
+ while (!(wake && wake(warg)) && th->status == status) {
if (deadlockable) {
th->vm->sleeper++;
rb_check_deadlock(th->vm);
@@ -1135,7 +1141,8 @@ getclockofday(struct timeval *tp)
}
static void
-sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
+sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check,
+ wakeup_cond_func wake, void *warg)
{
struct timeval to, tvn;
enum rb_thread_status prev_status = th->status;
@@ -1156,7 +1163,7 @@ sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
th->status = THREAD_STOPPED;
RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
- while (th->status == THREAD_STOPPED) {
+ while (!(wake && wake(warg)) && th->status == THREAD_STOPPED) {
native_sleep(th, &tv);
RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
getclockofday(&tvn);
@@ -1180,21 +1187,21 @@ void
rb_thread_sleep_forever(void)
{
thread_debug("rb_thread_sleep_forever\n");
- sleep_forever(GET_THREAD(), FALSE, TRUE);
+ sleep_forever(GET_THREAD(), FALSE, TRUE, NULL, NULL);
}
void
rb_thread_sleep_deadly(void)
{
thread_debug("rb_thread_sleep_deadly\n");
- sleep_forever(GET_THREAD(), TRUE, TRUE);
+ sleep_forever(GET_THREAD(), TRUE, TRUE, NULL, NULL);
}
static void
rb_thread_sleep_deadly_allow_spurious_wakeup(void)
{
thread_debug("rb_thread_sleep_deadly_allow_spurious_wakeup\n");
- sleep_forever(GET_THREAD(), TRUE, FALSE);
+ sleep_forever(GET_THREAD(), TRUE, FALSE, NULL, NULL);
}
static double
@@ -1216,16 +1223,17 @@ timeofday(void)
}
static void
-sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check)
+sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check,
+ wakeup_cond_func wake, void *warg)
{
- sleep_timeval(th, double2timeval(sleepsec), spurious_check);
+ sleep_timeval(th, double2timeval(sleepsec), spurious_check, wake, warg);
}
void
rb_thread_wait_for(struct timeval time)
{
rb_thread_t *th = GET_THREAD();
- sleep_timeval(th, time, 1);
+ sleep_timeval(th, time, 1, NULL, NULL);
}
/*
diff --git i/thread_sync.c w/thread_sync.c
index 6eff5e759c..ca74fe5866 100644
--- i/thread_sync.c
+++ w/thread_sync.c
@@ -428,7 +428,7 @@ static VALUE
rb_mutex_wait_for(VALUE time)
{
struct timeval *t = (struct timeval *)time;
- sleep_timeval(GET_THREAD(), *t, 0); /* permit spurious check */
+ sleep_timeval(GET_THREAD(), *t, 0, NULL, NULL); /* permit spurious check */
return Qnil;
}
Updated by normalperson (Eric Wong) about 7 years ago
nobu@ruby-lang.org wrote:
normalperson (Eric Wong) wrote:
I guess the sleep_wait_for_interrupt path when !forever has the
same problem and might sleep too long..How about this patch?
That might be correct, but I don't like making the sleep_*
functions too complex and probably prefer them open-coded like
you did in r61274.
My mind already has trouble understanding the logic from
spurious_check/deadlockable on existing implementations :x
Updated by normalperson (Eric Wong) about 7 years ago
Eric Wong normalperson@yhbt.net wrote:
That might be correct, but I don't like making the sleep_*
functions too complex and probably prefer them open-coded like
you did in r61274.
Something like below (maybe more cleanups possible,
timeval is tedious to use...):
diff --git a/test/ruby/test_thread.rb b/test/ruby/test_thread.rb
index 5f64a08155..3695f6e4ea 100644
--- a/test/ruby/test_thread.rb
+++ b/test/ruby/test_thread.rb
@@ -1284,6 +1284,20 @@ def test_signal_at_join
end
end
end
+ n.times do
+ w = Thread.start { sleep 30 }
+ begin
+ f.puts
+ f.gets
+ ensure
+ w.kill
+ t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+ w.join(30)
+ t1 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+ diff = t1 - t0
+ assert_operator diff, :<=, 2
+ end
+ end
end
};
end
diff --git a/thread.c b/thread.c
index cc62ea3905..1cdf119da1 100644
--- a/thread.c
+++ b/thread.c
@@ -91,7 +91,6 @@ static VALUE sym_never;
static ID id_locals;
static void sleep_timeval(rb_thread_t *th, struct timeval time, int spurious_check);
-static void sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check);
static void sleep_forever(rb_thread_t *th, int nodeadlock, int spurious_check);
static void rb_thread_sleep_deadly_allow_spurious_wakeup(void);
static double timeofday(void);
@@ -888,18 +887,22 @@ thread_join_sleep(VALUE arg)
rb_check_deadlock(th->vm);
native_sleep(th, 0);
th->vm->sleeper--;
- RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
- th->status = THREAD_RUNNABLE;
}
else {
double now = timeofday();
+ struct timeval tv;
+
if (now > limit) {
thread_debug("thread_join: timeout (thid: %"PRI_THREAD_ID")\n",
thread_id_str(target_th));
return Qfalse;
}
- sleep_wait_for_interrupt(th, limit - now, 0);
+ tv = double2timeval(limit - now);
+ th->status = THREAD_STOPPED;
+ native_sleep(th, &tv);
}
+ RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
+ th->status = THREAD_RUNNABLE;
thread_debug("thread_join: interrupted (thid: %"PRI_THREAD_ID", status: %s)\n",
thread_id_str(target_th), thread_status_name(target_th, TRUE));
}
@@ -1135,41 +1138,57 @@ getclockofday(struct timeval *tp)
}
static void
-sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
+timeval_add(struct timeval *dst, const struct timeval *tv)
{
- struct timeval to, tvn;
- enum rb_thread_status prev_status = th->status;
-
- getclockofday(&to);
- if (TIMEVAL_SEC_MAX - tv.tv_sec < to.tv_sec)
- to.tv_sec = TIMEVAL_SEC_MAX;
+ if (TIMEVAL_SEC_MAX - tv->tv_sec < dst->tv_sec)
+ dst->tv_sec = TIMEVAL_SEC_MAX;
else
- to.tv_sec += tv.tv_sec;
- if ((to.tv_usec += tv.tv_usec) >= 1000000) {
- if (to.tv_sec == TIMEVAL_SEC_MAX)
- to.tv_usec = 999999;
+ dst->tv_sec += tv->tv_sec;
+ if ((dst->tv_usec += tv->tv_usec) >= 1000000) {
+ if (dst->tv_sec == TIMEVAL_SEC_MAX)
+ dst->tv_usec = 999999;
else {
- to.tv_sec++;
- to.tv_usec -= 1000000;
+ dst->tv_sec++;
+ dst->tv_usec -= 1000000;
}
}
+}
+
+static int
+timeval_update_expire(struct timeval *tv, const struct timeval *to)
+{
+ struct timeval tvn;
+
+ getclockofday(&tvn);
+ if (to->tv_sec < tvn.tv_sec) return 1;
+ if (to->tv_sec == tvn.tv_sec && to->tv_usec <= tvn.tv_usec) return 1;
+ thread_debug("timeval_update_expire: "
+ "%"PRI_TIMET_PREFIX"d.%.6ld > %"PRI_TIMET_PREFIX"d.%.6ld\n",
+ (time_t)to->tv_sec, (long)to->tv_usec,
+ (time_t)tvn.tv_sec, (long)tvn.tv_usec);
+ tv->tv_sec = to->tv_sec - tvn.tv_sec;
+ if ((tv->tv_usec = to->tv_usec - tvn.tv_usec) < 0) {
+ --tv->tv_sec;
+ tv->tv_usec += 1000000;
+ }
+ return 0;
+}
+static void
+sleep_timeval(rb_thread_t *th, struct timeval tv, int spurious_check)
+{
+ struct timeval to;
+ enum rb_thread_status prev_status = th->status;
+
+ getclockofday(&to);
+ timeval_add(&to, &tv);
th->status = THREAD_STOPPED;
RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
while (th->status == THREAD_STOPPED) {
native_sleep(th, &tv);
RUBY_VM_CHECK_INTS_BLOCKING(th->ec);
- getclockofday(&tvn);
- if (to.tv_sec < tvn.tv_sec) break;
- if (to.tv_sec == tvn.tv_sec && to.tv_usec <= tvn.tv_usec) break;
- thread_debug("sleep_timeval: %"PRI_TIMET_PREFIX"d.%.6ld > %"PRI_TIMET_PREFIX"d.%.6ld\n",
- (time_t)to.tv_sec, (long)to.tv_usec,
- (time_t)tvn.tv_sec, (long)tvn.tv_usec);
- tv.tv_sec = to.tv_sec - tvn.tv_sec;
- if ((tv.tv_usec = to.tv_usec - tvn.tv_usec) < 0) {
- --tv.tv_sec;
- tv.tv_usec += 1000000;
- }
+ if (timeval_update_expire(&tv, &to))
+ break;
if (!spurious_check)
break;
}
@@ -1215,12 +1234,6 @@ timeofday(void)
}
}
-static void
-sleep_wait_for_interrupt(rb_thread_t *th, double sleepsec, int spurious_check)
-{
- sleep_timeval(th, double2timeval(sleepsec), spurious_check);
-}
-
void
rb_thread_wait_for(struct timeval time)
{
Updated by nobu (Nobuyoshi Nakada) about 7 years ago
LGTM, and timeval_add()
and timeval_update_expire()
seem to worth extracting first.
Updated by normalperson (Eric Wong) about 7 years ago
nobu@ruby-lang.org wrote:
LGTM, and
timeval_add()
andtimeval_update_expire()
seem
to worth extracting first.
Thanks for the review and you're right; they're not needed in
the final patch to fix the problem, I did not need to do timeval
arithmetic in thread_join_sleep after all.
Anyways, I split them into r61301 and r61302.
For 2.6; maybe we consider moving everything to timespec and
avoiding double/timeval for internal APIs