Bug #20629
closedRuby infinite loop when raising+rescuing inside a RUBY_INTERNAL_EVENT_NEWOBJ
Description
Background¶
Hey! While working on the Datadog Ruby Profiler I ran into a weird situation where the Ruby VM locks up as it gets stuck in an infinite loop in vm_sync.c:
	        while (recorded_lock_rec < current_lock_rec) {
	            RB_VM_LOCK_LEAVE_LEV(¤t_lock_rec);
	        }
...because leave is this:
	static inline void
	rb_vm_lock_leave(unsigned int *lev, const char *file, int line)
	{
	    if (rb_multi_ractor_p()) {
	        rb_vm_lock_leave_body(lev APPEND_LOCATION_PARAMS);
	    }
	}
...and rb_multi_ractor_p() is always false as I'm not using Ractors.
How to reproduce¶
(I include a zip with all the files needed to reproduce this, no need to copy-paste from here!)
Inside a native extension, I'm creating a RUBY_INTERNAL_EVENT_NEWOBJ. Inside it, I'm raising and catching an exception, to simulate a (handled) bug in our code:
#include <ruby.h>
#include <ruby/thread.h>
#include <ruby/thread_native.h>
#include <ruby/debug.h>
static VALUE raise_exception(VALUE unused) {
  fprintf(stderr, "Raising!\n");
  rb_raise(rb_eRuntimeError, "This is a test!");
}
static VALUE ignore_failure(VALUE self_instance, VALUE exception) {
  fprintf(stderr, "Caught!\n");
  return Qnil;
}
static void on_newobj_event(VALUE tracepoint_data, void *unused) {
  fprintf(stderr, "Inside on_newobj_event\n");
  // VALUE klass = rb_class_of(rb_tracearg_object(rb_tracearg_from_tracepoint(tracepoint_data)));
  // rb_p(klass);
  rb_rescue2(
    raise_exception,
    Qnil,
    ignore_failure,
    Qnil,
    rb_eException, // rb_eException is the base class of all Ruby exceptions
    0 // Required by API to be the last argument
  );
  fprintf(stderr, "Finished on_newobj_event\n");
}
static VALUE install_tracepoint(VALUE unused) {
  VALUE tp = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */);
  rb_tracepoint_enable(tp);
  fprintf(stderr, "Tracepoint installed!\n");
  return Qnil;
}
void Init_exception_bug_repro(void) {
  rb_define_singleton_method(rb_define_module("Repro"), "install_tracepoint", install_tracepoint, 0);
}
And my test Ruby script is:
require_relative './exception_bug_repro'
Repro.install_tracepoint
while true
  begin
    raise('hello')
  rescue
    print '.'
  end
end
Here's how running it looks:
$ bundle exec rake clean compile
$ bundle exec ruby testcase.rb
Tracepoint installed!
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
# <--- Infinite loop
That is, we should always see the raise -> caught, but sometimes the raise doesn't go very well, and from then Ruby eventually locks up.
Here's the full backtrace from gdb:
#0  rb_multi_ractor_p () at ruby-3_3/vm_sync.h:39
#1  0x00005fdde2534665 in rb_vm_lock_leave (lev=0x7ffc2f1b5f70, file=0x5fdde2783bb5 "vm_sync.c", line=231) at ruby-3_3/vm_sync.h:92
#2  0x00005fdde2534b28 in rb_ec_vm_lock_rec_release (ec=0x5fdde3e9b570, recorded_lock_rec=0, current_lock_rec=1) at vm_sync.c:231
#3  0x00005fdde23088f2 in rb_ec_vm_lock_rec_check (ec=0x5fdde3e9b570, recorded_lock_rec=0) at ruby-3_3/eval_intern.h:144
#4  0x00005fdde2308937 in rb_ec_tag_state (ec=0x5fdde3e9b570) at ruby-3_3/eval_intern.h:155
#5  0x00005fdde230c71e in setup_exception (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:564
#6  0x00005fdde230cdc5 in rb_longjmp (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:660
#7  0x00005fdde230d398 in rb_raise_jump (mesg=126196681963080, cause=36) at eval.c:853
#8  0x00005fdde230d0cc in rb_f_raise (argc=1, argv=0x72c690294048) at eval.c:752
#9  0x00005fdde230d0f4 in f_raise (c=1, v=0x72c690294048, _=126197147684320) at eval.c:789
#10 0x00005fdde2502d24 in ractor_safe_call_cfunc_m1 (recv=126197147684320, argc=1, argv=0x72c690294048, func=0x5fdde230d0cc <f_raise>) at ruby-3_3/vm_insnhelper.c:3311
#11 0x00005fdde250395b in vm_call_cfunc_with_frame_ (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0, argc=1, argv=0x72c690294048, stack_bottom=0x72c690294040)
    at ruby-3_3/vm_insnhelper.c:3502
#12 0x00005fdde2503b30 in vm_call_cfunc_with_frame (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3530
#13 0x00005fdde2503c48 in vm_call_cfunc_other (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3556
#14 0x00005fdde2504070 in vm_call_cfunc (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3638
#15 0x00005fdde2506b9f in vm_call_method_each_type (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4429
#16 0x00005fdde25076e0 in vm_call_method (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4566
#17 0x00005fdde250784d in vm_call_general (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4599
#18 0x00005fdde2509ed2 in vm_sendish (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, cd=0x5fdde3f4f7e0, block_handler=0, method_explorer=mexp_search_method)
    at ruby-3_3/vm_insnhelper.c:5593
#19 0x00005fdde25114b9 in vm_exec_core (ec=0x5fdde3e9b570) at ruby-3_3/insns.def:834
#20 0x00005fdde252829b in rb_vm_exec (ec=0x5fdde3e9b570) at vm.c:2486
#21 0x00005fdde25290fa in rb_iseq_eval_main (iseq=0x72c674667ad0) at vm.c:2752
#22 0x00005fdde230be59 in rb_ec_exec_node (ec=0x5fdde3e9b570, n=0x72c674667ad0) at eval.c:287
#23 0x00005fdde230bfd6 in ruby_run_node (n=0x72c674667ad0) at eval.c:328
#24 0x00005fdde2307245 in rb_main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:39
#25 0x00005fdde23072bd in main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:58
In this minimal reproducer, usually it's when allocating a Thread::Backtrace object that triggers the issue, but on the original codebase where I was experimenting I saw this issue even with other objects (I think??).
To be honest, maybe the problem is on the extension side -- this code assumes it's ok to raise/rescue in the RUBY_INTERNAL_EVENT_NEWOBJ tracepoint as it usually works fine, but maybe it's not?
Files
        
           Updated by alanwu (Alan Wu) over 1 year ago
          Updated by alanwu (Alan Wu) over 1 year ago
          
          
        
        
      
      - Status changed from Open to Closed
this code assumes it's ok to raise/rescue in the RUBY_INTERNAL_EVENT_NEWOBJ tracepoint as it usually works fine, but maybe it's not?
Right, it is not. The documentation for rb_tracepoint_new() is clear about this:
You can use any Ruby APIs (calling methods and so on) on normal event hooks.
However, in internal events, you can not use any Ruby APIs (even object
creations). This is why we can't specify internal events by TracePoint
directly.
        
           Updated by ivoanjo (Ivo Anjo) over 1 year ago
          Updated by ivoanjo (Ivo Anjo) over 1 year ago
          
          
        
        
      
      Thanks for pointing me in the right direction! I had completely forgotten that bit of info 😅. I guess I had to spend a few hours learning it the hard way.