Feature #12463
closedruby lacks plus-plus
Description
From c0f0d0c5f5c58d56fec95ca4303a0f5db5b54d56 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" shyouhei@ruby-lang.org
Date: Mon, 6 Jun 2016 17:07:11 +0900
Subject: [PATCH 1/1] ruby lacks plus-plus operation.
This commit adds opt_plusplus VM instruction. Before this, a ruby
snippet foo += 1
was converted to up to 4 VM instructions. By
unifying these instructions into one, we can avoid unnecessary stack
manipulations and can issue dedicated machine instruction, if any.
This speeds up things. A set of microbenchmark results shows that it
gains up to 180% boost on a very tight loop.
raw data:
[["loop_whileloop",
[[0.624962, 0.579907, 0.562363, 0.579649, 0.589442, 0.595911, 0.583657],
[0.317089, 0.337159, 0.311376, 0.311721, 0.317585, 0.310999, 0.313107]]],
["loop_whileloop2",
[[0.129038, 0.137306, 0.13944, 0.121268, 0.120862, 0.121065, 0.135851],
[0.072271, 0.074998, 0.080031, 0.078602, 0.07527, 0.07627, 0.072029]]]]
Elapsed time: 7.775304 (sec)
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name | trunk| ours
---------------+------+--------
loop_whileloop | 0.562| 0.311
loop_whileloop2| 0.121| 0.072
Speedup ratio: compare with the result of `trunk' (greater is better)
name | ours
---------------+------
loop_whileloop | 1.808
loop_whileloop2| 1.678
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
raw data:
[["optcarrot",
[[30.403074843558894, 30.310258944580042, 32.71944350007326],
[31.959367571755536, 31.721515210358586, 32.26915908959967]]]]
* configure.in (__builtin_add_overflow): check existence of this
compiler intrinsic which is available in recent gcc and clang,
with expectedly-optimal implementation of overflow detection.
* insns.def (opt_plusplus): new instruction. This unified
instruction is expected to appear when some local variable is
`+=1`ed. That was converted to 4 separate instructions before
(load variable, load immediate one, call plus, store variable).
* insns.def (opt_plus): refactor move implementation to share with
opt_plusplus.
* vm_insnhelper.c (opt_plus_func): moved to here.
* compile.c (iseq_peephole_optimize): generate opt_plusplus when
possible.
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
compile.c | 46 ++++++++++++++++++++++++
configure.in | 1 +
insns.def | 109 +++++++++++++++++++++++++++++++++++---------------------
vm_insnhelper.c | 62 ++++++++++++++++++++++++++++++++
4 files changed, 177 insertions(+), 41 deletions(-)
diff --git a/compile.c b/compile.c
index cc496cb..6d99d0b 100644
--- a/compile.c
+++ b/compile.c
@@ -2254,6 +2254,52 @@ iseq_peephole_optimize(rb_iseq_t *iseq, LINK_ELEMENT *list, const int do_tailcal
}
}
+ if (IS_INSN_ID(iobj, getlocal)) {
+ /*
+ * getlocal_OP__WC__0
+ * putobject_OP_INT2FIX_O_1_C_
+ * opt_plus
+ * setlocal_OP__WC__0
+ * =>
+ * opt_plusplus
+ */
+ INSN *nobj = (INSN *)iobj->link.next;
+ VALUE idx = OPERAND_AT(iobj, 0);
+ VALUE lev = OPERAND_AT(iobj, 1);
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, putobject)) {
+ if(OPERAND_AT(nobj, 0) == INT2FIX(1)) {
+ nobj = (INSN * )nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, send)) {
+ struct rb_call_info *ci = (void *)OPERAND_AT(nobj, 0);
+ struct rb_call_cache *cc = (void *)OPERAND_AT(nobj, 1);
+ if ((ci->flag & VM_CALL_ARGS_SIMPLE) &&
+ (ci->orig_argc == 1) &&
+ (ci->mid == idPLUS)) {
+ nobj = (INSN *)nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, setlocal)) {
+ if ((OPERAND_AT(nobj, 0) == idx) &&
+ (OPERAND_AT(nobj, 1) == lev)) {
+ INSN *opt_plusplus = new_insn_body(
+ iseq, iobj->line_no, BIN(opt_plusplus),
+ 4, idx, lev, (VALUE)ci, (VALUE)cc);
+ nobj = (INSN * )nobj->link.next;
+ REMOVE_ELEM(nobj->link.prev->prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev);
+ REMOVE_ELEM(nobj->link.prev);
+ INSERT_ELEM_PREV(
+ &nobj->link, (LINK_ELEMENT *)opt_plusplus);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
if (do_tailcallopt &&
(IS_INSN_ID(iobj, send) ||
IS_INSN_ID(iobj, opt_aref_with) ||
diff --git a/configure.in b/configure.in
index 063d839..e0b7a91 100644
--- a/configure.in
+++ b/configure.in
@@ -2480,6 +2480,7 @@ if test x$rb_cv_builtin___builtin_choose_expr = xyes; then
])
fi
RUBY_CHECK_BUILTIN_FUNC(__builtin_types_compatible_p, [__builtin_types_compatible_p(int, int)])
+RUBY_CHECK_BUILTIN_FUNC(__builtin_add_overflow, [__builtin_add_overflow(0, 0, (int*)0)])
if test "$ac_cv_func_qsort_r" != no; then
AC_CACHE_CHECK(whether qsort_r is GNU version, rb_cv_gnu_qsort_r,
diff --git a/insns.def b/insns.def
index cdc8287..1438e29 100644
--- a/insns.def
+++ b/insns.def
@@ -1377,47 +1377,7 @@ opt_plus
(VALUE recv, VALUE obj)
(VALUE val)
{
- if (FIXNUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
- /* fixnum + fixnum */
-#ifndef LONG_LONG_VALUE
- val = (recv + (obj & (~1)));
- if ((~(recv ^ obj) & (recv ^ val)) &
- ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
- val = rb_big_plus(rb_int2big(FIX2LONG(recv)),
- rb_int2big(FIX2LONG(obj)));
- }
-#else
- long a, b, c;
- a = FIX2LONG(recv);
- b = FIX2LONG(obj);
- c = a + b;
- val = LONG2NUM(c);
-#endif
- }
- else if (FLONUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
- if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
- val = rb_str_plus(recv, obj);
- }
- else if (RBASIC_CLASS(recv) == rb_cArray &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
- val = rb_ary_plus(recv, obj);
- }
- else {
- goto INSN_LABEL(normal_dispatch);
- }
- }
- else {
- INSN_LABEL(normal_dispatch):
+ if ((val = opt_plus_func(recv, obj)) == Qundef) {
PUSH(recv);
PUSH(obj);
CALL_SIMPLE_METHOD(recv);
@@ -2163,6 +2123,73 @@ opt_call_c_function
}
/**
+ @c optimize
+ @e unified instruction that does getlocal + putobject + opt_plus + setlocal.
+ */
+DEFINE_INSN
+opt_plusplus
+(lindex_t idx, rb_num_t level, CALL_INFO ci, CALL_CACHE cc)
+()
+()
+{
+ int const lv = level;
+ VALUE *ep = GET_EP();
+ VALUE *ptr;
+ VALUE tmp;
+ int i;
+
+ for (i = 0; i < lv; i++) {
+ ep = GET_PREV_EP(ep);
+ }
+ ptr = ep - idx;
+
+ if (FIXNUM_P(*ptr) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+#if defined LONG_LONG_VALUE
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ signed long ret;
+ if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
+ rb_int2big(FIX2LONG(1)));
+ }
+ else {
+ *ptr = (VALUE)ret;
+ }
+#else
+ const VALUE a, b, c;
+ a = *ptr;
+ b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ c = a + b;
+ if ((~(a ^ b) & (a ^ c)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
+ rb_int2big(FIX2LONG(b)));
+ }
+ else {
+ *ptr = c;
+ }
+#endif
+ }
+ else if ((tmp = opt_plus_func(*ptr, INT2FIX(1))) != Qundef) {
+ *ptr = tmp;
+ }
+ else {
+ VALUE val;
+ VALUE recv = *ptr;
+
+ PUSH(recv);
+ PUSH(INT2FIX(1));
+ CALL_SIMPLE_METHOD(recv);
+ *ptr = val;
+ }
+}
+
+/**
@c joke
@e BLT
@j BLT
diff --git a/vm_insnhelper.c b/vm_insnhelper.c
index 691c37a..4ff21c5 100644
--- a/vm_insnhelper.c
+++ b/vm_insnhelper.c
@@ -1209,6 +1209,68 @@ rb_equal_opt(VALUE obj1, VALUE obj2)
return opt_eq_func(obj1, obj2, &ci, &cc);
}
+static inline VALUE
+fixnum_plus_fixnum(VALUE i, VALUE j)
+{
+#if defined LONG_LONG_VALUE
+ long a, b, c;
+ a = FIX2LONG(i);
+ b = FIX2LONG(j);
+ c = a + b;
+ return LONG2NUM(c);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ signed long ret;
+ if (__builtin_saddl_overflow(i, j & ~RUBY_FIXNUM_FLAG, &ret)) {
+ return rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ else {
+ return (VALUE)ret;
+ }
+#else
+ VALUE val = (i + (j & ~RUBY_FIXNUM_FLAG));
+ if ((~(i ^ j) & (i ^ val)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ val = rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ return val;
+#endif
+}
+
+static inline VALUE
+opt_plus_func(VALUE recv, VALUE obj)
+{
+ if (FIXNUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ return fixnum_plus_fixnum(recv, obj);
+ }
+ else if (FLONUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
+ if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
+ return rb_str_plus(recv, obj);
+ }
+ else if (RBASIC_CLASS(recv) == rb_cArray &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
+ return rb_ary_plus(recv, obj);
+ }
+ else {
+ return Qundef;
+ }
+ }
+ else {
+ return Qundef;
+ }
+}
+
static VALUE vm_call0(rb_thread_t*, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *);
static VALUE
--
2.8.3
Updated by nobu (Nobuyoshi Nakada) over 8 years ago
- Tracker changed from Bug to Feature
- Description updated (diff)
Updated by duerst (Martin Dürst) over 8 years ago
This looks like an interesting improvement. But in some way, a
a += 1
in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Updated by normalperson (Eric Wong) over 8 years ago
duerst@it.aoyama.ac.jp wrote:
This looks like an interesting improvement. But in some way, a
a += 1
in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Agreed, I'm not sure how often I see this to be a useful optimization.
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
Yes, that is small. What is optcarrot? I can't find it in trunk.
Also, what is performance impact for other benchmarks?
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
Updated by shyouhei (Shyouhei Urabe) over 8 years ago
On 06/06/2016 10:44 PM, Eric Wong wrote:
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.Yes, that is small. What is optcarrot? I can't find it in trunk.
Sorry about it. Optcarrot is a recently-developed pure-ruby NES simulator.
Can be fond here: git@github.com:mame/optcarrot.git
Also, what is performance impact for other benchmarks?
Attached. consult below for full benchmark output. I have not (and have
to) taken a closer look at each results but at a glance no benchmark
gained 200% speed up, nor 50% speed down. To put it better it has no
significant drawbacks.
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
No, not tested yet. I'll take a look.
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
I understand your concern. I have to point out on the other hand that
this optimization shortens 4 VM instructions into 1, which must
positively impact dcache locality a bit (instruction sequence is very
frequently accessed). There is a tradeoff between them.
-----------------------------------------------------------
raw data:
[["app_answer", [[0.052123], [0.045075]]],
["app_aobench", [[51.100811], [61.442784]]],
["app_erb", [[0], [0]]],
["app_factorial", [[1.017833], [0.981952]]],
["app_fib", [[0.483506], [0.493121]]],
["app_lc_fizzbuzz", [[86.574658], [83.14404]]],
["app_mandelbrot", [[1.407061], [1.315739]]],
["app_pentomino", [[16.77479], [16.531438]]],
["app_raise", [[0.269551], [0.278264]]],
["app_strconcat", [[0.961209], [0.924087]]],
["app_tak", [[0.607776], [0.603985]]],
["app_tarai", [[0.504795], [0.504447]]],
["app_uri", [[0], [0]]],
["array_shift", [[0], [0]]],
["hash_aref_dsym", [[0.365131], [0.355265]]],
["hash_aref_dsym_long", [[12.73269], [13.330662]]],
["hash_aref_fix", [[0.39919], [0.352738]]],
["hash_aref_flo", [[0.075359], [0.090797]]],
["hash_aref_miss", [[0.517686], [0.529451]]],
["hash_aref_str", [[0.466861], [0.421541]]],
["hash_aref_sym", [[0.359693], [0.339002]]],
["hash_aref_sym_long", [[0.521086], [0.537503]]],
["hash_flatten", [[0.350155], [0.356782]]],
["hash_ident_flo", [[0.047239], [0.058664]]],
["hash_ident_num", [[0.312423], [0.331413]]],
["hash_ident_obj", [[0.343969], [0.30839]]],
["hash_ident_str", [[0.3327], [0.337857]]],
["hash_ident_sym", [[0.366507], [0.345687]]],
["hash_keys", [[0.356622], [0.354485]]],
["hash_shift", [[0.028791], [0.044413]]],
["hash_shift_u16", [[0.147648], [0.137591]]],
["hash_shift_u24", [[0.135581], [0.140135]]],
["hash_shift_u32", [[0.133544], [0.149953]]],
["hash_to_proc", [[0.015097], [0.015503]]],
["hash_values", [[0.326313], [0.338462]]],
["io_file_create", [[2.631183], [2.29139]]],
["io_file_read", [[0], [0]]],
["io_file_write", [[0], [0]]],
["io_nonblock_noex", [[0], [0]]],
["io_nonblock_noex2", [[0], [0]]],
["io_select", [[2.871379], [2.845252]]],
["io_select2", [[3.019507], [2.971023]]],
["io_select3", [[0.018271], [0.018546]]],
["loop_for", [[1.360109], [1.433324]]],
["loop_generator", [[0.644188], [0.651241]]],
["loop_times", [[1.260078], [1.259863]]],
["loop_whileloop", [[0.635374], [0.355271]]],
["loop_whileloop2", [[0.141063], [0.076761]]],
["marshal_dump_flo", [[0.523541], [0.498045]]],
["marshal_dump_load_geniv", [[0.89369], [0.862783]]],
["marshal_dump_load_time", [[1.238441], [1.180119]]],
["require", [[16.165019], [2.615595]]],
["require_thread", [[0.560172], [0.548627]]],
["securerandom", [[0], [0]]],
["so_ackermann", [[0.634471], [0.66092]]],
["so_array", [[1.073883], [1.068649]]],
["so_binary_trees", [[7.228263], [7.137076]]],
["so_concatenate", [[5.055297], [4.822921]]],
["so_count_words", [[0.216728], [0.173223]]],
["so_exception", [[0.348299], [0.312874]]],
["so_fannkuch", [[1.802251], [1.80874]]],
["so_fasta", [[1.57365], [1.562748]]],
["so_k_nucleotide", [[1.186867], [1.174061]]],
["so_lists", [[0.522591], [0.493595]]],
["so_mandelbrot", [[2.446802], [2.483779]]],
["so_matrix", [[0.553252], [0.537407]]],
["so_meteor_contest", [[3.098925], [3.09681]]],
["so_nbody", [[1.351588], [1.303507]]],
["so_nested_loop", [[1.025351], [1.060477]]],
["so_nsieve", [[1.84633], [1.748345]]],
["so_nsieve_bits", [[2.130051], [2.132071]]],
["so_object", [[0.637309], [0.646502]]],
["so_partial_sums", [[1.856853], [1.748393]]],
["so_pidigits", [[1.23942], [1.301111]]],
["so_random", [[0.39408], [0.35076]]],
["so_reverse_complement", [[1.571702], [1.644563]]],
["so_sieve", [[0.547596], [0.496421]]],
["so_spectralnorm", [[1.870886], [1.850356]]],
["vm1_attr_ivar", [[1.218386], [1.012984]]],
["vm1_attr_ivar_set", [[1.337383], [1.103195]]],
["vm1_block", [[1.996893], [1.86192]]],
["vm1_const", [[0.871864], [0.738432]]],
["vm1_ensure", [[0.633504], [0.405527]]],
["vm1_float_simple", [[4.575614], [4.468849]]],
["vm1_gc_short_lived", [[5.659463], [5.823397]]],
["vm1_gc_short_with_complex_long", [[6.430288], [6.456603]]],
["vm1_gc_short_with_long", [[7.00017], [6.849872]]],
["vm1_gc_short_with_symbol", [[5.526827], [5.366768]]],
["vm1_gc_wb_ary", [[1.148627], [0.95045]]],
["vm1_gc_wb_ary_promoted", [[1.177966], [0.945101]]],
["vm1_gc_wb_obj", [[1.024596], [0.767215]]],
["vm1_gc_wb_obj_promoted", [[1.150204], [0.942514]]],
["vm1_ivar", [[0.891796], [0.615384]]],
["vm1_ivar_set", [[0.872773], [0.657297]]],
["vm1_length", [[1.14291], [0.865646]]],
["vm1_lvar_init", [[1.906298], [1.72483]]],
["vm1_lvar_set", [[2.984877], [2.637209]]],
["vm1_neq", [[1.163216], [0.960243]]],
["vm1_not", [[0.949804], [0.650484]]],
["vm1_rescue", [[0.778971], [0.464327]]],
["vm1_simplereturn", [[1.238101], [0.982779]]],
["vm1_swap", [[0.928376], [0.674591]]],
["vm1_yield", [[1.348768], [1.099841]]],
["vm2_array", [[1.398596], [1.350792]]],
["vm2_bigarray", [[9.668123], [9.524274]]],
["vm2_bighash", [[7.155718], [7.151162]]],
["vm2_case", [[0.223288], [0.209747]]],
["vm2_case_lit", [[0.820809], [0.797681]]],
["vm2_defined_method", [[2.721788], [2.654055]]],
["vm2_dstr", [[1.09328], [1.042475]]],
["vm2_eval", [[31.762698], [32.069769]]],
["vm2_method", [[1.154035], [1.077522]]],
["vm2_method_missing", [[2.697381], [2.566442]]],
["vm2_method_with_block", [[1.432468], [1.219831]]],
["vm2_mutex", [[0.797908], [0.761003]]],
["vm2_newlambda", [[1.559347], [1.46937]]],
["vm2_poly_method", [[2.61575], [2.589814]]],
["vm2_poly_method_ov", [[0.295096], [0.23947]]],
["vm2_proc", [[0.570004], [0.53438]]],
["vm2_raise1", [[5.852356], [5.784271]]],
["vm2_raise2", [[8.396749], [8.350987]]],
["vm2_regexp", [[1.203123], [1.170023]]],
["vm2_send", [[0.469301], [0.420171]]],
["vm2_string_literal", [[0.344021], [0.263268]]],
["vm2_struct_big_aref_hi", [[0.284613], [0.240975]]],
["vm2_struct_big_aref_lo", [[0.289707], [0.238292]]],
["vm2_struct_big_aset", [[0.321701], [0.280001]]],
["vm2_struct_big_href_hi", [[0.371876], [0.354926]]],
["vm2_struct_big_href_lo", [[0.410041], [0.359201]]],
["vm2_struct_big_hset", [[0.414453], [0.344909]]],
["vm2_struct_small_aref", [[0.228413], [0.185585]]],
["vm2_struct_small_aset", [[0.304598], [0.273653]]],
["vm2_struct_small_href", [[0.353266], [0.314363]]],
["vm2_struct_small_hset", [[0.348498], [0.328763]]],
["vm2_super", [[0.547899], [0.490276]]],
["vm2_unif1", [[0.256021], [0.222816]]],
["vm2_zsuper", [[0.540286], [0.523564]]],
["vm3_backtrace", [[0.211558], [0.211667]]],
["vm3_clearmethodcache", [[0.518466], [0.546614]]],
["vm3_gc", [[1.47503], [1.481742]]],
["vm3_gc_old_full", [[3.070784], [3.0723]]],
["vm3_gc_old_immediate", [[2.735763], [2.888246]]],
["vm3_gc_old_lazy", [[4.444772], [3.881177]]],
["vm_symbol_block_pass", [[0.954599], [0.967853]]],
["vm_thread_alive_check1", [[0.217786], [0.20758]]],
["vm_thread_close", [[3.568386], [3.668023]]],
["vm_thread_create_join", [[2.599036], [2.579275]]],
["vm_thread_mutex1", [[0.620894], [0.573859]]],
["vm_thread_mutex2", [[0.954779], [0.915577]]],
["vm_thread_mutex3", [[165.541376], [159.570138]]],
["vm_thread_pass", [[0.691981], [0.763412]]],
["vm_thread_pass_flood", [[0.08634], [0.087652]]],
["vm_thread_pipe", [[0.372563], [0.383598]]],
["vm_thread_queue", [[0.127318], [0.11117]]]]
Elapsed time: 1162.336444 (sec)
-----------------------------------------------------------
benchmark results:
Execution time (sec)
name trunk ours
app_answer 0.052 0.045
app_aobench 51.101 61.443
app_erb 0.000 0.000
app_factorial 1.018 0.982
app_fib 0.484 0.493
app_lc_fizzbuzz 86.575 83.144
app_mandelbrot 1.407 1.316
app_pentomino 16.775 16.531
app_raise 0.270 0.278
app_strconcat 0.961 0.924
app_tak 0.608 0.604
app_tarai 0.505 0.504
app_uri 0.000 0.000
array_shift 0.000 0.000
hash_aref_dsym 0.365 0.355
hash_aref_dsym_long 12.733 13.331
hash_aref_fix 0.399 0.353
hash_aref_flo 0.075 0.091
hash_aref_miss 0.518 0.529
hash_aref_str 0.467 0.422
hash_aref_sym 0.360 0.339
hash_aref_sym_long 0.521 0.538
hash_flatten 0.350 0.357
hash_ident_flo 0.047 0.059
hash_ident_num 0.312 0.331
hash_ident_obj 0.344 0.308
hash_ident_str 0.333 0.338
hash_ident_sym 0.367 0.346
hash_keys 0.357 0.354
hash_shift 0.029 0.044
hash_shift_u16 0.148 0.138
hash_shift_u24 0.136 0.140
hash_shift_u32 0.134 0.150
hash_to_proc 0.015 0.016
hash_values 0.326 0.338
io_file_create 2.631 2.291
io_file_read 0.000 0.000
io_file_write 0.000 0.000
io_nonblock_noex 0.000 0.000
io_nonblock_noex2 0.000 0.000
io_select 2.871 2.845
io_select2 3.020 2.971
io_select3 0.018 0.019
loop_for 1.360 1.433
loop_generator 0.644 0.651
loop_times 1.260 1.260
loop_whileloop 0.635 0.355
loop_whileloop2 0.141 0.077
marshal_dump_flo 0.524 0.498
marshal_dump_load_geniv 0.894 0.863
marshal_dump_load_time 1.238 1.180
require 16.165 2.616
require_thread 0.560 0.549
securerandom 0.000 0.000
so_ackermann 0.634 0.661
so_array 1.074 1.069
so_binary_trees 7.228 7.137
so_concatenate 5.055 4.823
so_count_words 0.217 0.173
so_exception 0.348 0.313
so_fannkuch 1.802 1.809
so_fasta 1.574 1.563
so_k_nucleotide 1.187 1.174
so_lists 0.523 0.494
so_mandelbrot 2.447 2.484
so_matrix 0.553 0.537
so_meteor_contest 3.099 3.097
so_nbody 1.352 1.304
so_nested_loop 1.025 1.060
so_nsieve 1.846 1.748
so_nsieve_bits 2.130 2.132
so_object 0.637 0.647
so_partial_sums 1.857 1.748
so_pidigits 1.239 1.301
so_random 0.394 0.351
so_reverse_complement 1.572 1.645
so_sieve 0.548 0.496
so_spectralnorm 1.871 1.850
vm1_attr_ivar* 0.583 0.658
vm1_attr_ivar_set* 0.702 0.748
vm1_block* 1.362 1.507
vm1_const* 0.236 0.383
vm1_ensure* 0.000 0.050
vm1_float_simple* 3.940 4.114
vm1_gc_short_lived* 5.024 5.468
vm1_gc_short_with_complex_long* 5.795 6.101
vm1_gc_short_with_long* 6.365 6.495
vm1_gc_short_with_symbol* 4.891 5.011
vm1_gc_wb_ary* 0.513 0.595
vm1_gc_wb_ary_promoted* 0.543 0.590
vm1_gc_wb_obj* 0.389 0.412
vm1_gc_wb_obj_promoted* 0.515 0.587
vm1_ivar* 0.256 0.260
vm1_ivar_set* 0.237 0.302
vm1_length* 0.508 0.510
vm1_lvar_init* 1.271 1.370
vm1_lvar_set* 2.350 2.282
vm1_neq* 0.528 0.605
vm1_not* 0.314 0.295
vm1_rescue* 0.144 0.109
vm1_simplereturn* 0.603 0.628
vm1_swap* 0.293 0.319
vm1_yield* 0.713 0.745
vm2_array* 1.258 1.274
vm2_bigarray* 9.527 9.448
vm2_bighash* 7.015 7.074
vm2_case* 0.082 0.133
vm2_case_lit* 0.680 0.721
vm2_defined_method* 2.581 2.577
vm2_dstr* 0.952 0.966
vm2_eval* 31.622 31.993
vm2_method* 1.013 1.001
vm2_method_missing* 2.556 2.490
vm2_method_with_block* 1.291 1.143
vm2_mutex* 0.657 0.684
vm2_newlambda* 1.418 1.393
vm2_poly_method* 2.475 2.513
vm2_poly_method_ov* 0.154 0.163
vm2_proc* 0.429 0.458
vm2_raise1* 5.711 5.708
vm2_raise2* 8.256 8.274
vm2_regexp* 1.062 1.093
vm2_send* 0.328 0.343
vm2_string_literal* 0.203 0.187
vm2_struct_big_aref_hi* 0.144 0.164
vm2_struct_big_aref_lo* 0.149 0.162
vm2_struct_big_aset* 0.181 0.203
vm2_struct_big_href_hi* 0.231 0.278
vm2_struct_big_href_lo* 0.269 0.282
vm2_struct_big_hset* 0.273 0.268
vm2_struct_small_aref* 0.087 0.109
vm2_struct_small_aset* 0.164 0.197
vm2_struct_small_href* 0.212 0.238
vm2_struct_small_hset* 0.207 0.252
vm2_super* 0.407 0.414
vm2_unif1* 0.115 0.146
vm2_zsuper* 0.399 0.447
vm3_backtrace 0.212 0.212
vm3_clearmethodcache 0.518 0.547
vm3_gc 1.475 1.482
vm3_gc_old_full 3.071 3.072
vm3_gc_old_immediate 2.736 2.888
vm3_gc_old_lazy 4.445 3.881
vm_symbol_block_pass 0.955 0.968
vm_thread_alive_check1 0.218 0.208
vm_thread_close 3.568 3.668
vm_thread_create_join 2.599 2.579
vm_thread_mutex1 0.621 0.574
vm_thread_mutex2 0.955 0.916
vm_thread_mutex3 165.541 159.570
vm_thread_pass 0.692 0.763
vm_thread_pass_flood 0.086 0.088
vm_thread_pipe 0.373 0.384
vm_thread_queue 0.127 0.111
Speedup ratio: compare with the result of `trunk' (greater is better)
name ours
app_answer 1.156
app_aobench 0.832
app_erbError
app_factorial 1.037
app_fib 0.981
app_lc_fizzbuzz 1.041
app_mandelbrot 1.069
app_pentomino 1.015
app_raise 0.969
app_strconcat 1.040
app_tak 1.006
app_tarai 1.001
app_uriError
array_shiftError
hash_aref_dsym 1.028
hash_aref_dsym_long 0.955
hash_aref_fix 1.132
hash_aref_flo 0.830
hash_aref_miss 0.978
hash_aref_str 1.108
hash_aref_sym 1.061
hash_aref_sym_long 0.969
hash_flatten 0.981
hash_ident_flo 0.805
hash_ident_num 0.943
hash_ident_obj 1.115
hash_ident_str 0.985
hash_ident_sym 1.060
hash_keys 1.006
hash_shift 0.648
hash_shift_u16 1.073
hash_shift_u24 0.968
hash_shift_u32 0.891
hash_to_proc 0.974
hash_values 0.964
io_file_create 1.148
io_file_readError
io_file_writeError
io_nonblock_noexError
io_nonblock_noex2Error
io_select 1.009
io_select2 1.016
io_select3 0.985
loop_for 0.949
loop_generator 0.989
loop_times 1.000
loop_whileloop 1.788
loop_whileloop2 1.838
marshal_dump_flo 1.051
marshal_dump_load_geniv 1.036
marshal_dump_load_time 1.049
require 6.180
require_thread 1.021
securerandomError
so_ackermann 0.960
so_array 1.005
so_binary_trees 1.013
so_concatenate 1.048
so_count_words 1.251
so_exception 1.113
so_fannkuch 0.996
so_fasta 1.007
so_k_nucleotide 1.011
so_lists 1.059
so_mandelbrot 0.985
so_matrix 1.029
so_meteor_contest 1.001
so_nbody 1.037
so_nested_loop 0.967
so_nsieve 1.056
so_nsieve_bits 0.999
so_object 0.986
so_partial_sums 1.062
so_pidigits 0.953
so_random 1.124
so_reverse_complement 0.956
so_sieve 1.103
so_spectralnorm 1.011
vm1_attr_ivar* 0.886
vm1_attr_ivar_set* 0.939
vm1_block* 0.904
vm1_const* 0.617
vm1_ensure* 0.000
vm1_float_simple* 0.958
vm1_gc_short_lived* 0.919
vm1_gc_short_with_complex_long* 0.950
vm1_gc_short_with_long* 0.980
vm1_gc_short_with_symbol* 0.976
vm1_gc_wb_ary* 0.862
vm1_gc_wb_ary_promoted* 0.920
vm1_gc_wb_obj* 0.945
vm1_gc_wb_obj_promoted* 0.877
vm1_ivar* 0.986
vm1_ivar_set* 0.786
vm1_length* 0.994
vm1_lvar_init* 0.928
vm1_lvar_set* 1.030
vm1_neq* 0.873
vm1_not* 1.065
vm1_rescue* 1.317
vm1_simplereturn* 0.961
vm1_swap* 0.918
vm1_yield* 0.958
vm2_array* 0.987
vm2_bigarray* 1.008
vm2_bighash* 0.992
vm2_case* 0.618
vm2_case_lit* 0.943
vm2_defined_method* 1.001
vm2_dstr* 0.986
vm2_eval* 0.988
vm2_method* 1.012
vm2_method_missing* 1.027
vm2_method_with_block* 1.130
vm2_mutex* 0.960
vm2_newlambda* 1.018
vm2_poly_method* 0.985
vm2_poly_method_ov* 0.947
vm2_proc* 0.937
vm2_raise1* 1.001
vm2_raise2* 0.998
vm2_regexp* 0.971
vm2_send* 0.956
vm2_string_literal* 1.088
vm2_struct_big_aref_hi* 0.874
vm2_struct_big_aref_lo* 0.920
vm2_struct_big_aset* 0.889
vm2_struct_big_href_hi* 0.830
vm2_struct_big_href_lo* 0.952
vm2_struct_big_hset* 1.020
vm2_struct_small_aref* 0.803
vm2_struct_small_aset* 0.831
vm2_struct_small_href* 0.893
vm2_struct_small_hset* 0.823
vm2_super* 0.984
vm2_unif1* 0.787
vm2_zsuper* 0.894
vm3_backtrace 0.999
vm3_clearmethodcache 0.949
vm3_gc 0.995
vm3_gc_old_full 1.000
vm3_gc_old_immediate 0.947
vm3_gc_old_lazy 1.145
vm_symbol_block_pass 0.986
vm_thread_alive_check1 1.049
vm_thread_close 0.973
vm_thread_create_join 1.008
vm_thread_mutex1 1.082
vm_thread_mutex2 1.043
vm_thread_mutex3 1.037
vm_thread_pass 0.906
vm_thread_pass_flood 0.985
vm_thread_pipe 0.971
vm_thread_queue 1.145
Updated by ko1 (Koichi Sasada) over 8 years ago
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
Updated by naruse (Yui NARUSE) over 8 years ago
Koichi Sasada wrote:
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
People should use Integer#times for this while&++ case.
LTO can speed up int_dotimes() in 20%, but while loop is still 2 times faster.
Without LTO:
Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}':
5153.515293 task-clock (msec) # 0.997 CPUs utilized
123 context-switches # 0.024 K/sec
1 cpu-migrations # 0.000 K/sec
873 page-faults # 0.169 K/sec
14894914453 cycles # 2.890 GHz
5256063515 stalled-cycles-frontend # 35.29% frontend cycles idle
<not supported> stalled-cycles-backend
37123557344 instructions # 2.49 insns per cycle
# 0.14 stalled cycles per insn
5304912019 branches # 1029.377 M/sec
202652 branch-misses # 0.00% of all branches
5.170935973 seconds time elapsed
With LTO:
Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}':
4456.424594 task-clock (msec) # 0.997 CPUs utilized
269 context-switches # 0.060 K/sec
2 cpu-migrations # 0.000 K/sec
823 page-faults # 0.185 K/sec
13179265879 cycles # 2.957 GHz
3793118960 stalled-cycles-frontend # 28.78% frontend cycles idle
<not supported> stalled-cycles-backend
36422695065 instructions # 2.76 insns per cycle
# 0.10 stalled cycles per insn
5004776248 branches # 1123.047 M/sec
147323 branch-misses # 0.00% of all branches
4.471030258 seconds time elapsed
while loop:
Performance counter stats for './miniruby --disable-gem -ei=0;while i<100_000_000;i+=1;end':
2071.648179 task-clock (msec) # 0.998 CPUs utilized
35 context-switches # 0.017 K/sec
2 cpu-migrations # 0.001 K/sec
878 page-faults # 0.424 K/sec
6041423503 cycles # 2.916 GHz
2493938546 stalled-cycles-frontend # 41.28% frontend cycles idle
<not supported> stalled-cycles-backend
14520079387 instructions # 2.40 insns per cycle
# 0.17 stalled cycles per insn
1504280501 branches # 726.127 M/sec
116788 branch-misses # 0.01% of all branches
2.076396582 seconds time elapsed
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 8 years ago
If ++ would be implemented as an atomic operation, then I'd be +1 for this as it's much easier to write the common pattern @counter++ than to @mutex.synchronize{@counter += 1} or being forced to implement a thread-safe counter since Ruby doesn't provide one in stdlib as far as I know.
Updated by shyouhei (Shyouhei Urabe) over 8 years ago
I don't think ++ is going to be atomic because in Ruby an Integer has infinite width in theory. To achieve this property a ++ must be implemented with proper reallocation of underlying memory region(s), which is very difficult (if not impossible) to do atomically.
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 8 years ago
Or if performance is not a concern it could be simply implemented by using a mutex to perform the operation, right?
Updated by shyouhei (Shyouhei Urabe) over 8 years ago
I didn't say we don't need speed; I said it's difficult by design.
Updated by shyouhei (Shyouhei Urabe) over 8 years ago
Anyways, I tested following modification (against the proposed opt_plusplus) that tries to be atomic as far as no reallocation is involved. It works. But surprisingly, it slowed down. The problem was the inserted __atomic_thread_fence() intrinsic which eats a lot of CPU time.
So ++ being atomic is not only difficult-by-design, but also suffers considerable amount of overheads even if we could reroute the design issue.
From 88f11a18bfac404ade8db3673816c01fd2d64672 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" <shyouhei@ruby-lang.org>
Date: Thu, 21 Jul 2016 12:07:41 +0900
Subject: [PATCH 1/1] increase atomicity of ++ operation if possible
When the receiver is an Integer first try to increment atomically.
If it overflows, give up atomicity and fall back to normal method
calls.
-----------------------------------------------------------
raw data:
[["loop_whileloop",
[[0.592233, 0.610019, 0.609028, 0.600098, 0.605186, 0.631371, 0.596622],
[0.874219, 0.8871, 0.853446, 0.896625, 0.881152, 0.87091, 0.889423]]],
["loop_whileloop2",
[[0.125931, 0.131164, 0.130166, 0.123759, 0.123483, 0.1308, 0.133648],
[0.191824, 0.188386, 0.174099, 0.182529, 0.182059, 0.183854, 0.193107]]]]
Elapsed time: 12.598961 (sec)
-----------------------------------------------------------
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name trunk ours
loop_whileloop 0.592 0.853
loop_whileloop2 0.123 0.174
Speedup ratio: compare with the result of `trunk' (greater is better)
name ours
loop_whileloop 0.694
loop_whileloop2 0.709
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
insns.def | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/insns.def b/insns.def
index 1438e29..6f71135 100644
--- a/insns.def
+++ b/insns.def
@@ -2145,33 +2145,34 @@ opt_plusplus
if (FIXNUM_P(*ptr) &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+
#if defined LONG_LONG_VALUE
- long a, b;
- a = FIX2LONG(*ptr);
- b = a + 1;
- *ptr = LONG2NUM(b);
+ InterlockedExchangeAdd64(ptr, one);
+ if (*ptr > FIXNUM_MAX) {
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+ }
#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
- const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- signed long ret;
- if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ signed long *p = (signed long *)ptr;
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ if (UNLIKELY(__builtin_saddl_overflow(*p, one, p))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
rb_int2big(FIX2LONG(1)));
}
- else {
- *ptr = (VALUE)ret;
- }
#else
const VALUE a, b, c;
- a = *ptr;
- b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- c = a + b;
+ do {
+ a = *ptr;
+ b = one;
+ c = a + b;
+ } while (ATOMIC_VALUE_CAS(ptr, a, c) != a);
if ((~(a ^ b) & (a ^ c)) &
((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
rb_int2big(FIX2LONG(b)));
}
- else {
- *ptr = c;
- }
#endif
}
--
2.9.2
Updated by shyouhei (Shyouhei Urabe) over 8 years ago
- Related to Feature #12607: Ruby needs an atomic integer added
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 8 years ago
Thanks for the investigation. I understand there are trade-offs involved in most decisions. I'm just not sure how Ruby treats them. Would a ++ operator be more useful if it would be faster than +=1 or if it was some syntax sugar over an atomic increment? Anyway, I'm not against your original proposal, which targets performance. I just think I wouldn't ever use ++ for the purpose of improving the performance of my own code as I don't think it would be noticeable for any real use cases. I'd probably use it though but simply because it would be a syntax sugar over += 1 :)
Updated by ko1 (Koichi Sasada) over 7 years ago
- Status changed from Assigned to Rejected
I reject this ticket with the reason #5.