Project

General

Profile

Feature #15331 » 0002-Hash-code-memoization-for-short-fstrings.patch

v6 - alanwu (Alan Wu), 11/26/2018 04:24 PM

View differences:

benchmark/freeze_unique_strings.yml
prelude: |
str = +"0000000"
benchmark:
freeze_unique_strings: |
str.succ!
-str
loop_count: 9000000 # freeze this many unique strings
benchmark/hash_aref_fstr.rb
h = {}
strs = ('a'..'z').to_a.map!(&:-@)
strs.each { |s| h[s] = s }
500_000.times { strs.each { |s| h[s] } }
benchmark/hash_aref_long_str.rb
h = {}
strs = ['a' * 100] * 10
strs.each { |s| h[s] = s }
200_000.times { strs.each { |s| h[s] } }
ext/-test-/string/cstr.c
rb_define_singleton_method(klass, "cstr_noembed", bug_str_s_cstr_noembed, 1);
rb_define_singleton_method(klass, "cstr_embedded?", bug_str_s_cstr_embedded_p, 1);
rb_define_singleton_method(klass, "rb_str_new_frozen", bug_str_s_rb_str_new_frozen, 1);
rb_define_const(klass, "HASH_MEMO_STR_BUFFER_CAPA", INT2NUM(HASH_MEMO_STR_BUFFER_CAPA));
}
internal.h
#define STR_SHARED_P(s) FL_ALL_RAW((s), STR_NOEMBED|ELTS_SHARED)
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)
#define HASH_MEMO_STR_BUFFER_CAPA ((int) (SIZEOF_VALUE * 2))
size_t rb_str_memsize(VALUE);
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc);
VALUE rb_sym_to_proc(VALUE sym);
string.c
RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
struct memoized_hash_embeded_str {
char ary[HASH_MEMO_STR_BUFFER_CAPA];
st_index_t memoized_hash;
};
/* We need to have enough space for the string plus its encoding dependent terminator.
There are encodings that use two bytes, but we save on fetching the encoding
struct by being conservative. Change this when there is an encoding with 8-byte units.
See encindex.h. */
#define SHORT_ENOUGH_TO_MEMO_HASH(str, enc_idx) \
((RSTRING_EMBED_LEN(str) + ((enc_idx <= ENCINDEX_US_ASCII) ? 1 : 4)) <= HASH_MEMO_STR_BUFFER_CAPA)
STATIC_ASSERT(memoized_hash_type_punning, sizeof(struct memoized_hash_embeded_str) == sizeof(RSTRING(0)->as));
#define SET_HASH_MEMO(str, hash) do { \
struct RString *rstr = RSTRING(str);\
((struct memoized_hash_embeded_str*) &rstr->as)->memoized_hash = hash;\
} while (0)
#define GET_HASH_MEMO(str) (((struct memoized_hash_embeded_str*) &RSTRING(str)->as)->memoized_hash)
#define STR_SET_LEN(str, n) do { \
if (STR_EMBED_P(str)) {\
STR_SET_EMBED_LEN((str), (n));\
......
rb_str_hash,
};
struct fstr_update {
VALUE ret;
st_index_t hash;
};
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
static int
fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
{
VALUE *fstr = (VALUE *)arg;
struct fstr_update *update = (struct fstr_update *)arg;
VALUE str = (VALUE)*key;
if (existing) {
......
* at next time */
if (rb_objspace_garbage_object_p(str)) {
*fstr = Qundef;
update->ret = Qundef;
return ST_DELETE;
}
*fstr = str;
update->ret = str;
return ST_STOP;
}
else {
......
str = str_new_frozen(rb_cString, str);
}
}
if (STR_EMBED_P(str) && SHORT_ENOUGH_TO_MEMO_HASH(str, ENCODING_GET_INLINED(str))) {
SET_HASH_MEMO(str, update->hash);
}
RBASIC(str)->flags |= RSTRING_FSTR;
*key = *value = *fstr = str;
*key = *value = str;
update->ret = str;
return ST_CONTINUE;
}
}
......
return fstr;
}
int
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
st_update_callback_func *func, st_data_t arg); /* st.c */
static VALUE
register_fstring(VALUE str)
{
VALUE ret;
struct fstr_update update;
st_table *frozen_strings = rb_vm_fstring_table();
st_index_t hash = rb_str_hash(str);
update.hash = hash;
do {
ret = str;
st_update(frozen_strings, (st_data_t)str,
fstr_update_callback, (st_data_t)&ret);
} while (ret == Qundef);
assert(OBJ_FROZEN(ret));
assert(!FL_TEST_RAW(ret, STR_FAKESTR));
assert(!FL_TEST_RAW(ret, FL_EXIVAR));
assert(!FL_TEST_RAW(ret, FL_TAINT));
assert(RBASIC_CLASS(ret) == rb_cString);
return ret;
update.ret = str;
st_update_with_hash(frozen_strings, (st_data_t)str, hash,
fstr_update_callback, (st_data_t)&update);
} while (update.ret == Qundef);
assert(OBJ_FROZEN(update.ret));
assert(!FL_TEST_RAW(update.ret, STR_FAKESTR));
assert(!FL_TEST_RAW(update.ret, FL_EXIVAR));
assert(!FL_TEST_RAW(update.ret, FL_TAINT));
assert(RBASIC_CLASS(update.ret) == rb_cString);
return update.ret;
}
static VALUE
......
str_make_independent_expand(str, len, 0L, termlen);
}
else {
if (UNLIKELY(termlen > 4)) {
rb_raise(rb_eArgError, "terminator is longer than 4 bytes");
}
TERM_FILL(s + len, termlen);
return s;
}
......
long capa = str_capacity(str, oldtermlen) + oldtermlen;
long len = RSTRING_LEN(str);
assert(termlen <= 4);
assert(capa >= len);
if (capa - len < termlen) {
rb_check_lockedtmp(str);
......
rb_str_hash(VALUE str)
{
int e = ENCODING_GET(str);
if (FL_TEST_RAW(str, RSTRING_FSTR | RSTRING_NOEMBED) == RSTRING_FSTR && SHORT_ENOUGH_TO_MEMO_HASH(str, e)) {
return GET_HASH_MEMO(str);
}
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
e = 0;
}
test/-ext-/string/test_cstr.rb
WCHARS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE]
def test_fstring_hash_memo_wchar
WCHARS.each_with_index do |encoding|
(1..Bug::String::HASH_MEMO_STR_BUFFER_CAPA).each do |i|
contrived_string = +('8' * i)
contrived_string.force_encoding(encoding)
before = contrived_string.hash
fstring = -(contrived_string)
assert Bug::String.cstr_embedded?(fstring)
# StringValueCStr writes a terminator past the end of the string buffer,
# which could clobber the memoized hash.
Bug::String.cstr_term(fstring)
assert_equal(before, fstring.hash)
end
end
end
def test_wchar_embed
WCHARS.each do |enc|
s = Bug::String.new("\u{4022}a".encode(enc))
(8-8/8)