Project

General

Profile

Feature #15331 » 0001-Hash-code-memoization-for-short-fstrings-v5.patch

alanwu (Alan Wu), 11/26/2018 04:34 AM

View differences:

benchmark/freeze_unique_strings.yml
prelude: |
str = +"0000000"
benchmark:
freeze_unique_strings: |
str.succ!
-str
loop_count: 9000000 # freeze this many unique strings
benchmark/hash_aref_fstr.rb
h = {}
strs = ('a'..'z').to_a.map!(&:-@)
strs.each { |s| h[s] = s }
500_000.times { strs.each { |s| h[s] } }
benchmark/hash_aref_long_str.rb
h = {}
strs = ['a' * 100] * 10
strs.each { |s| h[s] = s }
200_000.times { strs.each { |s| h[s] } }
encindex.h
#define ENCINDEX_EUC_JP RUBY_ENCINDEX_EUC_JP
#define ENCINDEX_Windows_31J RUBY_ENCINDEX_Windows_31J
#define ENCINDEX_BUILTIN_MAX RUBY_ENCINDEX_BUILTIN_MAX
#define BUILTIN_SINGLE_BYTE_ENC_IDX_P(idx) (idx < RUBY_ENCINDEX_UTF_16BE)
#define rb_ascii8bit_encindex() RUBY_ENCINDEX_ASCII
#define rb_utf8_encindex() RUBY_ENCINDEX_UTF_8
ext/-test-/string/cstr.c
rb_define_singleton_method(klass, "cstr_noembed", bug_str_s_cstr_noembed, 1);
rb_define_singleton_method(klass, "cstr_embedded?", bug_str_s_cstr_embedded_p, 1);
rb_define_singleton_method(klass, "rb_str_new_frozen", bug_str_s_rb_str_new_frozen, 1);
rb_define_const(klass, "MEMO_HASH_LEN_MAX", INT2NUM(MEMO_HASH_LEN_MAX));
}
internal.h
char *rb_str_to_cstr(VALUE str);
VALUE rb_str_eql(VALUE str1, VALUE str2);
VALUE rb_obj_as_string_result(VALUE str, VALUE obj);
#define MEMO_HASH_LEN_MAX ((int) (sizeof(RSTRING(0)->as) - sizeof(st_index_t) - 1))
/* symbol.c */
#ifdef RUBY_ENCODING_H
st.c
{
}
ALWAYS_INLINE(static inline int
st_update_with_hash_inlined(st_table *tab, st_data_t key, st_data_t hash,
st_update_callback_func *func, st_data_t arg));
/* Find entry with KEY in table TAB, call FUNC with the key and the
value of the found entry, and non-zero as the 3rd argument. If the
entry is not found, call FUNC with KEY, and 2 zero arguments. If
the call returns ST_CONTINUE, the table will have an entry with key
and value returned by FUNC through the 1st and 2nd parameters. If
the call of FUNC returns ST_DELETE, the table will not have entry
with KEY. The function returns flag of that the entry with KEY was
value of the found entry, and non-zero as the 3rd argument. If the
entry is not found, call FUNC with KEY, and zero as the 3rd argument.
If the call returns ST_CONTINUE, the table will have an entry with key
and value returned by FUNC through the 1st and 2nd parameter.
If the call of FUNC returns ST_DELETE, the table will not have entry
with KEY. The function returns flag of that the entry with KEY was
in the table before the call. */
int
st_update(st_table *tab, st_data_t key,
st_update_callback_func *func, st_data_t arg)
{
st_hash_t hash = do_hash(key, tab);
return st_update_with_hash_inlined(tab, key, hash, func, arg);
}
/* st_update, but with caller-provided hash code for KEY. */
int
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
st_update_callback_func *func, st_data_t arg) {
/* This remapping is also done do_hash() */
st_hash_t remapped_hash = (hash == RESERVED_HASH_VAL) ? RESERVED_HASH_SUBSTITUTION_VAL : hash;
return st_update_with_hash_inlined(tab, key, remapped_hash, func, arg);
}
static inline int
st_update_with_hash_inlined(st_table *tab, st_data_t key, st_hash_t hash,
st_update_callback_func *func, st_data_t arg)
{
st_table_entry *entry = NULL; /* to avoid uninitialized value warning */
st_index_t bin = 0; /* Ditto */
......
st_data_t value = 0, old_key;
st_index_t check;
int retval, existing;
st_hash_t hash = do_hash(key, tab);
retry:
entries = tab->entries;
string.c
RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#define MEMO_STR_BUFFER_CAPA (MEMO_HASH_LEN_MAX + 1)
struct memoized_hash_embeded_str {
char ary[MEMO_STR_BUFFER_CAPA];
st_index_t memoized_hash;
};
/* There are encodings that use two bytes, but we save on fetching the encoding
struct by being conservative. Change this when there is an encoding with 8-byte units. */
#define SHORT_ENOUGH_TO_MEMO_HASH(str, enc_idx) \
((RSTRING_EMBED_LEN(str) + (BUILTIN_SINGLE_BYTE_ENC_IDX_P(enc_idx) ? 1 : 4)) <= MEMO_STR_BUFFER_CAPA)
STATIC_ASSERT(memoized_hash_type_punning, sizeof(struct memoized_hash_embeded_str) == sizeof(RSTRING(0)->as));
#define SET_HASH_MEMO(str, hash) do { \
struct RString *rstr = RSTRING(str);\
((struct memoized_hash_embeded_str*) &rstr->as)->memoized_hash = hash;\
} while (0)
#define GET_HASH_MEMO(str) (((struct memoized_hash_embeded_str*) &RSTRING(str)->as)->memoized_hash)
#define STR_SET_LEN(str, n) do { \
if (STR_EMBED_P(str)) {\
STR_SET_EMBED_LEN((str), (n));\
......
rb_str_hash,
};
struct fstr_update {
VALUE ret;
st_index_t hash;
};
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
static int
fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
{
VALUE *fstr = (VALUE *)arg;
struct fstr_update *update = (struct fstr_update *)arg;
VALUE str = (VALUE)*key;
if (existing) {
......
* at next time */
if (rb_objspace_garbage_object_p(str)) {
*fstr = Qundef;
update->ret = Qundef;
return ST_DELETE;
}
*fstr = str;
update->ret = str;
return ST_STOP;
}
else {
......
str = str_new_frozen(rb_cString, str);
}
}
if (STR_EMBED_P(str) && SHORT_ENOUGH_TO_MEMO_HASH(str, ENCODING_GET_INLINED(str))) {
SET_HASH_MEMO(str, update->hash);
}
RBASIC(str)->flags |= RSTRING_FSTR;
*key = *value = *fstr = str;
*key = *value = str;
update->ret = str;
return ST_CONTINUE;
}
}
......
return fstr;
}
int
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
st_update_callback_func *func, st_data_t arg); /* st.c */
static VALUE
register_fstring(VALUE str)
{
VALUE ret;
struct fstr_update update;
st_table *frozen_strings = rb_vm_fstring_table();
st_index_t hash = rb_str_hash(str);
update.hash = hash;
do {
ret = str;
st_update(frozen_strings, (st_data_t)str,
fstr_update_callback, (st_data_t)&ret);
} while (ret == Qundef);
assert(OBJ_FROZEN(ret));
assert(!FL_TEST_RAW(ret, STR_FAKESTR));
assert(!FL_TEST_RAW(ret, FL_EXIVAR));
assert(!FL_TEST_RAW(ret, FL_TAINT));
assert(RBASIC_CLASS(ret) == rb_cString);
return ret;
update.ret = str;
st_update_with_hash(frozen_strings, (st_data_t)str, hash,
fstr_update_callback, (st_data_t)&update);
} while (update.ret == Qundef);
assert(OBJ_FROZEN(update.ret));
assert(!FL_TEST_RAW(update.ret, STR_FAKESTR));
assert(!FL_TEST_RAW(update.ret, FL_EXIVAR));
assert(!FL_TEST_RAW(update.ret, FL_TAINT));
assert(RBASIC_CLASS(update.ret) == rb_cString);
return update.ret;
}
static VALUE
......
str_make_independent_expand(str, len, 0L, termlen);
}
else {
if (UNLIKELY(termlen > 4)) {
rb_raise(rb_eArgError, "terminator is longer than 4 bytes");
}
TERM_FILL(s + len, termlen);
return s;
}
......
long capa = str_capacity(str, oldtermlen) + oldtermlen;
long len = RSTRING_LEN(str);
assert(termlen <= 4);
assert(capa >= len);
if (capa - len < termlen) {
rb_check_lockedtmp(str);
......
rb_str_hash(VALUE str)
{
int e = ENCODING_GET(str);
if (FL_TEST_RAW(str, RSTRING_FSTR | RSTRING_NOEMBED) == RSTRING_FSTR && SHORT_ENOUGH_TO_MEMO_HASH(str, e)) {
return GET_HASH_MEMO(str);
}
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
e = 0;
}
test/-ext-/string/test_cstr.rb
WCHARS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE]
def test_fstring_hash_memo_wchar
WCHARS.each_with_index do |encoding, i|
contrived_string = +('8' * Bug::String::MEMO_HASH_LEN_MAX)
i.times { contrived_string.succ! } # make a unique string for each
contrived_string.force_encoding('UTF-32LE')
before = contrived_string.hash
fstring = -(contrived_string)
assert Bug::String.cstr_embedded?(fstring)
# StringValueCStr writes a terminator past the end of the string buffer,
# which could clobber the memoized hash.
Bug::String.cstr_term(fstring)
assert_equal(before, fstring.hash)
end
end
def test_wchar_embed
WCHARS.each do |enc|
s = Bug::String.new("\u{4022}a".encode(enc))
(6-6/8)