Feature #15331 » 0001-Hash-code-memoization-for-short-fstrings-v5.patch
benchmark/freeze_unique_strings.yml | ||
---|---|---|
prelude: |
|
||
str = +"0000000"
|
||
benchmark:
|
||
freeze_unique_strings: |
|
||
str.succ!
|
||
-str
|
||
loop_count: 9000000 # freeze this many unique strings
|
benchmark/hash_aref_fstr.rb | ||
---|---|---|
h = {}
|
||
strs = ('a'..'z').to_a.map!(&:-@)
|
||
strs.each { |s| h[s] = s }
|
||
500_000.times { strs.each { |s| h[s] } }
|
benchmark/hash_aref_long_str.rb | ||
---|---|---|
h = {}
|
||
strs = ['a' * 100] * 10
|
||
strs.each { |s| h[s] = s }
|
||
200_000.times { strs.each { |s| h[s] } }
|
encindex.h | ||
---|---|---|
#define ENCINDEX_EUC_JP RUBY_ENCINDEX_EUC_JP
|
||
#define ENCINDEX_Windows_31J RUBY_ENCINDEX_Windows_31J
|
||
#define ENCINDEX_BUILTIN_MAX RUBY_ENCINDEX_BUILTIN_MAX
|
||
#define BUILTIN_SINGLE_BYTE_ENC_IDX_P(idx) (idx < RUBY_ENCINDEX_UTF_16BE)
|
||
#define rb_ascii8bit_encindex() RUBY_ENCINDEX_ASCII
|
||
#define rb_utf8_encindex() RUBY_ENCINDEX_UTF_8
|
ext/-test-/string/cstr.c | ||
---|---|---|
rb_define_singleton_method(klass, "cstr_noembed", bug_str_s_cstr_noembed, 1);
|
||
rb_define_singleton_method(klass, "cstr_embedded?", bug_str_s_cstr_embedded_p, 1);
|
||
rb_define_singleton_method(klass, "rb_str_new_frozen", bug_str_s_rb_str_new_frozen, 1);
|
||
rb_define_const(klass, "MEMO_HASH_LEN_MAX", INT2NUM(MEMO_HASH_LEN_MAX));
|
||
}
|
internal.h | ||
---|---|---|
char *rb_str_to_cstr(VALUE str);
|
||
VALUE rb_str_eql(VALUE str1, VALUE str2);
|
||
VALUE rb_obj_as_string_result(VALUE str, VALUE obj);
|
||
#define MEMO_HASH_LEN_MAX ((int) (sizeof(RSTRING(0)->as) - sizeof(st_index_t) - 1))
|
||
/* symbol.c */
|
||
#ifdef RUBY_ENCODING_H
|
st.c | ||
---|---|---|
{
|
||
}
|
||
ALWAYS_INLINE(static inline int
|
||
st_update_with_hash_inlined(st_table *tab, st_data_t key, st_data_t hash,
|
||
st_update_callback_func *func, st_data_t arg));
|
||
/* Find entry with KEY in table TAB, call FUNC with the key and the
|
||
value of the found entry, and non-zero as the 3rd argument. If the
|
||
entry is not found, call FUNC with KEY, and 2 zero arguments. If
|
||
the call returns ST_CONTINUE, the table will have an entry with key
|
||
and value returned by FUNC through the 1st and 2nd parameters. If
|
||
the call of FUNC returns ST_DELETE, the table will not have entry
|
||
with KEY. The function returns flag of that the entry with KEY was
|
||
value of the found entry, and non-zero as the 3rd argument. If the
|
||
entry is not found, call FUNC with KEY, and zero as the 3rd argument.
|
||
If the call returns ST_CONTINUE, the table will have an entry with key
|
||
and value returned by FUNC through the 1st and 2nd parameter.
|
||
If the call of FUNC returns ST_DELETE, the table will not have entry
|
||
with KEY. The function returns flag of that the entry with KEY was
|
||
in the table before the call. */
|
||
int
|
||
st_update(st_table *tab, st_data_t key,
|
||
st_update_callback_func *func, st_data_t arg)
|
||
{
|
||
st_hash_t hash = do_hash(key, tab);
|
||
return st_update_with_hash_inlined(tab, key, hash, func, arg);
|
||
}
|
||
/* st_update, but with caller-provided hash code for KEY. */
|
||
int
|
||
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
|
||
st_update_callback_func *func, st_data_t arg) {
|
||
/* This remapping is also done do_hash() */
|
||
st_hash_t remapped_hash = (hash == RESERVED_HASH_VAL) ? RESERVED_HASH_SUBSTITUTION_VAL : hash;
|
||
return st_update_with_hash_inlined(tab, key, remapped_hash, func, arg);
|
||
}
|
||
static inline int
|
||
st_update_with_hash_inlined(st_table *tab, st_data_t key, st_hash_t hash,
|
||
st_update_callback_func *func, st_data_t arg)
|
||
{
|
||
st_table_entry *entry = NULL; /* to avoid uninitialized value warning */
|
||
st_index_t bin = 0; /* Ditto */
|
||
... | ... | |
st_data_t value = 0, old_key;
|
||
st_index_t check;
|
||
int retval, existing;
|
||
st_hash_t hash = do_hash(key, tab);
|
||
retry:
|
||
entries = tab->entries;
|
string.c | ||
---|---|---|
RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
|
||
} while (0)
|
||
#define MEMO_STR_BUFFER_CAPA (MEMO_HASH_LEN_MAX + 1)
|
||
struct memoized_hash_embeded_str {
|
||
char ary[MEMO_STR_BUFFER_CAPA];
|
||
st_index_t memoized_hash;
|
||
};
|
||
/* There are encodings that use two bytes, but we save on fetching the encoding
|
||
struct by being conservative. Change this when there is an encoding with 8-byte units. */
|
||
#define SHORT_ENOUGH_TO_MEMO_HASH(str, enc_idx) \
|
||
((RSTRING_EMBED_LEN(str) + (BUILTIN_SINGLE_BYTE_ENC_IDX_P(enc_idx) ? 1 : 4)) <= MEMO_STR_BUFFER_CAPA)
|
||
STATIC_ASSERT(memoized_hash_type_punning, sizeof(struct memoized_hash_embeded_str) == sizeof(RSTRING(0)->as));
|
||
#define SET_HASH_MEMO(str, hash) do { \
|
||
struct RString *rstr = RSTRING(str);\
|
||
((struct memoized_hash_embeded_str*) &rstr->as)->memoized_hash = hash;\
|
||
} while (0)
|
||
#define GET_HASH_MEMO(str) (((struct memoized_hash_embeded_str*) &RSTRING(str)->as)->memoized_hash)
|
||
#define STR_SET_LEN(str, n) do { \
|
||
if (STR_EMBED_P(str)) {\
|
||
STR_SET_EMBED_LEN((str), (n));\
|
||
... | ... | |
rb_str_hash,
|
||
};
|
||
struct fstr_update {
|
||
VALUE ret;
|
||
st_index_t hash;
|
||
};
|
||
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
|
||
static int
|
||
fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
|
||
{
|
||
VALUE *fstr = (VALUE *)arg;
|
||
struct fstr_update *update = (struct fstr_update *)arg;
|
||
VALUE str = (VALUE)*key;
|
||
if (existing) {
|
||
... | ... | |
* at next time */
|
||
if (rb_objspace_garbage_object_p(str)) {
|
||
*fstr = Qundef;
|
||
update->ret = Qundef;
|
||
return ST_DELETE;
|
||
}
|
||
*fstr = str;
|
||
update->ret = str;
|
||
return ST_STOP;
|
||
}
|
||
else {
|
||
... | ... | |
str = str_new_frozen(rb_cString, str);
|
||
}
|
||
}
|
||
if (STR_EMBED_P(str) && SHORT_ENOUGH_TO_MEMO_HASH(str, ENCODING_GET_INLINED(str))) {
|
||
SET_HASH_MEMO(str, update->hash);
|
||
}
|
||
RBASIC(str)->flags |= RSTRING_FSTR;
|
||
*key = *value = *fstr = str;
|
||
*key = *value = str;
|
||
update->ret = str;
|
||
return ST_CONTINUE;
|
||
}
|
||
}
|
||
... | ... | |
return fstr;
|
||
}
|
||
int
|
||
st_update_with_hash(st_table *tab, st_data_t key, st_index_t hash,
|
||
st_update_callback_func *func, st_data_t arg); /* st.c */
|
||
static VALUE
|
||
register_fstring(VALUE str)
|
||
{
|
||
VALUE ret;
|
||
struct fstr_update update;
|
||
st_table *frozen_strings = rb_vm_fstring_table();
|
||
st_index_t hash = rb_str_hash(str);
|
||
update.hash = hash;
|
||
do {
|
||
ret = str;
|
||
st_update(frozen_strings, (st_data_t)str,
|
||
fstr_update_callback, (st_data_t)&ret);
|
||
} while (ret == Qundef);
|
||
assert(OBJ_FROZEN(ret));
|
||
assert(!FL_TEST_RAW(ret, STR_FAKESTR));
|
||
assert(!FL_TEST_RAW(ret, FL_EXIVAR));
|
||
assert(!FL_TEST_RAW(ret, FL_TAINT));
|
||
assert(RBASIC_CLASS(ret) == rb_cString);
|
||
return ret;
|
||
update.ret = str;
|
||
st_update_with_hash(frozen_strings, (st_data_t)str, hash,
|
||
fstr_update_callback, (st_data_t)&update);
|
||
} while (update.ret == Qundef);
|
||
assert(OBJ_FROZEN(update.ret));
|
||
assert(!FL_TEST_RAW(update.ret, STR_FAKESTR));
|
||
assert(!FL_TEST_RAW(update.ret, FL_EXIVAR));
|
||
assert(!FL_TEST_RAW(update.ret, FL_TAINT));
|
||
assert(RBASIC_CLASS(update.ret) == rb_cString);
|
||
return update.ret;
|
||
}
|
||
static VALUE
|
||
... | ... | |
str_make_independent_expand(str, len, 0L, termlen);
|
||
}
|
||
else {
|
||
if (UNLIKELY(termlen > 4)) {
|
||
rb_raise(rb_eArgError, "terminator is longer than 4 bytes");
|
||
}
|
||
TERM_FILL(s + len, termlen);
|
||
return s;
|
||
}
|
||
... | ... | |
long capa = str_capacity(str, oldtermlen) + oldtermlen;
|
||
long len = RSTRING_LEN(str);
|
||
assert(termlen <= 4);
|
||
assert(capa >= len);
|
||
if (capa - len < termlen) {
|
||
rb_check_lockedtmp(str);
|
||
... | ... | |
rb_str_hash(VALUE str)
|
||
{
|
||
int e = ENCODING_GET(str);
|
||
if (FL_TEST_RAW(str, RSTRING_FSTR | RSTRING_NOEMBED) == RSTRING_FSTR && SHORT_ENOUGH_TO_MEMO_HASH(str, e)) {
|
||
return GET_HASH_MEMO(str);
|
||
}
|
||
if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
|
||
e = 0;
|
||
}
|
test/-ext-/string/test_cstr.rb | ||
---|---|---|
WCHARS = [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE]
|
||
def test_fstring_hash_memo_wchar
|
||
WCHARS.each_with_index do |encoding, i|
|
||
contrived_string = +('8' * Bug::String::MEMO_HASH_LEN_MAX)
|
||
i.times { contrived_string.succ! } # make a unique string for each
|
||
contrived_string.force_encoding('UTF-32LE')
|
||
before = contrived_string.hash
|
||
fstring = -(contrived_string)
|
||
assert Bug::String.cstr_embedded?(fstring)
|
||
# StringValueCStr writes a terminator past the end of the string buffer,
|
||
# which could clobber the memoized hash.
|
||
Bug::String.cstr_term(fstring)
|
||
assert_equal(before, fstring.hash)
|
||
end
|
||
end
|
||
def test_wchar_embed
|
||
WCHARS.each do |enc|
|
||
s = Bug::String.new("\u{4022}a".encode(enc))
|