Skip to content

Commit

Permalink
YJIT: Enhance the String#<< method substitution to handle integer c…
Browse files Browse the repository at this point in the history
…odepoint values.
  • Loading branch information
nirvdrum committed Jul 9, 2024
1 parent 6f6aff5 commit 00cc8e4
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 7 deletions.
1 change: 1 addition & 0 deletions internal/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 31,7 @@ enum ruby_rstring_private_flags {
VALUE rb_fstring(VALUE);
VALUE rb_fstring_cstr(const char *str);
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc);
void rb_str_buf_cat_byte(VALUE str, unsigned char byte);
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
int rb_str_symname_p(VALUE);
VALUE rb_str_quote_unprintable(VALUE);
Expand Down
2 changes: 1 addition & 1 deletion string.c
Original file line number Diff line number Diff line change
Expand Up @@ -3341,7 3341,7 @@ rb_str_cat_cstr(VALUE str, const char *ptr)
return rb_str_buf_cat(str, ptr, strlen(ptr));
}

static void
void
rb_str_buf_cat_byte(VALUE str, unsigned char byte)
{
RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
Expand Down
1 change: 1 addition & 0 deletions yjit/bindgen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 228,7 @@ fn main() {
.allowlist_function("rb_str_concat_literals")
.allowlist_function("rb_obj_as_string_result")
.allowlist_function("rb_str_byte_substr")
.allowlist_function("rb_str_buf_cat_byte")

// From include/ruby/internal/intern/parse.h
.allowlist_function("rb_backref_get")
Expand Down
107 changes: 101 additions & 6 deletions yjit/src/codegen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5754,23 5754,118 @@ fn jit_rb_str_empty_p(
return true;
}

// Codegen for rb_str_concat() -- *not* String#concat
// Frequently strings are concatenated using "out_str << next_str".
// This is common in Erb and similar templating languages.
fn jit_rb_str_concat(
// Codegen for rb_str_concat() with an integer argument -- *not* String#concat
// Using strings as a byte buffer often includes appending byte values to the end of the string.
//
// Expects the codepoint argument on top of the stack and the receiver string directly below it.
// Returns false (generating no code) when the compile-time argument is not a Fixnum; otherwise
// emits a guarded call and returns true. The emitted code takes one of two paths at run time:
//   * fast path: the receiver is ASCII-8BIT and the codepoint is within 0x00..=0xff, so the byte
//     is appended with rb_str_buf_cat_byte;
//   * generic path: anything else is handed to rb_str_concat with the original (tagged) argument.
fn jit_rb_str_concat_codepoint(
    jit: &mut JITState,
    asm: &mut Assembler,
    _ci: *const rb_callinfo,
    _cme: *const rb_callable_method_entry_t,
    _block: Option<BlockHandler>,
    _argc: i32,
    _known_recv_class: Option<VALUE>,
) -> bool {
    asm_comment!(asm, "String#<< with codepoint argument");

    // Ensure the codepoint argument is a Fixnum. If it isn't one at compile time, decline to
    // specialize; otherwise guard that it remains one at run time.
    let arg = asm.stack_opnd(0);
    let comptime_arg = jit.peek_at_stack(&asm.ctx, 0);
    if !comptime_arg.fixnum_p() {
        return false;
    }
    jit_guard_known_klass(
        jit,
        asm,
        comptime_arg.class_of(),
        arg,
        arg.into(),
        comptime_arg,
        SEND_MAX_DEPTH,
        Counter::guard_send_not_fixnums,
    );

    // Either of the string concatenation functions we call will reallocate the string to grow its
    // capacity if necessary. In extremely rare cases (i.e., string exceeds `LONG_MAX` bytes),
    // either of the called functions will raise an exception.
    jit_prepare_non_leaf_call(jit, asm);

    // Explicitly spill temps before making any C calls. `ccall` will spill temps, but it does a
    // check to only spill if it thinks it's necessary. That logic can't see through the runtime
    // branching occurring in the code generated for this function. Consequently, the branch for
    // the first `ccall` will spill registers but the second one will not. At run time, we may
    // jump over that spill code when executing the second branch, leading to situations that are
    // quite hard to debug. If we spill up front we avoid diverging behavior.
    asm.spill_temps();

    let codepoint = asm.stack_opnd(0);
    let recv = asm.stack_opnd(1);

    // In order to use the fast path (rb_str_buf_cat_byte), the string encoding must be ASCII-8BIT
    // and the codepoint must be in the byte range (0x00 - 0xff).
    // If either of those conditions is not met we must use the general string concat
    // (rb_str_concat) function with the original codepoint argument.

    let generic_concat_label = asm.new_label("generic_str_concat_codepoint");
    let ret_label = asm.new_label("jit_rb_str_concat_codepoint_return");

    // Check if the string is ASCII-8BIT. If it isn't, we need to use the generic string concatenation.
    asm_comment!(asm, "Check if string is ASCII-8BIT");
    let recv_reg = asm.load(recv);
    let flags_opnd = Opnd::mem(VALUE_BITS, recv_reg, RUBY_OFFSET_RBASIC_FLAGS);
    let encoding_flags_opnd = asm.and(flags_opnd, Opnd::UImm(RUBY_ENCODING_MASK as u64));
    let encoding_index = asm.rshift(encoding_flags_opnd, Opnd::UImm(RUBY_ENCODING_SHIFT as u64));
    asm.cmp(encoding_index, Opnd::UImm(RUBY_ENCINDEX_ASCII_8BIT as u64));
    asm.jne(generic_concat_label);

    // Check if the codepoint is limited to a byte value. If it isn't, we need to use the generic
    // string concatenation, which will ultimately raise a `RangeError` as ASCII-8BIT strings only
    // accept codepoint values that fit in a byte range.
    //
    // The untagged value is signed, so both bounds must be checked. Comparing only against 0xff
    // would let a negative Fixnum (e.g. -1) take the fast path, where it would be silently
    // truncated to a byte instead of being rejected by the generic path.
    asm_comment!(asm, "Check if codepoint is a byte value");
    let codepoint_untag = asm.rshift(codepoint, Opnd::UImm(1));
    asm.cmp(codepoint_untag, Opnd::Imm(0));
    asm.jl(generic_concat_label);
    asm.cmp(codepoint_untag, Opnd::Imm(0xff));
    asm.jg(generic_concat_label);

    // If we've made it this far, we must be appending a single byte value to a binary string.
    asm_comment!(asm, "Optimized (string, codepoint) concatenation");
    asm.ccall(rb_str_buf_cat_byte as *const u8, vec![recv, codepoint_untag]);
    asm.jmp(ret_label);

    // Either the string isn't binary or the codepoint is outside the byte range.
    asm_comment!(asm, "Fallback to generic (string, codepoint) concatenation");
    asm.write_label(generic_concat_label);
    asm.ccall(rb_str_concat as *const u8, vec![recv, codepoint]);

    asm.write_label(ret_label);

    // The receiver is the return value, so we only need to pop the codepoint argument off the stack.
    // We can reuse the receiver slot in the stack as the return value.
    asm.stack_pop(1);

    true
}

// Codegen for rb_str_concat() -- *not* String#concat
// Frequently strings are concatenated using "out_str << next_str".
// This is common in Erb and similar templating languages.
fn jit_rb_str_concat(
jit: &mut JITState,
asm: &mut Assembler,
ci: *const rb_callinfo,
cme: *const rb_callable_method_entry_t,
block: Option<BlockHandler>,
argc: i32,
known_recv_class: Option<VALUE>,
) -> bool {
// The << operator can accept integer codepoints for characters
// as the argument. Fixnum arguments are handed off to the codepoint
// specialization below; beyond that we only specially optimise string
// arguments. If the peeked-at compile time argument is neither a
// Fixnum nor a string, assume it won't be a string later either.
let comptime_arg = jit.peek_at_stack(&asm.ctx, 0);
if unsafe { RB_TYPE_P(comptime_arg, RUBY_T_FIXNUM) } {
return jit_rb_str_concat_codepoint(jit, asm, ci, cme, block, argc, known_recv_class);
}

if ! unsafe { RB_TYPE_P(comptime_arg, RUBY_T_STRING) } {
return false;
}
Expand Down Expand Up @@ -10210,7 10305,7 @@ pub fn yjit_reg_method_codegen_fns() {
}

// Register a specialized codegen function for a particular method. Note that
// if the function returns true, the code it generates runs without a
// control frame and without interrupt checks. To avoid creating observable
// behavior changes, the codegen function should only target simple code paths
// that do not allocate and do not make method calls.
Expand Down
1 change: 1 addition & 0 deletions yjit/src/cruby.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 117,7 @@ extern "C" {
ci: *const rb_callinfo,
) -> *const rb_callable_method_entry_t;
pub fn rb_hash_empty_p(hash: VALUE) -> VALUE;
pub fn rb_str_concat(str1: VALUE, str2: VALUE) -> VALUE;
pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE;
pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE;
pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE;
Expand Down
1 change: 1 addition & 0 deletions yjit/src/cruby_bindings.inc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 1078,7 @@ extern "C" {
pub fn rb_gvar_set(arg1: ID, arg2: VALUE) -> VALUE;
pub fn rb_ensure_iv_list_size(obj: VALUE, len: u32, newsize: u32);
pub fn rb_vm_barrier();
pub fn rb_str_buf_cat_byte(str_: VALUE, byte: ::std::os::raw::c_uchar);
pub fn rb_str_byte_substr(str_: VALUE, beg: VALUE, len: VALUE) -> VALUE;
pub fn rb_obj_as_string_result(str_: VALUE, obj: VALUE) -> VALUE;
pub fn rb_str_concat_literals(num: usize, strary: *const VALUE) -> VALUE;
Expand Down

0 comments on commit 00cc8e4

Please sign in to comment.