runtime/str-builtins.cpp

// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) #include "str-builtins.h" #include <cctype> #include <cwchar> #include "builtins.h" #include "formatter-utils.h" #include "formatter.h" #include "frame.h" #include "globals.h" #include "int-builtins.h" #include "objects.h" #include "runtime.h" #include "set-builtins.h" #include "slice-builtins.h" #include "thread.h" #include "tuple-builtins.h" #include "type-builtins.h" #include "unicode.h" #include "utils.h" namespace py { word adjustedStrIndex(const Str& str, word index) { word len = str.length(); if (index >= 0) { return str.offsetByCodePoints(0, index); } if (-len < index) { return Utils::maximum(0l, str.offsetByCodePoints(len, index)); } return 0; } RawObject dataArraySubstr(Thread* thread, const DataArray& data, word start, word length) { word data_len = data.length(); DCHECK_BOUND(start, data_len); DCHECK_BOUND(start, data_len - length); // SmallStr result if (length <= RawSmallStr::kMaxLength) { byte buffer[RawSmallStr::kMaxLength]; data.copyToStartAt(buffer, length, start); return SmallStr::fromBytes(View<byte>(buffer, length)); } // LargeStr result HandleScope scope(thread); MutableBytes result(&scope, thread->runtime()->newMutableBytesUninitialized(length)); result.replaceFromWithStartAt(0, *data, length, start); return result.becomeStr(); } RawObject newStrFromWideChar(Thread* thread, const wchar_t* wc_str) { return newStrFromWideCharWithLength(thread, wc_str, std::wcslen(wc_str)); } RawObject newStrFromWideCharWithLength(Thread* thread, const wchar_t* wc_str, word length) { static_assert(sizeof(*wc_str) * kBitsPerByte == 32, "only 32bit wchar_t supported."); for (word i = 0; i < length; ++i) { if (wc_str[i] < 0 || wc_str[i] > kMaxUnicode) { return thread->raiseWithFmt(LayoutId::kValueError, "character is not in range"); } } return thread->runtime()->newStrFromUTF32( View<int32_t>(reinterpret_cast<const int32_t*>(wc_str), length)); } void strCopyToWCStr(wchar_t* buf, size_t buf_length, const Str& str) { uword wchar_index = 0; for (word byte_index = 0, num_bytes = 0, byte_count = str.length(); byte_index < byte_count && wchar_index < buf_length; byte_index += num_bytes, wchar_index += 1) { int32_t cp = Str::cast(*str).codePointAt(byte_index, &num_bytes); static_assert(sizeof(*buf) == sizeof(cp), "Requires 32bit wchar_t"); buf[wchar_index] = static_cast<wchar_t>(cp); } buf[wchar_index] = '\0'; } static word strCountCharFromTo(const Str& haystack, byte needle, word start, word end) { word result = 0; if (haystack.isSmallStr()) { RawSmallStr small = SmallStr::cast(*haystack); for (word i = start; i < end; i++) { if (small.byteAt(i) == needle) result++; } return result; } byte* ptr = reinterpret_cast<byte*>(LargeStr::cast(*haystack).address()); for (word i = start; i < end; i++) { if (ptr[i] == needle) result++; } return result; } RawObject strCount(const Str& haystack, const Str& needle, word start, word end) { if (end < 0 || start < 0) { // N.B.: If end is negative we may be able to cheaply walk backward. We // should avoid calling adjustSearchIndices here since the underlying // container is not O(1) and replace it with something that preserves some // of the signals that would be useful to lower the cost of the O(n) // traversal. // TODO(T41400083): Use a different search algorithm Slice::adjustSearchIndices(&start, &end, haystack.codePointLength()); } word start_index = start == 0 ? 0 : haystack.offsetByCodePoints(0, start); if (start_index == haystack.length() && needle.length() > 0) { // Haystack is too small; fast early return return SmallInt::fromWord(0); } word end_index = end == kMaxWord ? haystack.length() : haystack.offsetByCodePoints(start_index, end - start); if ((end_index - start_index) < needle.length() || start_index > end_index) { // Haystack is too small; fast early return return SmallInt::fromWord(0); } if (needle.length() == 1) { return SmallInt::fromWord(strCountCharFromTo( haystack, SmallStr::cast(*needle).byteAt(0), start_index, end_index)); } // TODO(T41400083): Use a different search algorithm return SmallInt::fromWord(strCountSubStrFromTo(haystack, needle, start_index, end_index, haystack.length())); } word strCountSubStrFromTo(const Str& haystack, const Str& needle, word start, word end, word max_count) { DCHECK(max_count >= 0, "max_count must be non-negative"); word needle_len = needle.length(); word num_match = 0; // Loop is in byte space, not code point space for (word i = start; i <= end - needle_len && num_match < max_count;) { if (strHasPrefix(haystack, needle, i)) { i += needle_len; num_match++; continue; } i++; } return num_match; } word strCountSubStr(const Str& haystack, const Str& needle, word max_count) { return strCountSubStrFromTo(haystack, needle, 0, haystack.length(), max_count); } RawObject strEncodeASCII(Thread* thread, const Str& str) { HandleScope scope(thread); if (!str.isASCII()) { return Unbound::object(); } if (str.isSmallStr()) { return SmallStr::cast(*str).becomeBytes(); } word str_len = str.length(); MutableBytes bytes(&scope, thread->runtime()->newMutableBytesUninitialized(str_len)); bytes.replaceFromWithStr(0, *str, str_len); return bytes.becomeImmutable(); } RawObject strEscapeNonASCII(Thread* thread, const Str& str) { if (str.isASCII()) return *str; // Slow implementation using `codecs`. HandleScope scope(thread); Runtime* runtime = thread->runtime(); Symbols* symbols = runtime->symbols(); Object ascii(&scope, symbols->at(ID(ascii))); Object backslashreplace(&scope, symbols->at(ID(backslashreplace))); Bytes result_bytes( &scope, thread->invokeMethod3(str, ID(encode), ascii, backslashreplace)); word length = result_bytes.length(); MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); result.replaceFromWithBytes(0, *result_bytes, length); return result.becomeStr(); } RawObject strJoinWithTupleOrList(Thread* thread, const Str& sep, const Object& iterable) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Tuple tuple(&scope, runtime->emptyTuple()); word length = 0; if (iterable.isTuple()) { tuple = *iterable; length = tuple.length(); } else if (iterable.isList()) { tuple = List::cast(*iterable).items(); length = List::cast(*iterable).numItems(); } else { // Slow path: collect items into list in Python and call again return Unbound::object(); } Object elt(&scope, NoneType::object()); for (word i = 0; i < length; i++) { elt = tuple.at(i); if (!runtime->isInstanceOfStr(*elt)) { return thread->raiseWithFmt( LayoutId::kTypeError, "sequence item %w: expected str instance, %T found", i, &elt); } } return runtime->strJoin(thread, sep, tuple, length); } word strSpan(const Str& src, const Str& str) { word length = src.length(); word str_length = str.length(); word first = 0; for (; first < length; first++) { bool has_match = false; byte ch = src.byteAt(first); for (word j = 0; j < str_length; j++) { if (ch == str.byteAt(j)) { has_match = true; break; } } if (!has_match) { break; } } return first; } RawObject strSubstr(Thread* thread, const Str& str, word start, word length) { word str_len = str.length(); DCHECK_BOUND(start, str_len); DCHECK_BOUND(length, str_len - start); if (length == str_len) { return *str; } // SmallStr result if (length <= RawSmallStr::kMaxLength) { byte buffer[RawSmallStr::kMaxLength]; str.copyToStartAt(buffer, length, start); return SmallStr::fromBytes(View<byte>(buffer, length)); } // LargeStr result HandleScope scope(thread); MutableBytes result(&scope, thread->runtime()->newMutableBytesUninitialized(length)); result.replaceFromWithStartAt(0, LargeStr::cast(*str), length, start); return result.becomeStr(); } word strRSpan(const Str& src, const Str& str, word rend) { DCHECK(rend >= 0, "string index underflow"); word length = src.length(); word str_length = str.length(); word result = 0; for (word i = length - 1; i >= rend; i--, result++) { byte ch = src.byteAt(i); bool has_match = false; for (word j = 0; j < str_length; j++) { if (ch == str.byteAt(j)) { has_match = true; break; } } if (!has_match) { break; } } return result; } static bool isLineBreak(int32_t c) { switch (c) { // Common cases case '\n': case '\r': // Less common cases case '\f': case '\v': case 0x1c: case 0x1d: case 0x1e: case 0x85: case 0x2028: case 0x2029: return true; default: return false; } } RawObject strSplit(Thread* thread, const Str& str, const Str& sep, word maxsplit) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); word num_splits = strCountSubStr(str, sep, maxsplit); word result_len = num_splits + 1; MutableTuple result_items(&scope, runtime->newMutableTuple(result_len)); word last_idx = 0; word sep_len = sep.length(); for (word i = 0, result_idx = 0; result_idx < num_splits;) { if (strHasPrefix(str, sep, i)) { result_items.atPut(result_idx++, strSubstr(thread, str, last_idx, i - last_idx)); i += sep_len; last_idx = i; } else { i = str.offsetByCodePoints(i, 1); } } result_items.atPut(num_splits, strSubstr(thread, str, last_idx, str.length() - last_idx)); List result(&scope, runtime->newList()); result.setItems(*result_items); result.setNumItems(result_len); return *result; } RawObject strSplitlines(Thread* thread, const Str& str, bool keepends) { HandleScope scope(thread); Runtime* runtime = thread->runtime(); List result(&scope, runtime->newList()); // Looping over code points, not bytes, but i is a byte offset for (word i = 0, j = 0; i < str.length(); j = i) { // Skip newline chars word num_bytes; while (i < str.length() && !isLineBreak(str.codePointAt(i, &num_bytes))) { i += num_bytes; } word eol_pos = i; if (i < str.length()) { int32_t cp = str.codePointAt(i, &num_bytes); word next = i + num_bytes; word next_num_bytes; // Check for \r\n specifically if (cp == '\r' && next < str.length() && str.codePointAt(next, &next_num_bytes) == '\n') { i += next_num_bytes; } i += num_bytes; if (keepends) { eol_pos = i; } } // If there are no newlines, the str returned should be identity-equal if (j == 0 && eol_pos == str.length() && str.isStr()) { runtime->listAdd(thread, result, str); return *result; } Str substr(&scope, strSubstr(thread, str, j, eol_pos - j)); runtime->listAdd(thread, result, substr); } return *result; } RawObject strStripSpace(Thread* thread, const Str& src) { word length = src.length(); if (length == 0) { return *src; } if (length == 1 && ASCII::isSpace(src.byteAt(0))) { return Str::empty(); } word first = 0; while (first < length) { word num_bytes; int32_t ch = src.codePointAt(first, &num_bytes); if (!Unicode::isSpace(ch)) { break; } first += num_bytes; } word last = length; while (last > first) { last = src.offsetByCodePoints(last, -1); word num_bytes; int32_t ch = src.codePointAt(last, &num_bytes); if (!Unicode::isSpace(ch)) { last += num_bytes; break; } } return strSubstr(thread, src, first, last - first); } RawObject strStripSpaceLeft(Thread* thread, const Str& src) { word length = src.length(); if (length == 0) { return *src; } if (length == 1 && ASCII::isSpace(src.byteAt(0))) { return Str::empty(); } word first = 0; while (first < length) { word num_bytes; int32_t ch = src.codePointAt(first, &num_bytes); if (!Unicode::isSpace(ch)) { break; } first += num_bytes; } return strSubstr(thread, src, first, length - first); } RawObject strStripSpaceRight(Thread* thread, const Str& src) { word length = src.length(); if (length == 0) { return *src; } if (length == 1 && ASCII::isSpace(src.byteAt(0))) { return Str::empty(); } word last = length; while (last > 0) { last = src.offsetByCodePoints(last, -1); word num_bytes; int32_t ch = src.codePointAt(last, &num_bytes); if (!Unicode::isSpace(ch)) { last += num_bytes; break; } } return strSubstr(thread, src, 0, last); } RawObject strStrip(Thread* thread, const Str& src, const Str& str) { word length = src.length(); if (length == 0 || str.length() == 0) { return *src; } word first = strSpan(src, str); word last = strRSpan(src, str, first); return strSubstr(thread, src, first, length - first - last); } RawObject strStripLeft(Thread* thread, const Str& src, const Str& str) { word length = src.length(); if (length == 0 || str.length() == 0) { return *src; } word first = strSpan(src, str); return strSubstr(thread, src, first, length - first); } RawObject strStripRight(Thread* thread, const Str& src, const Str& str) { word length = src.length(); if (length == 0 || str.length() == 0) { return *src; } word first = 0; word last = strRSpan(src, str, first); return strSubstr(thread, src, 0, length - last); } RawObject strTranslateASCII(Thread* thread, const Str& src, const Str& table) { if (table.length() > kMaxASCII || !table.isASCII()) { return Unbound::object(); } word src_len = src.length(); word table_len = table.length(); HandleScope scope(thread); MutableBytes result(&scope, thread->runtime()->newMutableBytesUninitialized(src_len)); // Since all non-ASCII bytes in UTF-8 have a 1 in front, we can iterate by // bytes instead of codepoints for (word i = 0; i < src.length(); i++) { byte to_translate = src.byteAt(i); if (to_translate > table_len) { result.byteAtPut(i, to_translate); } else { result.byteAtPut(i, table.byteAt(to_translate)); } } return result.becomeStr(); } RawObject strIteratorNext(Thread* thread, const StrIterator& iter) { HandleScope scope(thread); word byte_offset = iter.index(); Str underlying(&scope, iter.iterable()); if (byte_offset >= underlying.length()) { return Error::noMoreItems(); } word num_bytes = 0; word code_point = underlying.codePointAt(byte_offset, &num_bytes); iter.setIndex(byte_offset + num_bytes); return RawSmallStr::fromCodePoint(code_point); } static const BuiltinAttribute kUserStrBaseAttributes[] = { {ID(_UserStr__value), RawUserStrBase::kValueOffset, AttributeFlags::kHidden}, }; static const BuiltinAttribute kStrIteratorAttributes[] = { {ID(_str_iterator__iterable), RawStrIterator::kIterableOffset, AttributeFlags::kHidden}, {ID(_str_iterator__index), RawStrIterator::kIndexOffset, AttributeFlags::kHidden}, }; void initializeStrTypes(Thread* thread) { HandleScope scope(thread); Runtime* runtime = thread->runtime(); Type str(&scope, addBuiltinType(thread, ID(str), LayoutId::kStr, /*superclass_id=*/LayoutId::kObject, kUserStrBaseAttributes, UserStrBase::kSize, /*basetype=*/true)); { Type type(&scope, addImmediateBuiltinType(thread, ID(largestr), LayoutId::kLargeStr, /*builtin_base=*/LayoutId::kStr, /*superclass_id=*/LayoutId::kObject, /*basetype=*/false)); Layout::cast(type.instanceLayout()).setDescribedType(*str); runtime->setLargeStrType(type); } { Type type(&scope, addImmediateBuiltinType(thread, ID(smallstr), LayoutId::kSmallStr, /*builtin_base=*/LayoutId::kStr, /*superclass_id=*/LayoutId::kObject, /*basetype=*/false)); Layout::cast(type.instanceLayout()).setDescribedType(*str); runtime->setSmallStrType(type); } addBuiltinType(thread, ID(str_iterator), LayoutId::kStrIterator, /*superclass_id=*/LayoutId::kObject, kStrIteratorAttributes, StrIterator::kSize, /*basetype=*/false); } RawObject METH(str, __add__)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!runtime->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return runtime->strConcat(thread, self, other); } RawObject METH(str, __bool__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); return Bool::fromBool(*self != Str::empty()); } RawObject METH(str, __contains__)(Thread* thread, Arguments args) { RawObject self = args.get(0); RawObject other = args.get(1); Runtime* runtime = thread->runtime(); bool is_self_invalid = true; if (runtime->isInstanceOfStr(self)) { is_self_invalid = false; if (runtime->isInstanceOfStr(other)) { return Bool::fromBool(strUnderlying(self).includes(strUnderlying(other))); } } HandleScope scope(thread); Object invalid(&scope, is_self_invalid ? self : other); return thread->raiseRequiresType(invalid, ID(str)); } RawObject METH(str, __eq__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(self.equals(*other)); } RawObject METH(str, __format__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); Object spec_obj(&scope, args.get(1)); if (!runtime->isInstanceOfStr(*spec_obj)) { return thread->raiseWithFmt(LayoutId::kTypeError, "__format__() argument 1 must be str, not %T", &spec_obj); } Str spec(&scope, strUnderlying(*spec_obj)); if (spec.length() == 0) { return *self; } FormatSpec format; Object possible_error(&scope, parseFormatSpec(thread, spec, /*default_type=*/'s', /*default_align=*/'<', &format)); if (!possible_error.isNoneType()) { DCHECK(possible_error.isErrorException(), "expected exception"); return *possible_error; } if (format.type != 's') { return raiseUnknownFormatError(thread, format.type, self_obj); } if (format.positive_sign != '\0') { return thread->raiseWithFmt(LayoutId::kValueError, "Sign not allowed in string format specifier"); } if (format.alternate) { return thread->raiseWithFmt( LayoutId::kValueError, "Alternate form (#) not allowed in string format specifier"); } if (format.alignment == '=') { return thread->raiseWithFmt( LayoutId::kValueError, "'=' alignment not allowed in string format specifier"); } return formatStr(thread, self, &format); } RawObject METH(str, __ge__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(self.compare(*other) >= 0); } bool METH(str, __getitem___intrinsic)(Thread* thread) { RawObject arg0 = thread->stackPeek(1); if (!arg0.isStr()) { return false; } RawObject arg1 = thread->stackPeek(0); word idx; if (arg1.isSmallInt()) { idx = RawSmallInt::cast(arg1).value(); } else if (arg1.isBool()) { idx = RawBool::cast(arg1).value(); } else { word start, stop; if (!tryUnpackSlice(arg1, &start, &stop)) { return false; } { // Manually adjust slice bounds to avoid an extra call to codePointLength HandleScope scope(thread); Str self(&scope, arg0); word start_index = adjustedStrIndex(self, start); word stop_index = adjustedStrIndex(self, stop); word length = stop_index - start_index; thread->stackDrop(2); thread->stackSetTop(length <= 0 ? Str::empty() : strSubstr(thread, self, start_index, length)); } return true; } RawStr self = Str::cast(arg0); word len = self.length(); if (0 <= idx && idx < len) { word offset = self.offsetByCodePoints(0, idx); if (offset < len) { word ignored; thread->stackDrop(2); thread->stackSetTop( RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored))); return true; } } if (0 > idx) { word offset = self.offsetByCodePoints(len, idx); if (offset < len && offset != -1) { word ignored; thread->stackDrop(2); thread->stackSetTop( RawSmallStr::fromCodePoint(self.codePointAt(offset, &ignored))); return true; } } return false; } RawObject METH(str, __gt__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(self.compare(*other) > 0); } RawObject METH(str, __hash__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self)) { return thread->raiseRequiresType(self, ID(str)); } Str self_str(&scope, strUnderlying(*self)); return SmallInt::fromWord(strHash(thread, *self_str)); } void strInternInTuple(Thread* thread, const Object& items) { HandleScope scope(thread); Runtime* runtime = thread->runtime(); DCHECK(runtime->isInstanceOfTuple(*items), "items must be a tuple instance"); Tuple tuple(&scope, tupleUnderlying(*items)); Str str(&scope, Str::empty()); Object result(&scope, NoneType::object()); for (word i = 0; i < tuple.length(); i++) { str = tuple.at(i); result = Runtime::internStr(thread, str); if (result.isError()) continue; if (result != str) { tuple.atPut(i, *result); } } } static bool allNameChars(const Str& str) { for (word i = 0; i < str.length(); i++) { byte c = str.byteAt(i); if (!ASCII::isAlnum(c) && c != '_') { return false; } } return true; } static bool strInternConstant(Thread* thread, Object* object) { if (object->isStr()) { HandleScope scope(thread); Str str(&scope, **object); if (allNameChars(str)) { str = Runtime::internStr(thread, str); if (str != *object) { *object = *str; return true; } } return false; } if (object->isTuple()) { HandleScope scope(thread); Tuple tuple(&scope, **object); Object item(&scope, NoneType::object()); bool modified = false; for (word i = 0, length = tuple.length(); i < length; i++) { item = tuple.at(i); if (strInternConstant(thread, &item)) { modified = true; tuple.atPut(i, *item); } } if (!modified) return false; *object = *tuple; return true; } if (object->isFrozenSet()) { HandleScope scope(thread); FrozenSet set(&scope, **object); FrozenSet new_set(&scope, thread->runtime()->newFrozenSet()); Object value(&scope, NoneType::object()); Object hash(&scope, NoneType::object()); bool modified = false; for (word j = 0, idx = 0; setNextItem(set, &idx, &value); j++) { if (strInternConstant(thread, &value)) { modified = true; } hash = Interpreter::hash(thread, value); if (hash.isErrorException()) return false; setAdd(thread, new_set, value, SmallInt::cast(*hash).value()); } if (!modified) return false; *object = *new_set; return true; } return false; } bool strInternConstants(Thread* thread, const Object& items) { HandleScope scope(thread); DCHECK(thread->runtime()->isInstanceOfTuple(*items), "items must be a tuple instance"); Object tuple(&scope, tupleUnderlying(*items)); return strInternConstant(thread, &tuple); } word strFind(const Str& haystack, const Str& needle) { word haystack_len = haystack.length(); word needle_len = needle.length(); if (needle_len > haystack_len) { return -1; } if (needle_len == 0) { return 0; } if (needle_len == 1 && haystack.isASCII()) { return strFindAsciiChar(haystack, needle.byteAt(0)); } // Loop is in byte space, not code point space word result = 0; // TODO(T41400083): Use a different search algorithm for (word i = 0; i <= haystack_len - needle_len; result++) { if (strHasPrefix(haystack, needle, i)) { return result; } i = haystack.offsetByCodePoints(i, 1); } return -1; } word strFindWithRange(const Str& haystack, const Str& needle, word start, word end) { if (end < 0 || start < 0) { Slice::adjustSearchIndices(&start, &end, haystack.codePointLength()); } word start_index = haystack.offsetByCodePoints(0, start); if (start_index == haystack.length() && needle.length() > 0) { // Haystack is too small; fast early return return -1; } word end_index = haystack.offsetByCodePoints(start_index, end - start); if ((end_index - start_index) < needle.length() || start_index > end_index) { // Haystack is too small; fast early return return -1; } // Loop is in byte space, not code point space word result = start; // TODO(T41400083): Use a different search algorithm for (word i = start_index; i <= end_index - needle.length(); result++) { bool has_match = strHasPrefix(haystack, needle, i); word next = haystack.offsetByCodePoints(i, 1); if (i == next) { // We've reached a fixpoint; offsetByCodePoints will not advance past the // length of the string. if (start_index >= i) { // The start is greater than the length of the string. return -1; } // If the start is within bounds, just return the last found index. break; } if (has_match) { return result; } i = next; } return -1; } word strFindAsciiChar(const Str& haystack, byte needle) { DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`"); for (word i = 0, length = haystack.length(); i < length; i++) { if (haystack.byteAt(i) == needle) { return i; } } return -1; } word strFindFirstNonWhitespace(const Str& str) { word i = 0; for (word codepoint_len, length = str.length(); i < length; i += codepoint_len) { if (!Unicode::isSpace(str.codePointAt(i, &codepoint_len))) return i; } return i; } bool strHasPrefix(const Str& str, const Str& prefix, word start) { word str_len = str.length(); word prefix_len = prefix.length(); if (str_len - start < prefix_len) { return false; } for (word i = 0; i < prefix_len; i++) { if (str.byteAt(start + i) != prefix.byteAt(i)) { return false; } } return true; } bool strHasSurrogate(const Str& str) { return str.includesByte(UTF8::kSurrogateLeadByte); } word strRFind(const Str& haystack, const Str& needle, word start, word end) { // Haystack slice is empty; fast early return if (start > end) return -1; // Needle is empty if (needle == Str::empty()) return end; word start_index = haystack.offsetByCodePoints(0, start); if (start_index == haystack.length()) { // Haystack is too small; fast early return return -1; } word end_index = haystack.offsetByCodePoints(start_index, end - start); if ((end_index - start_index) < needle.length() || start_index > end_index) { // Haystack is too small; fast early return return -1; } // Loop is in byte space, not code point space // Invariant: cp_offset and byte_offset describe the same offset into the // string, but one is in code point space and the other is in byte space // TODO(T41400083): Use a different search algorithm for (word cp_offset = end - 1, byte_offset = haystack.offsetByCodePoints(end_index, -1); byte_offset >= 0; cp_offset--, byte_offset = haystack.offsetByCodePoints(byte_offset, -1)) { if (strHasPrefix(haystack, needle, byte_offset)) { return cp_offset; } } return -1; } word strRFindAsciiChar(const Str& haystack, byte needle) { DCHECK(needle <= kMaxASCII, "must only be called for ASCII `needle`"); for (word i = haystack.length() - 1; i >= 0; i--) { if (haystack.byteAt(i) == needle) { return i; } } return -1; } RawObject METH(str, __le__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(self.compare(*other) <= 0); } RawObject METH(str, __len__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); return SmallInt::fromWord(self.codePointLength()); } static RawObject strLowerASCII(Thread* thread, Object& str_obj, Str& str, word length) { if (str.isSmallStr()) { byte buf[SmallStr::kMaxLength]; for (word i = 0; i < length; i++) { buf[i] = ASCII::toLower(str.byteAt(i)); } return SmallStr::fromBytes({buf, length}); } // Search for the first uppercase character. word first_uppercase = 0; for (; first_uppercase < length; first_uppercase++) { if (ASCII::isUpper(str.byteAt(first_uppercase))) { break; } } if (first_uppercase >= length && str_obj.isStr()) { return *str_obj; } HandleScope scope(thread); Runtime* runtime = thread->runtime(); MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); result.replaceFromWithStr(0, *str, first_uppercase); for (word i = first_uppercase; i < length; i++) { byte lower = ASCII::toLower(str.byteAt(i)); result.byteAtPut(i, lower); } return result.becomeStr(); } RawObject METH(str, casefold)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word length = self.length(); if (self.isASCII()) { return strLowerASCII(thread, self_obj, self, length); } // Search for the first uppercase character. word first_uppercase = 0; for (word char_length; first_uppercase < length; first_uppercase += char_length) { int32_t code_point = self.codePointAt(first_uppercase, &char_length); if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point) || Unicode::isUnfolded(code_point)) { break; } } if (first_uppercase >= length && self_obj.isStr()) { return *self_obj; } StrArray result(&scope, runtime->newStrArray()); runtime->strArrayEnsureCapacity(thread, result, length); // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write // directly to the underlying MutableBytes. MutableBytes result_bytes(&scope, result.items()); result_bytes.replaceFromWithStr(0, *self, first_uppercase); result.setNumItems(first_uppercase); for (word char_length, i = first_uppercase; i < length; i += char_length) { struct FullCasing casefold = Unicode::toFolded(self.codePointAt(i, &char_length)); for (word j = 0; j < 3; j++) { int32_t decoded = casefold.code_points[j]; if (decoded == -1) break; runtime->strArrayAddCodePoint(thread, result, decoded); } } return runtime->strFromStrArray(result); } RawObject METH(str, lower)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word length = self.length(); if (self.isASCII()) { return strLowerASCII(thread, self_obj, self, length); } // Search for the first uppercase character. word first_uppercase = 0; for (word char_length; first_uppercase < length; first_uppercase += char_length) { int32_t code_point = self.codePointAt(first_uppercase, &char_length); if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { break; } } if (first_uppercase >= length && self_obj.isStr()) { return *self_obj; } StrArray result(&scope, runtime->newStrArray()); runtime->strArrayEnsureCapacity(thread, result, length); // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write // directly to the underlying MutableBytes. MutableBytes result_bytes(&scope, result.items()); result_bytes.replaceFromWithStr(0, *self, first_uppercase); result.setNumItems(first_uppercase); for (word char_length, i = first_uppercase; i < length; i += char_length) { struct FullCasing lower = Unicode::toLower(self.codePointAt(i, &char_length)); for (word j = 0; j < 3; j++) { int32_t decoded = lower.code_points[j]; if (decoded == -1) break; runtime->strArrayAddCodePoint(thread, result, decoded); } } return runtime->strFromStrArray(result); } RawObject METH(str, title)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); bool previous_is_cased = false; StrArray result(&scope, runtime->newStrArray()); for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); FullCasing mapped = previous_is_cased ? Unicode::toLower(code_point) : Unicode::toTitle(code_point); for (word j = 0; mapped.code_points[j] != -1; j++) { runtime->strArrayAddCodePoint(thread, result, mapped.code_points[j]); } previous_is_cased = Unicode::isCased(code_point); } return runtime->strFromStrArray(result); } RawObject METH(str, upper)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word length = self.length(); if (self.isASCII()) { if (self.isSmallStr()) { byte buf[SmallStr::kMaxLength]; for (word i = 0; i < length; i++) { buf[i] = ASCII::toUpper(self.byteAt(i)); } return SmallStr::fromBytes({buf, length}); } // Search for the first lowercase character. word first_lowercase = 0; for (; first_lowercase < length; first_lowercase++) { if (ASCII::isLower(self.byteAt(first_lowercase))) { break; } } if (first_lowercase >= length && self_obj.isStr()) { return *self_obj; } MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length)); result.replaceFromWithStr(0, *self, first_lowercase); for (word i = first_lowercase; i < length; i++) { byte lower = ASCII::toUpper(self.byteAt(i)); result.byteAtPut(i, lower); } return result.becomeStr(); } // Search for the first lowercase character. word first_lowercase = 0; for (word char_length; first_lowercase < length; first_lowercase += char_length) { int32_t code_point = self.codePointAt(first_lowercase, &char_length); if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) { break; } } if (first_lowercase >= length && self_obj.isStr()) { return *self_obj; } StrArray result(&scope, runtime->newStrArray()); runtime->strArrayEnsureCapacity(thread, result, length); // Since the prefix is valid UTF-8 and guaranteed to fit, it's safe to write // directly to the underlying MutableBytes. MutableBytes result_bytes(&scope, result.items()); result_bytes.replaceFromWithStr(0, *self, first_lowercase); result.setNumItems(first_lowercase); for (word char_length, i = first_lowercase; i < length; i += char_length) { struct FullCasing upper = Unicode::toUpper(self.codePointAt(i, &char_length)); for (word j = 0; j < 3; j++) { int32_t decoded = upper.code_points[j]; if (decoded == -1) break; runtime->strArrayAddCodePoint(thread, result, decoded); } } return runtime->strFromStrArray(result); } RawObject METH(str, __lt__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(self.compare(*other) < 0); } RawObject METH(str, __mul__)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Object count_index(&scope, args.get(1)); Object count_obj(&scope, intFromIndex(thread, count_index)); if (count_obj.isError()) return *count_obj; word count = intUnderlying(*count_obj).asWordSaturated(); if (!SmallInt::isValid(count)) { return thread->raiseWithFmt(LayoutId::kOverflowError, "cannot fit '%T' into an index-sized integer", &count_obj); } Str self(&scope, *self_obj); word length = self.length(); if (count <= 0 || length == 0) { return Str::empty(); } word new_length; if (__builtin_mul_overflow(length, count, &new_length) || !SmallInt::isValid(new_length)) { return thread->raiseWithFmt(LayoutId::kOverflowError, "repeated string is too long"); } return runtime->strRepeat(thread, self, count); } RawObject METH(str, __ne__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Object other_obj(&scope, args.get(1)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } if (!thread->runtime()->isInstanceOfStr(*other_obj)) { return NotImplementedType::object(); } Str self(&scope, strUnderlying(*self_obj)); Str other(&scope, strUnderlying(*other_obj)); return Bool::fromBool(!self.equals(*other)); } RawObject METH(str, __iter__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); Runtime* runtime = thread->runtime(); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); return runtime->newStrIterator(self); } RawObject METH(str, __repr__)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); const word self_len = self.length(); word result_len = 0; word squote = 0; word dquote = 0; // Precompute the size so that only one allocation is necessary. for (word i = 0, char_len; i < self_len; i += char_len) { int32_t code_point = self.codePointAt(i, &char_len); if (code_point == '\'') { squote++; result_len += 1; } else if (code_point == '"') { dquote++; result_len += 1; } else if (code_point == '\\' || code_point == '\t' || code_point == '\r' || code_point == '\n') { result_len += 2; } else if (Unicode::isPrintable(code_point)) { result_len += char_len; } else if (code_point < 0x100) { result_len += 4; } else if (code_point < 0x10000) { result_len += 6; } else { result_len += 10; } } byte quote = '\''; bool unchanged = (result_len == self_len); if (squote > 0) { unchanged = false; // If there are both single quotes and double quotes, the outer quote will // be singles, and all internal quotes will need to be escaped. if (dquote > 0) { // Add the size of the escape backslashes on the single quotes. result_len += squote; } else { quote = '"'; } } result_len += 2; // quotes MutableBytes buf(&scope, runtime->newMutableBytesUninitialized(result_len)); buf.byteAtPut(0, quote); buf.byteAtPut(result_len - 1, quote); if (unchanged) { // Remaining characters were unmodified, so copy them directly. buf.replaceFromWithStr(1, *self, self_len); return buf.becomeStr(); } word out = 1; for (word in = 0, char_len; in < self_len; in += char_len) { int32_t code_point = self.codePointAt(in, &char_len); if (code_point == quote) { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, quote); } else if (code_point == '\\') { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, '\\'); } else if (code_point == '\t') { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 't'); } else if (code_point == '\r') { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'r'); } else if (code_point == '\n') { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'n'); } else if (' ' <= code_point && code_point < kMaxASCII) { buf.byteAtPut(out++, code_point); } else if (code_point <= kMaxASCII) { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'x'); uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2, code_point); out += 2; } else if (Unicode::isPrintable(code_point)) { for (word i = 0; i < char_len; i++) { buf.byteAtPut(out + i, self.byteAt(in + i)); } out += char_len; } else if (code_point <= 0xff) { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'x'); uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/2, code_point); out += 2; } else if (code_point <= 0xffff) { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'u'); uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/4, code_point); out += 4; } else { buf.byteAtPut(out++, '\\'); buf.byteAtPut(out++, 'U'); uwordToHexadecimalWithMutableBytes(*buf, /*index=*/out, /*num_digits=*/8, code_point); out += 8; } } DCHECK(out == result_len - 1, "wrote %ld characters, expected %ld", out - 1, result_len - 2); return buf.becomeStr(); } bool METH(str, _mod_convert_number_int_intrinsic)(Thread* thread) { RawObject arg = thread->stackTop(); if (arg.isBool()) { thread->stackDrop(2); thread->stackSetTop(convertBoolToInt(arg)); return true; } if (arg.isInt()) { thread->stackDrop(2); thread->stackSetTop(arg); return true; } return false; } bool METH(str, _mod_convert_number_index_intrinsic)(Thread* thread) { RawObject arg = thread->stackTop(); if (arg.isBool()) { thread->stackDrop(2); thread->stackSetTop(convertBoolToInt(arg)); return true; } if (arg.isInt()) { thread->stackDrop(2); thread->stackSetTop(arg); return true; } return false; } bool METH(str, _mod_check_single_arg_intrinsic)(Thread* thread) { Runtime* runtime = thread->runtime(); RawObject arg = thread->stackTop(); if (runtime->isInstanceOfTuple(arg)) { RawTuple arg_tuple = tupleUnderlying(arg); if (arg_tuple.length() != 1) return false; thread->stackDrop(2); thread->stackSetTop(arg_tuple); return true; } RawMutableTuple result = MutableTuple::cast(runtime->newMutableTuple(1)); // Note that we need to re-fetch stackTop() since newMutableTuple() may // have triggered GC. result.atPut(0, thread->stackTop()); thread->stackDrop(2); thread->stackSetTop(result.becomeImmutable()); return true; } RawObject METH(str, isalnum)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isAlnum(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } RawObject METH(str, isalpha)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isAlpha(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } RawObject METH(str, isascii)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); return Bool::fromBool(self.isASCII()); } RawObject METH(str, isdecimal)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isDecimal(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } RawObject METH(str, isdigit)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isDigit(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } bool strIsIdentifier(const Str& str) { word char_length = str.length(); if (char_length == 0) { return false; } word len; int32_t first = str.codePointAt(0, &len); if (!Unicode::isXidStart(first) && first != '_') { return false; } for (word i = len; i < char_length; i += len) { int32_t code_point = str.codePointAt(i, &len); if (!Unicode::isXidContinue(code_point)) { return false; } } return true; } RawObject METH(str, isidentifier)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); return Bool::fromBool(strIsIdentifier(self)); } RawObject METH(str, islower)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); bool cased = false; for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { return Bool::falseObj(); } if (!cased && Unicode::isLower(code_point)) { cased = true; } } return Bool::fromBool(cased); } RawObject METH(str, isnumeric)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isNumeric(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } RawObject METH(str, isprintable)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (!Unicode::isPrintable(code_point)) { return Bool::falseObj(); } } return Bool::trueObj(); } RawObject METH(str, isspace)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); if (char_length == 0) { return Bool::falseObj(); } if (char_length == 1) { return ASCII::isSpace(self.byteAt(0)) ? Bool::trueObj() : Bool::falseObj(); } word byte_index = 0; do { word num_bytes; int32_t codepoint = self.codePointAt(byte_index, &num_bytes); if (!Unicode::isSpace(codepoint)) { return Bool::falseObj(); } byte_index += num_bytes; } while (byte_index < char_length); return Bool::trueObj(); } RawObject METH(str, istitle)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); bool cased = false; bool previous_is_cased = false; word char_length = self.length(); for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (Unicode::isUpper(code_point) || Unicode::isTitle(code_point)) { if (previous_is_cased) return Bool::falseObj(); cased = true; previous_is_cased = true; } else if (Unicode::isLower(code_point)) { if (!previous_is_cased) return Bool::falseObj(); previous_is_cased = true; cased = true; } else { previous_is_cased = false; } } return Bool::fromBool(cased); } RawObject METH(str, isupper)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!thread->runtime()->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); bool cased = false; for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (Unicode::isLower(code_point) || Unicode::isTitle(code_point)) { return Bool::falseObj(); } if (!cased && Unicode::isUpper(code_point)) { cased = true; } } return Bool::fromBool(cased); } RawObject METH(str, lstrip)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str str(&scope, strUnderlying(*self_obj)); Object other_obj(&scope, args.get(1)); if (other_obj.isNoneType()) { return strStripSpaceLeft(thread, str); } if (!runtime->isInstanceOfStr(*other_obj)) { return thread->raiseWithFmt(LayoutId::kTypeError, "str.lstrip() arg must be None or str"); } Str chars(&scope, strUnderlying(*other_obj)); return strStripLeft(thread, str, chars); } RawObject METH(str, rstrip)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str str(&scope, strUnderlying(*self_obj)); Object other_obj(&scope, args.get(1)); if (other_obj.isNoneType()) { return strStripSpaceRight(thread, str); } if (!runtime->isInstanceOfStr(*other_obj)) { return thread->raiseWithFmt(LayoutId::kTypeError, "str.rstrip() arg must be None or str"); } Str chars(&scope, strUnderlying(*other_obj)); return strStripRight(thread, str, chars); } RawObject METH(str, strip)(Thread* thread, Arguments args) { Runtime* runtime = thread->runtime(); HandleScope scope(thread); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str str(&scope, strUnderlying(*self_obj)); Object other_obj(&scope, args.get(1)); if (other_obj.isNoneType()) { return strStripSpace(thread, str); } if (!runtime->isInstanceOfStr(*other_obj)) { return thread->raiseWithFmt(LayoutId::kTypeError, "str.strip() arg must be None or str"); } Str chars(&scope, strUnderlying(*other_obj)); return strStrip(thread, str, chars); } static int32_t handleCapitalSigma(const Str& str, word i) { bool final_sigma = false; for (word j = str.offsetByCodePoints(i, -1), len; j >= 0; j = str.offsetByCodePoints(j, -1)) { int32_t code_point = str.codePointAt(j, &len); if (!Unicode::isCaseIgnorable(code_point)) { final_sigma = Unicode::isCased(code_point); break; } } if (!final_sigma) { return 0x03C3; } word char_length = str.length(); for (word j = str.offsetByCodePoints(i, 1), len; j < char_length; j += len) { int32_t code_point = str.codePointAt(j, &len); if (!Unicode::isCaseIgnorable(code_point)) { return Unicode::isCased(code_point) ? 0x03C3 : 0x03C2; } } return 0x03C2; } RawObject METH(str, swapcase)(Thread* thread, Arguments args) { HandleScope scope(thread); Runtime* runtime = thread->runtime(); Object self_obj(&scope, args.get(0)); if (!runtime->isInstanceOfStr(*self_obj)) { return thread->raiseRequiresType(self_obj, ID(str)); } Str self(&scope, strUnderlying(*self_obj)); word char_length = self.length(); // Most of the time, this will be sufficient. However, due to Unicode casing, // it's possible that we could need up to 3 times as much space as the input. StrArray result(&scope, runtime->newStrArray()); runtime->strArrayEnsureCapacity(thread, result, char_length); for (word i = 0, len; i < char_length; i += len) { int32_t code_point = self.codePointAt(i, &len); if (Unicode::isUpper(code_point)) { if (code_point == 0x03a3) { runtime->strArrayAddCodePoint(thread, result, handleCapitalSigma(self, i)); } else { FullCasing lower = Unicode::toLower(code_point); for (word j = 0; j < 3; j++) { int32_t decoded = lower.code_points[j]; if (decoded == -1) break; runtime->strArrayAddCodePoint(thread, result, decoded); } } } else { FullCasing upper = Unicode::toUpper(code_point); for (word j = 0; j < 3; j++) { int32_t decoded = upper.code_points[j]; if (decoded == -1) break; runtime->strArrayAddCodePoint(thread, result, decoded); } } } return runtime->strFromStrArray(result); } RawObject METH(str_iterator, __iter__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self(&scope, args.get(0)); if (!self.isStrIterator()) { return thread->raiseRequiresType(self, ID(str_iterator)); } return *self; } // TODO(T35578204) Implement this for UTF-8. This probably means keeping extra // state and logic so that __next__() will advance to the next codepoint. RawObject METH(str_iterator, __next__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self(&scope, args.get(0)); if (!self.isStrIterator()) { return thread->raiseRequiresType(self, ID(str_iterator)); } StrIterator iter(&scope, *self); Object value(&scope, strIteratorNext(thread, iter)); if (value.isError()) { return thread->raise(LayoutId::kStopIteration, NoneType::object()); } return *value; } RawObject METH(str_iterator, __length_hint__)(Thread* thread, Arguments args) { HandleScope scope(thread); Object self(&scope, args.get(0)); if (!self.isStrIterator()) { return thread->raiseRequiresType(self, ID(str_iterator)); } StrIterator str_iterator(&scope, *self); Str str(&scope, str_iterator.iterable()); return SmallInt::fromWord(str.length() - str_iterator.index()); } } // namespace py

runtime/str-builtins.cpp (1,680 lines of code) (raw):