ext/Objects/unicodeobject.cpp (2,270 lines of code) (raw):
// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
// unicodeobject.c implementation
#include <cerrno>
#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <cwchar>
#include "cpython-data.h"
#include "cpython-func.h"
#include "api-handle.h"
#include "bytearray-builtins.h"
#include "bytes-builtins.h"
#include "handles.h"
#include "modules.h"
#include "objects.h"
#include "runtime.h"
#include "str-builtins.h"
#include "unicode.h"
#include "utils.h"
// Name of the encoding used as the file system default; fixed to UTF-8.
const char* Py_FileSystemDefaultEncoding = "utf-8";
// Flag exported alongside the encoding above; always set to 1 here.
int Py_HasFileSystemDefaultEncoding = 1;
// Name of the error handler paired with the file system default encoding
// (see PyUnicode_EncodeFSDefault).
const char* Py_FileSystemDefaultEncodeErrors = "surrogatepass";
namespace py {
// Code unit types for the 1-byte and 2-byte PyUnicode kinds.
typedef byte Py_UCS1;
typedef uint16_t Py_UCS2;
// Number of decimal digits in 2**63-1. NOTE: this counts digits only; it does
// not include room for a sign or a terminating NUL.
static const int kMaxLongLongChars = 19; // len(str(2**63-1))
// When overallocation is enabled, writer buffers are grown by an additional
// 1/kOverallocateFactor of the required size.
static const int kOverallocateFactor = 4;
// Returns the runtime's str iterator type as a borrowed reference.
PY_EXPORT PyTypeObject* PyUnicodeIter_Type_Ptr() {
  Runtime* rt = Thread::current()->runtime();
  RawObject str_iter_type = rt->typeAt(LayoutId::kStrIterator);
  auto handle = ApiHandle::borrowedReference(rt, str_iter_type);
  return reinterpret_cast<PyTypeObject*>(handle);
}
// Converts a C error-handler name into a str object. A null name is treated
// as "strict". The names "strict", "ignore" and "replace" are looked up in the
// runtime's symbol table; any other name is interned from the C string.
static RawObject symbolFromError(Thread* thread, const char* error) {
  Symbols* table = thread->runtime()->symbols();
  if (error == nullptr) return table->at(ID(strict));
  if (std::strcmp(error, "strict") == 0) return table->at(ID(strict));
  if (std::strcmp(error, "ignore") == 0) return table->at(ID(ignore));
  if (std::strcmp(error, "replace") == 0) return table->at(ID(replace));
  return Runtime::internStrFromCStr(thread, error);
}
// Stores `value` at position `index` of the character buffer `data`, whose
// element width is selected by `kind`. The value is narrowed to the element
// type of the buffer.
PY_EXPORT void PyUnicode_WRITE_Func(enum PyUnicode_Kind kind, void* data,
                                    Py_ssize_t index, Py_UCS4 value) {
  switch (kind) {
    case PyUnicode_1BYTE_KIND: {
      Py_UCS1* narrow = static_cast<Py_UCS1*>(data);
      narrow[index] = static_cast<Py_UCS1>(value);
      return;
    }
    case PyUnicode_2BYTE_KIND: {
      Py_UCS2* medium = static_cast<Py_UCS2*>(data);
      medium[index] = static_cast<Py_UCS2>(value);
      return;
    }
    default: {
      // Anything that is not 1- or 2-byte is expected to be 4-byte.
      DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
      Py_UCS4* wide = static_cast<Py_UCS4*>(data);
      wide[index] = static_cast<Py_UCS4>(value);
      return;
    }
  }
}
// Frees the character buffer owned by `writer`.
PY_EXPORT void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter* writer) {
  void* buffer = writer->data;
  PyMem_Free(buffer);
}
// Builds a str from the first writer->pos UTF-32 code points stored in the
// writer's buffer, frees that buffer, and returns a new reference to the str.
PY_EXPORT PyObject* _PyUnicodeWriter_Finish(_PyUnicodeWriter* writer) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Str str(&scope, runtime->newStrFromUTF32(View<int32_t>(
                      static_cast<int32_t*>(writer->data), writer->pos)));
  PyMem_Free(writer->data);
  // Do not leave a dangling pointer behind: a later call to
  // _PyUnicodeWriter_Dealloc on the same writer must not free the buffer a
  // second time.
  writer->data = nullptr;
  return ApiHandle::newReference(runtime, *str);
}
// Resets every field of `writer` to zero and selects the 4-byte kind, which
// is the only buffer representation the writer functions in this file use.
PY_EXPORT void _PyUnicodeWriter_Init(_PyUnicodeWriter* writer) {
  std::memset(writer, 0, sizeof(_PyUnicodeWriter));
  writer->kind = PyUnicode_4BYTE_KIND;
}
// Grows the writer's buffer so that `length` more code points fit after the
// current position.
//
// The maxchar argument is ignored: the buffer always stores 4-byte elements,
// so writer->maxchar is simply set to kMaxUnicode. When writer->overallocate
// is set, an additional 1/kOverallocateFactor of the required capacity is
// requested to limit the number of reallocations.
//
// Returns 0 on success. On failure MemoryError is raised and -1 is returned;
// any existing buffer is left untouched and remains owned by the writer.
static int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter* writer,
                                            Py_ssize_t length,
                                            Py_UCS4 /* maxchar */) {
  writer->maxchar = kMaxUnicode;
  // The buffer holds int32_t elements; cap the element count so that the byte
  // size computed below cannot overflow.
  const word max_elements = kMaxWord / static_cast<word>(sizeof(int32_t));
  if (length > max_elements - writer->pos) {
    Thread::current()->raiseMemoryError();
    return -1;
  }
  Py_ssize_t newlen = writer->pos + length;
  if (writer->data != nullptr && newlen <= writer->size) {
    // The existing buffer is already large enough; keep its recorded size.
    return 0;
  }
  if (writer->overallocate &&
      newlen <= (max_elements - newlen / kOverallocateFactor)) {
    // overallocate to limit the number of realloc()
    newlen += newlen / kOverallocateFactor;
  }
  size_t num_bytes = static_cast<size_t>(newlen) * sizeof(int32_t);
  // Keep the old pointer until the allocation is known to have succeeded so
  // that a failed realloc does not leak the existing buffer.
  void* new_data = (writer->data == nullptr)
                       ? PyMem_Malloc(num_bytes)
                       : PyMem_Realloc(writer->data, num_bytes);
  if (new_data == nullptr) {
    Thread::current()->raiseMemoryError();
    return -1;
  }
  writer->data = new_data;
  writer->size = newlen;
  return 0;
}
// Ensures the writer can accept `length` more code points. Returns 0 right
// away when nothing is requested or the spare capacity already suffices;
// otherwise defers to _PyUnicodeWriter_PrepareInternal and returns its result.
PY_EXPORT int _PyUnicodeWriter_Prepare(_PyUnicodeWriter* writer,
                                       Py_ssize_t length, Py_UCS4 maxchar) {
  if (length == 0) return 0;
  Py_ssize_t available = writer->size - writer->pos;
  if (length <= available) return 0;
  return _PyUnicodeWriter_PrepareInternal(writer, length, maxchar);
}
// Appends `len` characters of the ASCII string `ascii` to the writer. A `len`
// of -1 means "use strlen(ascii)". Every character must be ASCII; a non-ASCII
// byte trips a CHECK.
//
// Returns 0 on success and -1 on failure.
PY_EXPORT int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter* writer,
                                                const char* ascii,
                                                Py_ssize_t len) {
  if (len == -1) len = std::strlen(ascii);
  if (writer->data == nullptr && !writer->overallocate) {
    // First write without overallocation: allocate exactly `len` elements.
    void* buffer = PyMem_Malloc(len * sizeof(int32_t));
    if (buffer == nullptr) {
      // Without this check the loop below would write through a null pointer.
      Thread::current()->raiseMemoryError();
      return -1;
    }
    writer->data = buffer;
    writer->size = len;
  }
  if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
  Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
  for (Py_ssize_t i = 0; i < len; ++i) {
    CHECK(ascii[i] >= 0, "_PyUnicodeWriter_WriteASCIIString only takes ASCII");
    data[writer->pos++] = static_cast<uint8_t>(ascii[i]);
  }
  return 0;
}
// Appends the single code point `ch` to the writer, growing the buffer if
// needed. Returns 0 on success and -1 if room could not be reserved.
PY_EXPORT int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter* writer,
                                               Py_UCS4 ch) {
  int status = _PyUnicodeWriter_Prepare(writer, 1, ch);
  if (status < 0) return -1;
  PyUnicode_WRITE(PyUnicode_4BYTE_KIND, writer->data, writer->pos, ch);
  writer->pos += 1;
  return 0;
}
// Exported entry point for appending one code point; forwards to
// _PyUnicodeWriter_WriteCharInline and returns its result.
PY_EXPORT int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter* writer, Py_UCS4 ch) {
  int status = _PyUnicodeWriter_WriteCharInline(writer, ch);
  return status;
}
// Appends `len` bytes of `str` to the writer, treating every byte as one code
// point in the range 0-255. Returns 0 on success and -1 if room could not be
// reserved.
PY_EXPORT int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter* writer,
                                                 const char* str,
                                                 Py_ssize_t len) {
  if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
  Py_UCS4* out = static_cast<Py_UCS4*>(writer->data);
  for (Py_ssize_t remaining = len; remaining > 0; --remaining, ++str) {
    // Go through uint8_t so bytes >= 0x80 are not sign-extended.
    out[writer->pos] = static_cast<uint8_t>(*str);
    writer->pos++;
  }
  return 0;
}
// Appends every code point of the str object `str` to the writer. Returns 0
// on success and -1 if room could not be reserved.
PY_EXPORT int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter* writer,
                                        PyObject* str) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object str_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Str source(&scope, strUnderlying(*str_obj));
  Py_ssize_t num_code_points = source.codePointLength();
  int status = _PyUnicodeWriter_Prepare(writer, num_code_points, kMaxUnicode);
  if (status == -1) {
    return -1;
  }
  Py_UCS4* out = static_cast<Py_UCS4*>(writer->data);
  // Walk the string by byte offset; codePointAt reports how many bytes each
  // code point occupies.
  word num_bytes = source.length();
  word offset = 0;
  while (offset < num_bytes) {
    word char_bytes;
    out[writer->pos] = source.codePointAt(offset, &char_bytes);
    writer->pos++;
    offset += char_bytes;
  }
  return 0;
}
// Appends the code points of `str` in the range [start, end) to the writer.
// `start` and `end` are converted to byte offsets with Thread::strOffset
// before copying. Returns 0 on success (including when `end` is 0, in which
// case nothing is written) and -1 if room could not be reserved.
PY_EXPORT int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter* writer,
                                              PyObject* str, Py_ssize_t start,
                                              Py_ssize_t end) {
  if (end == 0) return 0;
  if (_PyUnicodeWriter_Prepare(writer, end - start, kMaxUnicode) < 0) {
    return -1;
  }
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object str_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Str source(&scope, strUnderlying(*str_obj));
  word first_byte = thread->strOffset(source, start);
  DCHECK_BOUND(first_byte, source.length());
  word last_byte = thread->strOffset(source, end);
  DCHECK_BOUND(last_byte, source.length());
  Py_UCS4* out = static_cast<Py_UCS4*>(writer->data);
  word offset = first_byte;
  while (offset < last_byte) {
    word char_bytes;
    out[writer->pos] = source.codePointAt(offset, &char_bytes);
    writer->pos++;
    offset += char_bytes;
  }
  return 0;
}
// Facebook: D13491655
// Most of the following helper functions, along with PyUnicode_FromFormat and
// PyUnicode_FromFormatV are directly imported from CPython. The following
// modifications have been made:
//
// - Since our internal strings are always UTF-8, we don't need maxchar or any
// of the helper functions required to calculate it
//
// - Since our strings are immutable, we can't use PyUnicode_Fill. However,
// since the helper functions always use it to append to strings, we can get
// away with just writing characters in a loop.
//
// - Since our internal strings are always UTF-8, there is no need to check
// a character's 'Kind' before writing it to a string
// Appends the str object `str` to the writer, honoring printf-style `width`
// and `precision`. A precision other than -1 limits how many characters of
// `str` are written; when `width` exceeds that count the output is padded on
// the left with spaces. Returns 0 on success and -1 on failure.
static int writeStr(_PyUnicodeWriter* writer, PyObject* str, Py_ssize_t width,
                    Py_ssize_t precision) {
  if (PyUnicode_READY(str) == -1) return -1;
  Py_ssize_t num_chars = PyUnicode_GET_LENGTH(str);
  const bool truncated = precision != -1 && precision < num_chars;
  if (!truncated && width <= num_chars) {
    // Neither truncation nor padding is needed: copy the whole string.
    return _PyUnicodeWriter_WriteStr(writer, str);
  }
  if (truncated) num_chars = precision;
  // Our internal strings are always UTF-8, so no maxchar is needed.
  Py_ssize_t reserved = width > num_chars ? width : num_chars;
  if (_PyUnicodeWriter_Prepare(writer, reserved, 0) == -1) return -1;
  // Our strings are immutable, so padding is written one character at a time
  // instead of using PyUnicode_Fill.
  for (Py_ssize_t padding = width - num_chars; padding > 0; --padding) {
    if (_PyUnicodeWriter_WriteCharInline(writer, ' ') == -1) return -1;
  }
  // There is only one internal representation, so no 'Kind' conversion is
  // needed before copying.
  return _PyUnicodeWriter_WriteSubstring(writer, str, 0, num_chars);
}
// Appends the UTF-8 C string `str` to the writer. A precision other than -1
// caps the number of *bytes* taken from `str`; the bytes are decoded with the
// "replace" error handler and the result is written with the given width.
// Returns 0 on success and -1 on failure.
static int writeCStr(_PyUnicodeWriter* writer, const char* str,
                     Py_ssize_t width, Py_ssize_t precision) {
  Py_ssize_t num_bytes = std::strlen(str);
  if (precision != -1 && precision < num_bytes) num_bytes = precision;
  PyObject* decoded =
      PyUnicode_DecodeUTF8Stateful(str, num_bytes, "replace", nullptr);
  if (decoded == nullptr) return -1;
  // Precision was already applied to the byte string, so pass -1 here.
  int status = writeStr(writer, decoded, width, -1);
  Py_DECREF(decoded);
  return status;
}
// Formats one conversion specification of a PyUnicode_FromFormat-style format
// string.
//
// `f` points at the '%' that starts the specification. Arguments are consumed
// from `vargs` and the formatted text is appended to `writer`. Handled here:
// an optional '0' flag, a width, a '.precision', the length modifiers 'l',
// 'll' and 'z' (only in front of d/i/u), and the conversions c, d, i, u, x, p,
// s, U, V, S, R, A and '%'. On an unknown conversion the rest of the format
// string, starting at the '%', is copied verbatim.
//
// Returns a pointer to the first character after the consumed text, or nullptr
// on failure.
static const char* writeArg(_PyUnicodeWriter* writer, const char* f,
                            va_list* vargs) {
  const char* p = f;  // start of the specification, needed for the fallback
  f++;
  int zeropad = 0;
  if (*f == '0') {
    zeropad = 1;
    f++;
  }

  // parse the width.precision part, e.g. "%2.5s" => width=2, precision=5
  Py_ssize_t width = -1;
  if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
    width = *f - '0';
    f++;
    while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
      if (width > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
        Thread::current()->raiseWithFmt(LayoutId::kValueError, "width too big");
        return nullptr;
      }
      width = (width * 10) + (*f - '0');
      f++;
    }
  }
  Py_ssize_t precision = -1;
  if (*f == '.') {
    f++;
    if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
      precision = (*f - '0');
      f++;
      while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
        if (precision > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
          Thread::current()->raiseWithFmt(LayoutId::kValueError,
                                          "precision too big");
          return nullptr;
        }
        precision = (precision * 10) + (*f - '0');
        f++;
      }
    }
    if (*f == '%') {
      // "%.3%s" => f points to "3"
      f--;
    }
  }
  if (*f == '\0') {
    // bogus format "%.123" => go backward, f points to "3"
    f--;
  }

  // Handle %ld, %lu, %lld and %llu.
  int longflag = 0;
  int longlongflag = 0;
  int size_tflag = 0;
  if (*f == 'l') {
    if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
      longflag = 1;
      ++f;
    } else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
      longlongflag = 1;
      f += 2;
    }
  }
  // handle the size_t flag.
  else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
    size_tflag = 1;
    ++f;
  }

  // This is the last conversion in the format string: no further growth is
  // expected, so stop overallocating.
  if (f[1] == '\0') writer->overallocate = 0;

  switch (*f) {
    case 'c': {
      int ordinal = va_arg(*vargs, int);
      if (ordinal < 0 || ordinal > kMaxUnicode) {
        Thread::current()->raiseWithFmt(
            LayoutId::kOverflowError,
            "character argument not in range(0x110000)");
        return nullptr;
      }
      if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) return nullptr;
      break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x': {
      // A 64-bit value needs up to 20 characters ("-" plus 19 digits, or 20
      // digits when unsigned) and a terminating NUL. kMaxLongLongChars only
      // counts the digits of 2**63-1, so add room for the sign and the NUL.
      char buffer[kMaxLongLongChars + 2];
      static_assert(sizeof(buffer) >= 21,
                    "buffer must hold any 64-bit integer plus NUL");
      Py_ssize_t len;

      if (*f == 'u') {
        if (longflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%lu",
                              va_arg(*vargs, unsigned long));
        } else if (longlongflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%llu",
                              va_arg(*vargs, unsigned long long));
        } else if (size_tflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%" PY_FORMAT_SIZE_T "u",
                              va_arg(*vargs, size_t));
        } else {
          len = std::snprintf(buffer, sizeof(buffer), "%u",
                              va_arg(*vargs, unsigned int));
        }
      } else if (*f == 'x') {
        len = std::snprintf(buffer, sizeof(buffer), "%x", va_arg(*vargs, int));
      } else {
        if (longflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%li",
                              va_arg(*vargs, long));
        } else if (longlongflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%lli",
                              va_arg(*vargs, long long));
        } else if (size_tflag) {
          len = std::snprintf(buffer, sizeof(buffer), "%" PY_FORMAT_SIZE_T "i",
                              va_arg(*vargs, Py_ssize_t));
        } else {
          len = std::snprintf(buffer, sizeof(buffer), "%i",
                              va_arg(*vargs, int));
        }
      }
      DCHECK(len >= 0, "len must be >= 0");
      // snprintf reports the untruncated length; never claim more characters
      // than were actually stored.
      if (len >= static_cast<Py_ssize_t>(sizeof(buffer))) {
        len = static_cast<Py_ssize_t>(sizeof(buffer)) - 1;
      }

      if (precision < len) precision = len;

      Py_ssize_t arglen = Py_MAX(precision, width);
      if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) return nullptr;

      if (width > precision) {
        Py_ssize_t fill = width - precision;
        Py_UCS4 fillchar = zeropad ? '0' : ' ';
        // Our internal strings are immutable, so PyUnicode_Fill cannot be
        // used; write the padding one character at a time.
        for (Py_ssize_t i = 0; i < fill; ++i) {
          if (_PyUnicodeWriter_WriteCharInline(writer, fillchar) == -1) {
            return nullptr;
          }
        }
      }
      if (precision > len) {
        Py_ssize_t fill = precision - len;
        // Precision pads with leading zeros, again one character at a time.
        for (Py_ssize_t i = 0; i < fill; ++i) {
          if (_PyUnicodeWriter_WriteCharInline(writer, '0') == -1) {
            return nullptr;
          }
        }
      }

      if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) {
        return nullptr;
      }
      break;
    }

    case 'p': {
      char number[kMaxLongLongChars + 2];
      // Keep two bytes in reserve so that a missing "0x" prefix can be
      // inserted below without overrunning the buffer.
      const Py_ssize_t limit = static_cast<Py_ssize_t>(sizeof(number)) - 2;
      Py_ssize_t len =
          std::snprintf(number, limit, "%p", va_arg(*vargs, void*));
      DCHECK(len >= 0, "len must be >= 0");
      if (len >= limit) len = limit - 1;  // output was truncated

      // %p is ill-defined: ensure leading 0x.
      if (number[1] == 'X') {
        number[1] = 'x';
      } else if (number[1] != 'x') {
        std::memmove(number + 2, number, std::strlen(number) + 1);
        number[0] = '0';
        number[1] = 'x';
        len += 2;
      }

      if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) {
        return nullptr;
      }
      break;
    }

    case 's': {
      // UTF-8
      const char* s = va_arg(*vargs, const char*);
      if (writeCStr(writer, s, width, precision) < 0) {
        return nullptr;
      }
      break;
    }

    case 'U': {
      PyObject* obj = va_arg(*vargs, PyObject*);
      // This used to call _PyUnicode_CHECK, which is deprecated, and which we
      // have not imported.
      DCHECK(obj, "obj must not be null");

      if (writeStr(writer, obj, width, precision) == -1) {
        return nullptr;
      }
      break;
    }

    case 'V': {
      // Takes two arguments: a str object and a C string used as a fallback
      // when the object is null.
      PyObject* obj = va_arg(*vargs, PyObject*);
      const char* str = va_arg(*vargs, const char*);
      if (obj) {
        // This used to DCHECK _PyUnicode_CHECK, which is deprecated, and which
        // we have not imported.
        if (writeStr(writer, obj, width, precision) == -1) {
          return nullptr;
        }
      } else {
        DCHECK(str != nullptr, "str must not be null");
        if (writeCStr(writer, str, width, precision) < 0) {
          return nullptr;
        }
      }
      break;
    }

    case 'S': {
      PyObject* obj = va_arg(*vargs, PyObject*);
      DCHECK(obj, "obj must not be null");
      PyObject* str = PyObject_Str(obj);
      if (!str) return nullptr;
      if (writeStr(writer, str, width, precision) == -1) {
        Py_DECREF(str);
        return nullptr;
      }
      Py_DECREF(str);
      break;
    }

    case 'R': {
      PyObject* obj = va_arg(*vargs, PyObject*);
      DCHECK(obj, "obj must not be null");
      PyObject* repr = PyObject_Repr(obj);
      if (!repr) return nullptr;
      if (writeStr(writer, repr, width, precision) == -1) {
        Py_DECREF(repr);
        return nullptr;
      }
      Py_DECREF(repr);
      break;
    }

    case 'A': {
      PyObject* obj = va_arg(*vargs, PyObject*);
      DCHECK(obj, "obj must not be null");
      PyObject* ascii = PyObject_ASCII(obj);
      if (!ascii) return nullptr;
      if (writeStr(writer, ascii, width, precision) == -1) {
        Py_DECREF(ascii);
        return nullptr;
      }
      Py_DECREF(ascii);
      break;
    }

    case '%':
      if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) return nullptr;
      break;

    default: {
      // if we stumble upon an unknown formatting code, copy the rest
      // of the format string to the output string. (we cannot just
      // skip the code, since there's no way to know what's in the
      // argument list)
      Py_ssize_t len = std::strlen(p);
      if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) {
        return nullptr;
      }
      f = p + len;
      return f;
    }
  }

  f++;
  return f;
}
// Returns non-zero when the str object `unicode` has the same contents as the
// C string `c_str`, and 0 otherwise. Both arguments must be non-null and
// `unicode` must be a str instance (checked in debug builds only).
PY_EXPORT int _PyUnicode_EqualToASCIIString(PyObject* unicode,
                                            const char* c_str) {
  DCHECK(unicode, "nullptr argument");
  DCHECK(c_str, "nullptr argument");
  RawObject value = ApiHandle::fromPyObject(unicode)->asObject();
  DCHECK(Thread::current()->runtime()->isInstanceOfStr(value),
         "non-str argument");
  return strUnderlying(value).equalsCStr(c_str);
}
// Returns non-zero when the two str objects have equal contents, 0 otherwise.
// No type checking is performed on the arguments.
PY_EXPORT int _PyUnicode_EQ(PyObject* aa, PyObject* bb) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object left(&scope, ApiHandle::fromPyObject(aa)->asObject());
  Object right(&scope, ApiHandle::fromPyObject(bb)->asObject());
  Str left_str(&scope, strUnderlying(*left));
  Str right_str(&scope, strUnderlying(*right));
  return left_str.equals(*right_str);
}
// Returns the number of characters in the NUL-terminated Py_UNICODE string
// `u`, not counting the terminator. `u` must not be null.
PY_EXPORT size_t Py_UNICODE_strlen(const Py_UNICODE* u) {
  DCHECK(u != nullptr, "u should not be null");
  size_t num_chars = std::wcslen(u);
  return num_chars;
}
// Performs no work and always reports success (0); the argument is ignored.
PY_EXPORT int _PyUnicode_Ready(PyObject* /* unicode */) {
  return 0;
}
// Returns non-zero when `obj` is exactly a str (as reported by isStr).
PY_EXPORT int PyUnicode_CheckExact_Func(PyObject* obj) {
  RawObject value = ApiHandle::fromPyObject(obj)->asObject();
  return value.isStr();
}
// Returns non-zero when `obj` is an instance of str (as reported by
// Runtime::isInstanceOfStr).
PY_EXPORT int PyUnicode_Check_Func(PyObject* obj) {
  Runtime* runtime = Thread::current()->runtime();
  RawObject value = ApiHandle::fromPyObject(obj)->asObject();
  return runtime->isInstanceOfStr(value);
}
// Creates a str from the NUL-terminated C string `c_string` and returns a new
// reference to it.
PY_EXPORT PyObject* PyUnicode_FromString(const char* c_string) {
  Runtime* runtime = Thread::current()->runtime();
  RawObject str = runtime->newStrFromCStr(c_string);
  return ApiHandle::newReference(runtime, str);
}
// Scans str for a surrogate code point, beginning at byte offset `start`.
// Returns the byte offset of the first surrogate found, or -1 when the rest
// of the string contains none.
static word strFindSurrogateCodepoint(const Str& str, word start) {
  const word num_bytes = str.length();
  for (word offset = start, char_bytes; offset < num_bytes;
       offset += char_bytes) {
    if (Unicode::isSurrogate(str.codePointAt(offset, &char_bytes))) {
      return offset;
    }
  }
  return -1;
}
// Returns a NUL-terminated UTF-8 copy of the str object `pyunicode`. If
// `size` is non-null it receives the length in bytes, excluding the NUL.
//
// The buffer is allocated with malloc on first use and stored in the object's
// ApiHandle cache, so later calls return the same pointer.
//
// Returns nullptr with an exception raised when `pyunicode` is null or not a
// str, when the string contains a surrogate code point, or when the buffer
// cannot be allocated.
PY_EXPORT const char* PyUnicode_AsUTF8AndSize(PyObject* pyunicode,
                                              Py_ssize_t* size) {
  Thread* thread = Thread::current();
  if (pyunicode == nullptr) {
    thread->raiseBadArgument();
    return nullptr;
  }

  HandleScope scope(thread);
  ApiHandle* handle = ApiHandle::fromPyObject(pyunicode);
  Object obj(&scope, handle->asObject());
  Runtime* runtime = thread->runtime();
  if (!runtime->isInstanceOfStr(*obj)) {
    thread->raiseBadInternalCall();
    return nullptr;
  }

  Str str(&scope, strUnderlying(*obj));
  word length = str.length();
  if (size != nullptr) *size = length;
  if (void* cache = handle->cache(runtime)) {
    return static_cast<char*>(cache);
  }

  // Surrogates cannot be represented in strict UTF-8: build a
  // UnicodeEncodeError for the first one and raise it via the strict handler.
  word surr_index = strFindSurrogateCodepoint(str, 0);
  if (surr_index != -1) {
    Object encoding(&scope, SmallStr::fromCStr("utf-8"));
    Object start(&scope, SmallInt::fromWord(surr_index));
    Object end(&scope, SmallInt::fromWord(surr_index + 1));
    Object reason(&scope, runtime->newStrFromCStr("surrogates not allowed"));
    Object exc(&scope,
               thread->invokeFunction5(ID(builtins), ID(UnicodeEncodeError),
                                       encoding, str, start, end, reason));
    Object err(&scope,
               thread->invokeFunction1(ID(_codecs), ID(strict_errors), exc));
    DCHECK(err.isErrorException(),
           "_codecs.strict_errors should raise an exception");
    return nullptr;
  }

  byte* result = static_cast<byte*>(std::malloc(length + 1));
  if (result == nullptr) {
    // Without this check copyTo below would write through a null pointer.
    thread->raiseMemoryError();
    return nullptr;
  }
  str.copyTo(result, length);
  result[length] = '\0';
  handle->setCache(runtime, result);
  handle->setBorrowedNoImmediate();
  return reinterpret_cast<char*>(result);
}
// Same as PyUnicode_AsUTF8AndSize, for callers that do not need the length.
PY_EXPORT const char* PyUnicode_AsUTF8(PyObject* unicode) {
  Py_ssize_t* no_size = nullptr;
  return PyUnicode_AsUTF8AndSize(unicode, no_size);
}
// Creates a str from the first `size` bytes of `u` and returns a new
// reference. A negative `size` raises SystemError and returns nullptr.
// Passing a null `u` with a non-zero size is not supported yet.
PY_EXPORT PyObject* PyUnicode_FromStringAndSize(const char* u,
                                                Py_ssize_t size) {
  Thread* thread = Thread::current();
  if (size < 0) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "Negative size passed to PyUnicode_FromStringAndSize");
    return nullptr;
  }
  const bool wants_uninitialized = (u == nullptr) && (size != 0);
  if (wants_uninitialized) {
    // TODO(T36562134): Implement _PyUnicode_New
    UNIMPLEMENTED("_PyUnicode_New");
  }
  Runtime* runtime = thread->runtime();
  View<byte> bytes(reinterpret_cast<const byte*>(u), size);
  return ApiHandle::newReference(runtime, runtime->newStrWithAll(bytes));
}
// Encodes `unicode` as UTF-8 using the file system default error handler.
PY_EXPORT PyObject* PyUnicode_EncodeFSDefault(PyObject* unicode) {
  // TODO(T40363016): Allow arbitrary encodings instead of defaulting to utf-8
  const char* errors = Py_FileSystemDefaultEncodeErrors;
  return _PyUnicode_AsUTF8String(unicode, errors);
}
// Only the empty string can be created for now: a `size` of 0 returns a new
// reference to the empty str regardless of `maxchar`. An out-of-range
// `maxchar` or a negative `size` raises SystemError and returns nullptr. Any
// other request is not supported yet.
PY_EXPORT PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) {
  Thread* thread = Thread::current();
  // CPython special-cases the empty string before validating maxchar, so an
  // invalid maxchar must not fail here either.
  if (size == 0) {
    Runtime* runtime = thread->runtime();
    return ApiHandle::newReference(runtime, Str::empty());
  }
  const bool maxchar_out_of_range = maxchar > kMaxUnicode;
  if (maxchar_out_of_range) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "invalid maximum character passed to PyUnicode_New");
    return nullptr;
  }
  const bool negative_size = size < 0;
  if (negative_size) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "Negative size passed to PyUnicode_New");
    return nullptr;
  }
  // TODO(T41498010): Add modifiable string state
  UNIMPLEMENTED("Cannot create mutable strings yet");
}
// Replaces *p_left with the concatenation of *p_left and `right`, releasing
// the reference to the old left operand. If either operand is null or not a
// str, *p_left is cleared and (unless an exception is already pending) a bad
// internal call error is raised. If concatenation fails, *p_left ends up null.
PY_EXPORT void PyUnicode_Append(PyObject** p_left, PyObject* right) {
  if (p_left == nullptr) {
    if (PyErr_Occurred() == nullptr) {
      PyErr_BadInternalCall();
    }
    return;
  }
  PyObject* left = *p_left;
  const bool both_str = left != nullptr && right != nullptr &&
                        PyUnicode_Check(left) && PyUnicode_Check(right);
  if (!both_str) {
    if (PyErr_Occurred() == nullptr) {
      PyErr_BadInternalCall();
    }
    Py_CLEAR(*p_left);
    return;
  }
  PyObject* joined = PyUnicode_Concat(left, right);
  *p_left = joined;
  Py_DECREF(left);
}
// Like PyUnicode_Append, but additionally releases the caller's reference to
// `right` (which may be null) afterwards.
PY_EXPORT void PyUnicode_AppendAndDel(PyObject** p_left, PyObject* right) {
  PyUnicode_Append(p_left, right);
  if (right != nullptr) {
    Py_DECREF(right);
  }
}
// Encodes the str object `unicode` by calling _codecs.ascii_encode with the
// given error handler name and returns a new reference to the first element
// of the resulting tuple. Returns nullptr if `unicode` is not a str (bad
// argument error) or if the encoder call fails.
PY_EXPORT PyObject* _PyUnicode_AsASCIIString(PyObject* unicode,
                                             const char* errors) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object input(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*input)) {
    thread->raiseBadArgument();
    return nullptr;
  }
  Object handler(&scope, symbolFromError(thread, errors));
  Object encoded(&scope, thread->invokeFunction2(
                             ID(_codecs), ID(ascii_encode), input, handler));
  if (encoded.isError()) return nullptr;
  Tuple pair(&scope, *encoded);
  return ApiHandle::newReference(runtime, pair.at(0));
}
// ASCII-encodes `unicode` with the "strict" error handler.
PY_EXPORT PyObject* PyUnicode_AsASCIIString(PyObject* unicode) {
  const char* errors = "strict";
  return _PyUnicode_AsASCIIString(unicode, errors);
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsCharmapString(PyObject* /* unicode */,
PyObject* /* mapping */) {
UNIMPLEMENTED("PyUnicode_AsCharmapString");
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsDecodedObject(PyObject* /* unicode */,
const char* /* encoding */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_AsDecodedObject");
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsDecodedUnicode(PyObject* /* unicode */,
const char* /* encoding */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_AsDecodedUnicode");
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsEncodedObject(PyObject* /* unicode */,
const char* /* encoding */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_AsEncodedObject");
}
// Encodes the str object `unicode` with the named codec and returns a new
// reference to the resulting bytes object.
//
// - A null `encoding` selects UTF-8 via _PyUnicode_AsUTF8String.
// - Otherwise _codecs.encode is called; a null `errors` is passed on as
//   Unbound.
// - If the codec returns a bytearray, a RuntimeWarning is issued (failures
//   while warning are discarded) and the contents are returned as bytes.
// - If the codec returns anything else, TypeError is raised.
//
// Returns nullptr with an exception raised on failure.
PY_EXPORT PyObject* PyUnicode_AsEncodedString(PyObject* unicode,
                                              const char* encoding,
                                              const char* errors) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  if (encoding == nullptr) {
    return _PyUnicode_AsUTF8String(unicode, errors);
  }
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object str(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*str)) {
    thread->raiseBadArgument();
    return nullptr;
  }
  Object encoding_obj(&scope, runtime->newStrFromCStr(encoding));
  Object errors_obj(&scope, errors == nullptr
                                ? Unbound::object()
                                : symbolFromError(thread, errors));
  Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(encode), str,
                                                encoding_obj, errors_obj));
  if (result.isError()) {
    return nullptr;
  }
  if (runtime->isInstanceOfBytes(*result)) {
    return ApiHandle::newReference(runtime, *result);
  }
  if (runtime->isInstanceOfBytearray(*result)) {
    // Equivalent to calling PyErr_WarnFormat
    if (!ensureBuiltinModuleById(thread, ID(warnings)).isErrorException()) {
      Object category(&scope, runtime->typeAt(LayoutId::kRuntimeWarning));
      Object message(&scope,
                     runtime->newStrFromFmt(
                         "encoder %s returned bytearray instead of bytes; "
                         "use codecs.encode() to encode to arbitrary types",
                         encoding));
      Object stack_level(&scope, runtime->newInt(1));
      Object source(&scope, NoneType::object());
      Object err(&scope,
                 thread->invokeFunction4(ID(warnings), ID(warn), message,
                                         category, stack_level, source));
      if (err.isErrorException()) {
        // A failing warning must not turn a successful encode into an error.
        thread->clearPendingException();
      }
    }
    Bytearray result_bytearray(&scope, *result);
    return ApiHandle::newReference(runtime,
                                   bytearrayAsBytes(thread, result_bytearray));
  }
  // %T is given the address of a handle, matching every other %T use in this
  // file (e.g. PyUnicode_Compare, PyUnicode_AsWideChar).
  thread->raiseWithFmt(LayoutId::kTypeError,
                       "'%s' encoder returned '%T' instead of 'bytes'; "
                       "use codecs.encode() to encode to arbitrary types",
                       encoding, &result);
  return nullptr;
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsEncodedUnicode(PyObject* /* unicode */,
const char* /* encoding */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_AsEncodedUnicode");
}
// Encodes the str object `unicode` by calling _codecs.latin_1_encode with the
// given error handler name and returns a new reference to the first element
// of the resulting tuple. Returns nullptr if `unicode` is not a str (bad
// argument error) or if the encoder call fails.
PY_EXPORT PyObject* _PyUnicode_AsLatin1String(PyObject* unicode,
                                              const char* errors) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object input(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*input)) {
    thread->raiseBadArgument();
    return nullptr;
  }
  Object handler(&scope, symbolFromError(thread, errors));
  Object encoded(&scope, thread->invokeFunction2(
                             ID(_codecs), ID(latin_1_encode), input, handler));
  if (encoded.isError()) return nullptr;
  Tuple pair(&scope, *encoded);
  return ApiHandle::newReference(runtime, pair.at(0));
}
// Latin-1-encodes `unicode` with the "strict" error handler.
PY_EXPORT PyObject* PyUnicode_AsLatin1String(PyObject* unicode) {
  const char* errors = "strict";
  return _PyUnicode_AsLatin1String(unicode, errors);
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsMBCSString(PyObject* /* unicode */) {
UNIMPLEMENTED("PyUnicode_AsMBCSString");
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsRawUnicodeEscapeString(PyObject* /* unicode */) {
UNIMPLEMENTED("PyUnicode_AsRawUnicodeEscapeString");
}
// Copies the code points of the str object `u` into `buffer`, which has room
// for `buflen` elements. When `copy_null` is non-zero a terminating 0 is
// appended and must fit as well.
//
// Returns `buffer` on success. Returns nullptr with an exception raised when
// `buffer` is null or `buflen` is negative, when `u` is not a str, or when the
// buffer is too small (in which case buffer[0] is set to 0 if `copy_null` is
// set and the buffer is non-empty).
PY_EXPORT Py_UCS4* PyUnicode_AsUCS4(PyObject* u, Py_UCS4* buffer,
                                    Py_ssize_t buflen, int copy_null) {
  if (buffer == nullptr || buflen < 0) {
    PyErr_BadInternalCall();
    return nullptr;
  }

  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object obj(&scope, ApiHandle::fromPyObject(u)->asObject());
  if (!thread->runtime()->isInstanceOfStr(*obj)) {
    thread->raiseBadArgument();
    // Stop here: the object must not be treated as a str below.
    return nullptr;
  }

  Str str(&scope, strUnderlying(*obj));
  word num_codepoints = str.codePointLength();
  word target_buflen = copy_null ? num_codepoints + 1 : num_codepoints;
  if (buflen < target_buflen) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "string is longer than the buffer");
    if (copy_null != 0 && 0 < buflen) {
      buffer[0] = 0;
    }
    return nullptr;
  }

  // `offset` walks the string in bytes while `i` counts code points.
  for (word i = 0, offset = 0; i < num_codepoints; i++) {
    word num_bytes;
    buffer[i] = str.codePointAt(offset, &num_bytes);
    offset += num_bytes;
  }
  if (copy_null != 0) buffer[num_codepoints] = 0;

  return buffer;
}
// Returns a freshly allocated, 0-terminated copy of the code points of `str`.
// The buffer comes from PyMem_Malloc. Returns nullptr with an exception raised
// if the allocation or the copy fails.
PY_EXPORT Py_UCS4* PyUnicode_AsUCS4Copy(PyObject* str) {
  Py_ssize_t len = PyUnicode_GET_LENGTH(str) + 1;
  Py_UCS4* result = static_cast<Py_UCS4*>(PyMem_Malloc(len * sizeof(Py_UCS4)));
  if (result == nullptr) {
    PyErr_NoMemory();
    return nullptr;
  }
  Py_UCS4* copied = PyUnicode_AsUCS4(str, result, len, 1);
  if (copied == nullptr) {
    // The copy failed; release the buffer instead of leaking it.
    PyMem_Free(result);
  }
  return copied;
}
// Encodes `unicode` via _PyUnicode_EncodeUTF16 with a null error handler name
// and a byte order argument of 0.
PY_EXPORT PyObject* PyUnicode_AsUTF16String(PyObject* unicode) {
  const char* errors = nullptr;
  const int byteorder = 0;
  return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
// Encodes `unicode` via _PyUnicode_EncodeUTF32 with a null error handler name
// and a byte order argument of 0.
PY_EXPORT PyObject* PyUnicode_AsUTF32String(PyObject* unicode) {
  const char* errors = nullptr;
  const int byteorder = 0;
  return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
// UTF-8-encodes `unicode` with the "strict" error handler.
PY_EXPORT PyObject* PyUnicode_AsUTF8String(PyObject* unicode) {
  const char* errors = "strict";
  return _PyUnicode_AsUTF8String(unicode, errors);
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_AsUnicodeEscapeString(PyObject* /* unicode */) {
UNIMPLEMENTED("PyUnicode_AsUnicodeEscapeString");
}
// Copies up to `size` code points of the str object `str` into the wchar_t
// buffer `result`. If `size` is larger than the number of code points, a
// terminating NUL is written after them. `result` may be null, in which case
// nothing is written.
//
// Returns the number of code points copied (excluding the NUL), or -1 with an
// exception raised if `str` is null or not a str.
PY_EXPORT Py_ssize_t PyUnicode_AsWideChar(PyObject* str, wchar_t* result,
                                          Py_ssize_t size) {
  Thread* thread = Thread::current();
  if (str == nullptr) {
    thread->raiseBadInternalCall();
    return -1;
  }
  HandleScope scope(thread);
  Object str_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Runtime* runtime = thread->runtime();
  if (!runtime->isInstanceOfStr(*str_obj)) {
    thread->raiseWithFmt(
        LayoutId::kTypeError,
        "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
        &str_obj);
    return -1;
  }

  Str str_str(&scope, strUnderlying(*str_obj));
  Py_ssize_t num_code_points = str_str.codePointLength();
  if (size > num_code_points) {
    // Everything fits: copy all code points plus the terminating NUL.
    size = num_code_points + 1;
  } else {
    // Buffer too small for the NUL: copy only the first `size` code points.
    num_code_points = size;
  }

  {
    word byte_count = str_str.length();
    for (word byte_index = 0, wchar_index = 0, num_bytes = 0;
         byte_index < byte_count && wchar_index < size;
         byte_index += num_bytes, wchar_index += 1) {
      int32_t cp = str_str.codePointAt(byte_index, &num_bytes);
      static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t");
      if (result != nullptr) {
        result[wchar_index] = static_cast<wchar_t>(cp);
      }
    }
    // The loop above tolerates a null `result`; the terminator must too.
    if (result != nullptr && num_code_points < size) {
      result[num_code_points] = '\0';
    }
  }

  return num_code_points;
}
// Returns a NUL-terminated wchar_t copy of the str object `str`, allocated
// with PyMem_Malloc. If `result_len` is non-null it receives the number of
// code points (excluding the NUL).
//
// Returns nullptr with an exception raised if `str` is null or not a str, if
// allocation fails, or if the string contains an embedded NUL character.
PY_EXPORT wchar_t* PyUnicode_AsWideCharString(PyObject* str,
                                              Py_ssize_t* result_len) {
  Thread* thread = Thread::current();
  if (str == nullptr) {
    thread->raiseBadInternalCall();
    return nullptr;
  }
  HandleScope scope(thread);
  Object str_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Runtime* runtime = thread->runtime();
  if (!runtime->isInstanceOfStr(*str_obj)) {
    thread->raiseWithFmt(
        LayoutId::kTypeError,
        "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
        &str_obj);
    return nullptr;
  }
  Str source(&scope, strUnderlying(*str_obj));
  const word num_code_points = source.codePointLength();
  const word capacity = num_code_points + 1;  // room for the terminator
  wchar_t* buffer =
      static_cast<wchar_t*>(PyMem_Malloc(capacity * sizeof(wchar_t)));
  if (buffer == nullptr) {
    thread->raiseMemoryError();
    return nullptr;
  }

  const word num_bytes = source.length();
  word offset = 0;
  word out_index = 0;
  while (offset < num_bytes && out_index < capacity) {
    word char_bytes = 0;
    int32_t code_point = source.codePointAt(offset, &char_bytes);
    if (code_point == '\0') {
      PyMem_Free(buffer);
      thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
      return nullptr;
    }
    static_assert(sizeof(wchar_t) == sizeof(code_point),
                  "Requires 32bit wchar_t");
    buffer[out_index] = static_cast<wchar_t>(code_point);
    offset += char_bytes;
    out_index += 1;
  }
  buffer[num_code_points] = '\0';

  if (result_len != nullptr) {
    *result_len = num_code_points;
  }
  return buffer;
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_BuildEncodingMap(PyObject* /* string */) {
UNIMPLEMENTED("PyUnicode_BuildEncodingMap");
}
// Compares two str objects and returns -1, 0 or 1 depending on whether `left`
// sorts before, equal to, or after `right`. Returns -1 with an exception
// raised if either argument is null (bad internal call) or not a str
// (TypeError).
PY_EXPORT int PyUnicode_Compare(PyObject* left, PyObject* right) {
  Thread* thread = Thread::current();
  if (left == nullptr || right == nullptr) {
    thread->raiseBadInternalCall();
    return -1;
  }

  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  Object left_obj(&scope, ApiHandle::fromPyObject(left)->asObject());
  Object right_obj(&scope, ApiHandle::fromPyObject(right)->asObject());
  const bool both_str = runtime->isInstanceOfStr(*left_obj) &&
                        runtime->isInstanceOfStr(*right_obj);
  if (!both_str) {
    thread->raiseWithFmt(LayoutId::kTypeError, "Can't compare %T and %T",
                         &left_obj, &right_obj);
    return -1;
  }
  Str lhs(&scope, strUnderlying(*left_obj));
  Str rhs(&scope, strUnderlying(*right_obj));
  // Normalize the comparison result to exactly -1, 0 or 1.
  word order = lhs.compare(*rhs);
  if (order > 0) return 1;
  if (order < 0) return -1;
  return 0;
}
// Compares the str object `uni` against the C string `str` and returns the
// result of Str::compareCStr. No type checking is performed on `uni`.
PY_EXPORT int PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object uni_obj(&scope, ApiHandle::fromPyObject(uni)->asObject());
  Str uni_str(&scope, strUnderlying(*uni_obj));
  // TODO(atalaba): Allow for proper comparison against Latin-1 strings. For
  // example, in CPython: "\xC3\xA9" (UTF-8) == "\xE9" (Latin-1), and
  // "\xE9 longer" > "\xC3\xA9".
  return uni_str.compareCStr(str);
}
// Returns a new reference to the concatenation of two str objects. Returns
// nullptr with TypeError raised if either operand is not a str, or with
// OverflowError raised if the combined byte length overflows.
PY_EXPORT PyObject* PyUnicode_Concat(PyObject* left, PyObject* right) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();

  Object left_obj(&scope, ApiHandle::fromPyObject(left)->asObject());
  Object right_obj(&scope, ApiHandle::fromPyObject(right)->asObject());
  const bool both_str = runtime->isInstanceOfStr(*left_obj) &&
                        runtime->isInstanceOfStr(*right_obj);
  if (!both_str) {
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "can only concatenate str to str");
    return nullptr;
  }
  Str lhs(&scope, strUnderlying(*left_obj));
  Str rhs(&scope, strUnderlying(*right_obj));
  // Only the overflow flag matters; the sum itself is not used.
  word combined_length;
  if (__builtin_add_overflow(lhs.length(), rhs.length(), &combined_length)) {
    thread->raiseWithFmt(LayoutId::kOverflowError,
                         "strings are too large to concat");
    return nullptr;
  }
  return ApiHandle::newReference(runtime,
                                 runtime->strConcat(thread, lhs, rhs));
}
// Calls str.__contains__(str, substr) and returns its boolean result as an
// int. Returns -1 on error; if the method itself could not be found a
// TypeError is raised.
PY_EXPORT int PyUnicode_Contains(PyObject* str, PyObject* substr) {
  DCHECK(str != nullptr, "str should not be null");
  DCHECK(substr != nullptr, "substr should not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object haystack(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object needle(&scope, ApiHandle::fromPyObject(substr)->asObject());
  Object found(&scope,
               thread->invokeMethodStatic2(LayoutId::kStr, ID(__contains__),
                                           haystack, needle));
  if (!found.isError()) {
    DCHECK(found.isBool(), "result of __contains__ should be bool");
    return Bool::cast(*found).value();
  }
  if (found.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "could not call str.__contains__");
  }
  return -1;
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
// All parameters are unnamed and unused.
PY_EXPORT Py_ssize_t PyUnicode_CopyCharacters(PyObject*, Py_ssize_t, PyObject*,
Py_ssize_t, Py_ssize_t) {
UNIMPLEMENTED("PyUnicode_CopyCharacters");
}
// Stub: not implemented in this runtime; calling it invokes UNIMPLEMENTED.
PY_EXPORT Py_ssize_t PyUnicode_Count(PyObject* /* str */, PyObject* /* substr */,
Py_ssize_t /* start */, Py_ssize_t /* end */) {
UNIMPLEMENTED("PyUnicode_Count");
}
// Decodes `size` bytes starting at `c_str` with the named codec and returns a
// new reference to the result. A null `encoding` selects UTF-8 via
// PyUnicode_DecodeUTF8Stateful; otherwise _codecs.decode is called. Returns
// nullptr if decoding fails.
PY_EXPORT PyObject* PyUnicode_Decode(const char* c_str, Py_ssize_t size,
                                     const char* encoding, const char* errors) {
  DCHECK(c_str != nullptr, "c_str cannot be null");
  if (encoding == nullptr) {
    return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr);
  }

  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  View<byte> raw(reinterpret_cast<const byte*>(c_str), size);
  Bytes input(&scope, runtime->newBytesWithAll(raw));
  Object handler(&scope, symbolFromError(thread, errors));
  Object codec_name(&scope, runtime->newStrFromCStr(encoding));
  Object decoded(&scope, thread->invokeFunction3(ID(_codecs), ID(decode),
                                                 input, codec_name, handler));
  if (decoded.isError()) return nullptr;
  return ApiHandle::newReference(runtime, *decoded);
}
// Decodes `size` bytes at `c_str` by calling _codecs.ascii_decode with the
// error handler selected by `errors`. Returns a new reference to element 0 of
// the tuple that call produces, or nullptr on error (SystemError is raised
// when the function itself could not be found).
PY_EXPORT PyObject* PyUnicode_DecodeASCII(const char* c_str, Py_ssize_t size,
                                          const char* errors) {
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  View<byte> input(reinterpret_cast<const byte*>(c_str), size);
  Bytes encoded(&scope, runtime->newBytesWithAll(input));
  Str handler(&scope, symbolFromError(thread, errors));
  Object outcome(&scope,
                 thread->invokeFunction2(ID(_codecs), ID(ascii_decode), encoded,
                                         handler));
  if (!outcome.isError()) {
    Tuple decoded(&scope, *outcome);
    return ApiHandle::newReference(runtime, decoded.at(0));
  }
  if (outcome.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "could not call _codecs.ascii_decode");
  }
  return nullptr;
}
// Stub for the C-API function PyUnicode_DecodeCharmap. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeCharmap(const char* /* s */,
Py_ssize_t /* size */,
PyObject* /* mapping */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_DecodeCharmap");
}
// Stub for the C-API function PyUnicode_DecodeCodePageStateful. Not
// implemented here: every call reaches the UNIMPLEMENTED macro. Parameter
// names follow the CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeCodePageStateful(int /* code_page */,
const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */,
Py_ssize_t* /* consumed */) {
UNIMPLEMENTED("PyUnicode_DecodeCodePageStateful");
}
// Builds a str from the NUL-terminated `c_str` via Runtime::newStrFromCStr and
// returns a new reference to it.
PY_EXPORT PyObject* PyUnicode_DecodeFSDefault(const char* c_str) {
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  RawObject decoded = runtime->newStrFromCStr(c_str);
  return ApiHandle::newReference(runtime, decoded);
}
// Builds a str from the first `size` bytes at `c_str` via
// Runtime::newStrWithAll and returns a new reference to it.
PY_EXPORT PyObject* PyUnicode_DecodeFSDefaultAndSize(const char* c_str,
                                                     Py_ssize_t size) {
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  const byte* data = reinterpret_cast<const byte*>(c_str);
  View<byte> contents(data, size);
  return ApiHandle::newReference(runtime, runtime->newStrWithAll(contents));
}
// Decodes `size` bytes at `c_str` by calling _codecs.latin_1_decode; the
// `errors` argument is ignored. Returns a new reference to element 0 of the
// tuple that call produces, or nullptr on error (SystemError is raised when
// the function itself could not be found).
PY_EXPORT PyObject* PyUnicode_DecodeLatin1(const char* c_str, Py_ssize_t size,
                                           const char* /* errors */) {
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  View<byte> input(reinterpret_cast<const byte*>(c_str), size);
  Bytes encoded(&scope, runtime->newBytesWithAll(input));
  Object outcome(&scope, thread->invokeFunction1(
                             ID(_codecs), ID(latin_1_decode), encoded));
  if (!outcome.isError()) {
    Tuple decoded(&scope, *outcome);
    return ApiHandle::newReference(runtime, decoded.at(0));
  }
  if (outcome.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kSystemError,
                         "could not call _codecs.latin_1_decode");
  }
  return nullptr;
}
// Convenience wrapper around PyUnicode_DecodeLocaleAndSize that measures the
// NUL-terminated input with strlen.
PY_EXPORT PyObject* PyUnicode_DecodeLocale(const char* str,
                                           const char* errors) {
  const Py_ssize_t length = std::strlen(str);
  return PyUnicode_DecodeLocaleAndSize(str, length, errors);
}
// Decodes `len` bytes at `str` through _Py_DecodeLocaleEx and returns the
// result as a new str object.
//
// `errors` must be null, "strict" or "surrogateescape"; anything else raises
// ValueError. The input must be NUL-terminated at `str[len]` and contain no
// earlier NUL byte, otherwise ValueError is raised. A decoder status of -2 is
// reported through a UnicodeDecodeError handed to PyCodec_StrictErrors; any
// other non-zero status is reported with PyErr_NoMemory.
PY_EXPORT PyObject* PyUnicode_DecodeLocaleAndSize(const char* str,
                                                  Py_ssize_t len,
                                                  const char* errors) {
  _Py_error_handler handler;
  const bool wants_strict =
      errors == nullptr || std::strcmp(errors, "strict") == 0;
  if (wants_strict) {
    handler = _Py_ERROR_STRICT;
  } else if (std::strcmp(errors, "surrogateescape") == 0) {
    handler = _Py_ERROR_SURROGATEESCAPE;
  } else {
    Thread::current()->raiseWithFmt(
        LayoutId::kValueError,
        "only 'strict' and 'surrogateescape' error handlers "
        "are supported, not '%s'",
        errors);
    return nullptr;
  }

  if (str[len] != '\0' || static_cast<size_t>(len) != std::strlen(str)) {
    Thread::current()->raiseWithFmt(LayoutId::kValueError,
                                    "embedded null byte");
    return nullptr;
  }

  wchar_t* wide;
  size_t wide_len;
  const char* reason;
  const int status =
      _Py_DecodeLocaleEx(str, &wide, &wide_len, &reason, 1, handler);
  if (status == 0) {
    PyObject* unicode = PyUnicode_FromWideChar(wide, wide_len);
    PyMem_RawFree(wide);
    return unicode;
  }

  if (status != -2) {
    PyErr_NoMemory();
    return nullptr;
  }
  // On a -2 status `wide_len` is passed as the start position of the error.
  PyObject* exc =
      PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", "locale", str,
                            len, wide_len, wide_len + 1, reason);
  if (exc != nullptr) {
    PyCodec_StrictErrors(exc);
    Py_DECREF(exc);
  }
  return nullptr;
}
// Stub for the C-API function PyUnicode_DecodeMBCS. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeMBCS(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_DecodeMBCS");
}
// Stub for the C-API function PyUnicode_DecodeMBCSStateful. Not implemented
// here: every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeMBCSStateful(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */,
Py_ssize_t* /* consumed */) {
UNIMPLEMENTED("PyUnicode_DecodeMBCSStateful");
}
// Stub for the C-API function PyUnicode_DecodeRawUnicodeEscape. Not
// implemented here: every call reaches the UNIMPLEMENTED macro. Parameter
// names follow the CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeRawUnicodeEscape(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_DecodeRawUnicodeEscape");
}
// Stub for the C-API function PyUnicode_DecodeUTF16. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF16(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */, int* /* byteorder */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF16");
}
// Stub for the C-API function PyUnicode_DecodeUTF16Stateful. Not implemented
// here: every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF16Stateful(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */,
int* /* byteorder */,
Py_ssize_t* /* consumed */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF16Stateful");
}
// Stub for the C-API function PyUnicode_DecodeUTF32. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF32(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */, int* /* byteorder */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF32");
}
// Stub for the C-API function PyUnicode_DecodeUTF32Stateful. Not implemented
// here: every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF32Stateful(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */,
int* /* byteorder */,
Py_ssize_t* /* consumed */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF32Stateful");
}
// Stub for the C-API function PyUnicode_DecodeUTF7. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF7(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF7");
}
// Stub for the C-API function PyUnicode_DecodeUTF7Stateful. Not implemented
// here: every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_DecodeUTF7Stateful(const char* /* s */,
Py_ssize_t /* size */,
const char* /* errors */,
Py_ssize_t* /* consumed */) {
UNIMPLEMENTED("PyUnicode_DecodeUTF7Stateful");
}
// One-shot UTF-8 decode: forwards to PyUnicode_DecodeUTF8Stateful with a null
// `consumed` pointer.
PY_EXPORT PyObject* PyUnicode_DecodeUTF8(const char* c_str, Py_ssize_t size,
                                         const char* errors) {
  Py_ssize_t* const no_consumed = nullptr;
  return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, no_consumed);
}
// Decodes `size` bytes at `c_str` as UTF-8 and returns a new reference to the
// resulting str, or nullptr on error.
//
// Fast path: when every byte is <= kMaxASCII the str is built directly from
// the bytes. Otherwise the work is delegated to _codecs.utf_8_decode, passing
// the error handler derived from `errors` and a "final" flag that is true
// exactly when `consumed` is null.
//
// When `consumed` is non-null it receives `size` on the fast path, or element
// 1 of the tuple returned by _codecs.utf_8_decode on the slow path.
PY_EXPORT PyObject* PyUnicode_DecodeUTF8Stateful(const char* c_str,
                                                 Py_ssize_t size,
                                                 const char* errors,
                                                 Py_ssize_t* consumed) {
  DCHECK(c_str != nullptr, "c_str cannot be null");

  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();

  // Scan for the first byte above the ASCII range.
  word i = 0;
  const byte* byte_str = reinterpret_cast<const byte*>(c_str);
  for (; i < size; ++i) {
    if (byte_str[i] > kMaxASCII) break;
  }
  if (i == size) {
    if (consumed != nullptr) {
      *consumed = size;
    }
    return ApiHandle::newReference(runtime,
                                   runtime->newStrWithAll({byte_str, size}));
  }

  Object bytes(&scope, runtime->newBytesWithAll(View<byte>({byte_str, size})));
  Object errors_obj(&scope, symbolFromError(thread, errors));
  Object is_final(&scope, Bool::fromBool(consumed == nullptr));
  Object result_obj(
      &scope, thread->invokeFunction3(ID(_codecs), ID(utf_8_decode), bytes,
                                      errors_obj, is_final));
  if (result_obj.isError()) {
    if (result_obj.isErrorNotFound()) {
      // Name the function that was actually looked up above.
      thread->raiseWithFmt(LayoutId::kSystemError,
                           "could not call _codecs.utf_8_decode");
    }
    return nullptr;
  }
  Tuple result(&scope, *result_obj);
  if (consumed != nullptr) {
    *consumed = Int::cast(result.at(1)).asWord();
  }
  return ApiHandle::newReference(runtime, result.at(0));
}
// Decodes a unicode-escape encoded buffer via _PyUnicode_DecodeUnicodeEscape.
// If that helper reports an invalid escape sequence a DeprecationWarning is
// issued; when issuing the warning fails the decoded object is released and
// nullptr is returned.
PY_EXPORT PyObject* PyUnicode_DecodeUnicodeEscape(const char* c_str,
                                                  Py_ssize_t size,
                                                  const char* errors) {
  DCHECK(c_str != nullptr, "c_str cannot be null");
  const char* bad_escape;
  PyObject* decoded =
      _PyUnicode_DecodeUnicodeEscape(c_str, size, errors, &bad_escape);
  // `bad_escape` is only inspected after a successful decode.
  if (decoded == nullptr || bad_escape == nullptr) {
    return decoded;
  }
  const int warn_status = PyErr_WarnFormat(
      PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'",
      static_cast<byte>(*bad_escape));
  if (warn_status < 0) {
    Py_DECREF(decoded);
    return nullptr;
  }
  return decoded;
}
// Decodes `size` bytes at `c_str` by calling
// _codecs._unicode_escape_decode_stateful with the error handler derived from
// `errors`. Returns a new reference to element 0 of the resulting tuple, or
// nullptr on error.
//
// `*first_invalid_escape` is always reset to nullptr first. After a successful
// call, if element 2 of the result tuple is an index greater than -1, it is
// set to point at that offset inside `c_str`.
PY_EXPORT PyObject* _PyUnicode_DecodeUnicodeEscape(
    const char* c_str, Py_ssize_t size, const char* errors,
    const char** first_invalid_escape) {
  DCHECK(c_str != nullptr, "c_str cannot be null");
  DCHECK(first_invalid_escape != nullptr,
         "first_invalid_escape cannot be null");

  // So we can remember if we've seen an invalid escape char or not
  *first_invalid_escape = nullptr;

  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object bytes(&scope, runtime->newBytesWithAll(View<byte>(
                           reinterpret_cast<const byte*>(c_str), size)));
  Object errors_obj(&scope, symbolFromError(thread, errors));
  Object result_obj(
      &scope,
      thread->invokeFunction2(ID(_codecs), ID(_unicode_escape_decode_stateful),
                              bytes, errors_obj));
  if (result_obj.isError()) {
    if (result_obj.isErrorNotFound()) {
      // Name the function that was actually looked up above.
      thread->raiseWithFmt(
          LayoutId::kSystemError,
          "could not call _codecs._unicode_escape_decode_stateful");
    }
    return nullptr;
  }
  Tuple result(&scope, *result_obj);
  Int first_invalid_index(&scope, result.at(2));
  word invalid_index = first_invalid_index.asWord();
  if (invalid_index > -1) {
    *first_invalid_escape = c_str + invalid_index;
  }
  return ApiHandle::newReference(runtime, result.at(0));
}
// Stub for the C-API function PyUnicode_EncodeCodePage. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_EncodeCodePage(int /* code_page */, PyObject* /* unicode */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_EncodeCodePage");
}
// Encodes `unicode` through _Py_EncodeLocaleEx and returns the result as a new
// bytes object.
//
// `errors` must be null, "strict" or "surrogateescape"; anything else raises
// ValueError. A string whose wide-character form contains an embedded NUL
// raises ValueError. An encoder status of -2 is reported through a
// UnicodeEncodeError handed to PyCodec_StrictErrors; any other non-zero status
// is reported with PyErr_NoMemory.
PY_EXPORT PyObject* PyUnicode_EncodeLocale(PyObject* unicode,
                                           const char* errors) {
  _Py_error_handler handler;
  const bool wants_strict =
      errors == nullptr || std::strcmp(errors, "strict") == 0;
  if (wants_strict) {
    handler = _Py_ERROR_STRICT;
  } else if (std::strcmp(errors, "surrogateescape") == 0) {
    handler = _Py_ERROR_SURROGATEESCAPE;
  } else {
    Thread::current()->raiseWithFmt(
        LayoutId::kValueError,
        "only 'strict' and 'surrogateescape' error handlers "
        "are supported, not '%s'",
        errors);
    return nullptr;
  }

  Py_ssize_t wide_len;
  wchar_t* wide = PyUnicode_AsWideCharString(unicode, &wide_len);
  if (wide == nullptr) {
    return nullptr;
  }
  const bool has_embedded_nul =
      std::wcslen(wide) != static_cast<size_t>(wide_len);
  if (has_embedded_nul) {
    Thread::current()->raiseWithFmt(LayoutId::kValueError,
                                    "embedded null character");
    PyMem_Free(wide);
    return nullptr;
  }

  char* encoded;
  size_t error_pos;
  const char* reason;
  const int status = _Py_EncodeLocaleEx(wide, &encoded, &error_pos, &reason,
                                        /*current_locale=*/1, handler);
  // The wide buffer is no longer needed regardless of the outcome.
  PyMem_Free(wide);
  if (status == 0) {
    PyObject* bytes = PyBytes_FromString(encoded);
    PyMem_RawFree(encoded);
    return bytes;
  }

  if (status != -2) {
    PyErr_NoMemory();
    return nullptr;
  }
  PyObject* exc =
      PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", "locale",
                            unicode, error_pos, error_pos + 1, reason);
  if (exc != nullptr) {
    PyCodec_StrictErrors(exc);
    Py_DECREF(exc);
  }
  return nullptr;
}
// Encodes the str `unicode` by calling _codecs.utf_16_encode with the error
// handler derived from `errors` and `byteorder` converted to an int object.
// Returns a new reference to element 0 of the resulting tuple, or nullptr on
// error. A non-str argument is rejected with raiseBadArgument.
PY_EXPORT PyObject* _PyUnicode_EncodeUTF16(PyObject* unicode,
                                           const char* errors, int byteorder) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object str(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*str)) {
    thread->raiseBadArgument();
    return nullptr;
  }
  Object errors_obj(&scope, symbolFromError(thread, errors));
  Object byteorder_obj(&scope, runtime->newInt(byteorder));
  Object tuple_obj(&scope,
                   thread->invokeFunction3(ID(_codecs), ID(utf_16_encode), str,
                                           errors_obj, byteorder_obj));
  if (tuple_obj.isError()) {
    // Match the other _codecs callers in this file: a failed function lookup
    // is reported as SystemError so nullptr is never returned without an
    // exception being raised.
    if (tuple_obj.isErrorNotFound()) {
      thread->raiseWithFmt(LayoutId::kSystemError,
                           "could not call _codecs.utf_16_encode");
    }
    return nullptr;
  }
  Tuple tuple(&scope, *tuple_obj);
  return ApiHandle::newReference(runtime, tuple.at(0));
}
// Py_UNICODE buffer variant: wraps the buffer in a temporary str object,
// encodes it with _PyUnicode_EncodeUTF16 and releases the temporary.
PY_EXPORT PyObject* PyUnicode_EncodeUTF16(const Py_UNICODE* unicode,
                                          Py_ssize_t size, const char* errors,
                                          int byteorder) {
  PyObject* temp_str = PyUnicode_FromUnicode(unicode, size);
  if (temp_str == nullptr) {
    return nullptr;
  }
  PyObject* encoded = _PyUnicode_EncodeUTF16(temp_str, errors, byteorder);
  Py_DECREF(temp_str);
  return encoded;
}
// Encodes the str `unicode` by calling _codecs.utf_32_encode with the error
// handler derived from `errors` and `byteorder` converted to an int object.
// Returns a new reference to element 0 of the resulting tuple, or nullptr on
// error. A non-str argument is rejected with raiseBadArgument.
PY_EXPORT PyObject* _PyUnicode_EncodeUTF32(PyObject* unicode,
                                           const char* errors, int byteorder) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object str(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*str)) {
    thread->raiseBadArgument();
    return nullptr;
  }
  Object errors_obj(&scope, symbolFromError(thread, errors));
  Object byteorder_obj(&scope, runtime->newInt(byteorder));
  Object tuple_obj(&scope,
                   thread->invokeFunction3(ID(_codecs), ID(utf_32_encode), str,
                                           errors_obj, byteorder_obj));
  if (tuple_obj.isError()) {
    // Match the other _codecs callers in this file: a failed function lookup
    // is reported as SystemError so nullptr is never returned without an
    // exception being raised.
    if (tuple_obj.isErrorNotFound()) {
      thread->raiseWithFmt(LayoutId::kSystemError,
                           "could not call _codecs.utf_32_encode");
    }
    return nullptr;
  }
  Tuple tuple(&scope, *tuple_obj);
  return ApiHandle::newReference(runtime, tuple.at(0));
}
// Py_UNICODE buffer variant: wraps the buffer in a temporary str object,
// encodes it with _PyUnicode_EncodeUTF32 and releases the temporary.
PY_EXPORT PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE* unicode,
                                          Py_ssize_t size, const char* errors,
                                          int byteorder) {
  PyObject* temp_str = PyUnicode_FromUnicode(unicode, size);
  if (temp_str == nullptr) {
    return nullptr;
  }
  PyObject* encoded = _PyUnicode_EncodeUTF32(temp_str, errors, byteorder);
  Py_DECREF(temp_str);
  return encoded;
}
// Converter that stores a bytes object for `arg` into `*addr`.
//
// A null `arg` is the cleanup call: the object stored at `*addr` is released,
// the slot is cleared and 1 is returned. Otherwise str and bytes instances are
// used as-is and any other object is passed to _io._fspath. A str path is
// turned into bytes by copying its underlying bytes. A result with an embedded
// NUL byte raises ValueError. Returns Py_CLEANUP_SUPPORTED on success and 0 on
// failure.
PY_EXPORT int PyUnicode_FSConverter(PyObject* arg, void* addr) {
  PyObject** out = reinterpret_cast<PyObject**>(addr);
  if (arg == nullptr) {
    Py_DECREF(*out);
    *out = nullptr;
    return 1;
  }

  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object arg_obj(&scope, ApiHandle::fromPyObject(arg)->asObject());
  Object path(&scope, *arg_obj);
  const bool is_str_or_bytes = runtime->isInstanceOfStr(*arg_obj) ||
                               runtime->isInstanceOfBytes(*arg_obj);
  if (!is_str_or_bytes) {
    path = thread->invokeFunction1(ID(_io), ID(_fspath), arg_obj);
    if (path.isErrorException()) {
      return 0;
    }
  }

  Object output(&scope, *path);
  if (!runtime->isInstanceOfBytes(*path)) {
    CHECK(std::strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0, "");
    CHECK(std::strcmp(Py_FileSystemDefaultEncodeErrors, "surrogatepass") == 0,
          "");
    // PyOS_FSPath/_io._fspath guarantee their returned value is bytes or str,
    // so this branch only sees str. With the encoding pinned to utf-8 and
    // surrogatepass by the CHECKs above, and since our strings are UTF-8 with
    // UTF-16 surrogates (WTF-8), the str-to-bytes conversion is a plain copy
    // of the underlying bytes.
    Str path_str(&scope, strUnderlying(*path));
    const word num_bytes = path_str.length();
    MutableBytes buffer(&scope,
                        runtime->newMutableBytesUninitialized(num_bytes));
    buffer.replaceFromWithStr(0, *path_str, num_bytes);
    output = buffer.becomeImmutable();
  }

  Bytes underlying(&scope, bytesUnderlying(*output));
  const bool has_nul = underlying.findByte('\0', /*start=*/0,
                                           /*length=*/underlying.length()) !=
                       -1;
  if (has_nul) {
    thread->raiseWithFmt(LayoutId::kValueError, "embedded null byte");
    return 0;
  }
  *out = ApiHandle::newReference(runtime, *output);
  return Py_CLEANUP_SUPPORTED;
}
// Converter that stores a str object for `arg` into `*addr`.
//
// A null `arg` is the cleanup call: the object stored at `*addr` is released,
// the slot is cleared and 1 is returned. Otherwise:
//   - objects supporting the buffer protocol are used directly, everything
//     else is first passed through PyOS_FSPath;
//   - a str path is used as-is;
//   - a bytes path (or a buffer, after a DeprecationWarning when it is not
//     bytes) is converted with PyBytes_FromObject and decoded with
//     PyUnicode_DecodeFSDefaultAndSize;
//   - anything else raises TypeError.
// A result containing an embedded NUL character raises ValueError. Returns
// Py_CLEANUP_SUPPORTED on success and 0 on failure.
PY_EXPORT int PyUnicode_FSDecoder(PyObject* arg, void* addr) {
  PyObject** out = reinterpret_cast<PyObject**>(addr);
  if (arg == nullptr) {
    Py_DECREF(*out);
    *out = nullptr;
    return 1;
  }

  bool is_buffer = PyObject_CheckBuffer(arg);
  PyObject* path;
  if (!is_buffer) {
    path = PyOS_FSPath(arg);
    if (path == nullptr) return 0;
  } else {
    path = arg;
    Py_INCREF(arg);
  }

  PyObject* output;
  if (PyUnicode_Check(path)) {
    output = path;
  } else if (PyBytes_Check(path) || is_buffer) {
    if (!PyBytes_Check(path) &&
        PyErr_WarnFormat(
            PyExc_DeprecationWarning, 1,
            "path should be string, bytes, or os.PathLike, not %.200s",
            PyObject_TypeName(arg))) {
      Py_DECREF(path);
      return 0;
    }
    PyObject* path_bytes = PyBytes_FromObject(path);
    Py_DECREF(path);
    if (!path_bytes) return 0;
    output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
                                              PyBytes_GET_SIZE(path_bytes));
    Py_DECREF(path_bytes);
    if (!output) return 0;
  } else {
    Thread::current()->raiseWithFmt(
        LayoutId::kTypeError,
        "path should be string, bytes, or os.PathLike, not %s",
        PyObject_TypeName(arg));
    Py_DECREF(path);
    return 0;
  }

  Thread* thread = Thread::current();
  HandleScope scope(thread);
  // `output` only passed PyUnicode_Check, so it may be an instance of a str
  // subclass; unwrap it like the rest of this file does before treating it as
  // a Str.
  Str output_str(&scope,
                 strUnderlying(ApiHandle::fromPyObject(output)->asObject()));
  if (strFindAsciiChar(output_str, '\0') >= 0) {
    thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
    Py_DECREF(output);
    return 0;
  }
  *out = output;
  return Py_CLEANUP_SUPPORTED;
}
// Searches `str` for `substr` within [start, end). A `direction` of 1 uses
// strFindWithRange; -1 uses strRFind. Returns the value those helpers
// produce, or -2 with TypeError raised when either argument is not a str
// instance.
PY_EXPORT Py_ssize_t PyUnicode_Find(PyObject* str, PyObject* substr,
                                    Py_ssize_t start, Py_ssize_t end,
                                    int direction) {
  DCHECK(str != nullptr, "str must be non-null");
  DCHECK(substr != nullptr, "substr must be non-null");
  DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  Object text_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object pattern_obj(&scope, ApiHandle::fromPyObject(substr)->asObject());
  // Both arguments are rejected with the same message and return value.
  const bool both_str = runtime->isInstanceOfStr(*text_obj) &&
                        runtime->isInstanceOfStr(*pattern_obj);
  if (!both_str) {
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "PyUnicode_Find requires a 'str' instance");
    return -2;
  }
  Str text(&scope, strUnderlying(*text_obj));
  Str pattern(&scope, strUnderlying(*pattern_obj));
  if (direction != 1) {
    return strRFind(text, pattern, start, end);
  }
  return strFindWithRange(text, pattern, start, end);
}
// Searches `str` for the code point `ch` within [start, end) by wrapping the
// code point in a one-character str. A `direction` of 1 uses
// strFindWithRange; -1 uses strRFind. The str-ness of `str` is only verified
// by a DCHECK.
PY_EXPORT Py_ssize_t PyUnicode_FindChar(PyObject* str, Py_UCS4 ch,
                                        Py_ssize_t start, Py_ssize_t end,
                                        int direction) {
  DCHECK(str != nullptr, "str must not be null");
  DCHECK(direction == 1 || direction == -1, "direction must be -1 or 1");
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  Object text_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  DCHECK(runtime->isInstanceOfStr(*text_obj),
         "PyUnicode_FindChar requires a 'str' instance");
  Str text(&scope, strUnderlying(*text_obj));
  Str single_char(&scope, SmallStr::fromCodePoint(ch));
  if (direction != 1) {
    return strRFind(text, single_char, start, end);
  }
  return strFindWithRange(text, single_char, start, end);
}
// Applies `format % args` via PyNumber_Remainder. A null argument is a bad
// internal call; a `format` that fails PyUnicode_Check raises TypeError.
PY_EXPORT PyObject* PyUnicode_Format(PyObject* format, PyObject* args) {
  const bool has_null_arg = format == nullptr || args == nullptr;
  if (has_null_arg) {
    PyErr_BadInternalCall();
    return nullptr;
  }
  if (PyUnicode_Check(format)) {
    return PyNumber_Remainder(format, args);
  }
  Thread::current()->raiseWithFmt(LayoutId::kTypeError, "must be str, not %s",
                                  _PyType_Name(Py_TYPE(format)));
  return nullptr;
}
// Stub for the C-API function PyUnicode_FromEncodedObject. Not implemented
// here: every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_FromEncodedObject(PyObject* /* obj */,
const char* /* encoding */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_FromEncodedObject");
}
// Variadic front end for PyUnicode_FromFormatV: packages the trailing
// arguments into a va_list and forwards them.
PY_EXPORT PyObject* PyUnicode_FromFormat(const char* format, ...) {
  va_list args;
  va_start(args, format);
  PyObject* formatted = PyUnicode_FromFormatV(format, args);
  va_end(args);
  return formatted;
}
// Builds a str from an ASCII `format` string and a va_list.
//
// The format is consumed in two kinds of pieces: a piece starting at '%' is
// handed to writeArg, and a run of literal text up to the next '%' or the end
// is written with _PyUnicodeWriter_WriteASCIIString. A byte above 127 in the
// literal text raises ValueError. On any failure the writer is deallocated
// and nullptr is returned.
PY_EXPORT PyObject* PyUnicode_FromFormatV(const char* format, va_list vargs) {
  _PyUnicodeWriter writer;
  _PyUnicodeWriter_Init(&writer);
  writer.min_length = std::strlen(format) + 100;
  writer.overallocate = 1;

  // This copy seems unnecessary but it may have been needed by CPython for
  // historical reasons.
  va_list args_copy;
  va_copy(args_copy, vargs);

  bool ok = true;
  const char* cursor = format;
  while (ok && *cursor != '\0') {
    if (*cursor == '%') {
      cursor = writeArg(&writer, cursor, &args_copy);
      ok = cursor != nullptr;
      continue;
    }

    // Find the end of the literal run, rejecting non-ASCII bytes.
    const char* run_end = cursor;
    do {
      const unsigned char ch = static_cast<unsigned char>(*run_end);
      if (ch > 127) {
        PyErr_Format(PyExc_ValueError,
                     "PyUnicode_FromFormatV() expects an ASCII-encoded format "
                     "string, got a non-ASCII byte: 0x%02x",
                     ch);
        ok = false;
        break;
      }
      run_end++;
    } while (*run_end != '\0' && *run_end != '%');
    if (!ok) break;

    // Overallocation is switched off once the run reaches the end of the
    // format string.
    if (*run_end == '\0') writer.overallocate = 0;
    const Py_ssize_t run_len = run_end - cursor;
    ok = _PyUnicodeWriter_WriteASCIIString(&writer, cursor, run_len) >= 0;
    cursor = run_end;
  }

  va_end(args_copy);
  if (!ok) {
    _PyUnicodeWriter_Dealloc(&writer);
    return nullptr;
  }
  return _PyUnicodeWriter_Finish(&writer);
}
// Stub for the C-API function PyUnicode_FromObject. Not implemented here:
// every call reaches the UNIMPLEMENTED macro.
PY_EXPORT PyObject* PyUnicode_FromObject(PyObject* /* obj */) {
UNIMPLEMENTED("PyUnicode_FromObject");
}
// Returns a new reference to the one-character str for code point `ordinal`.
// Values outside [0, kMaxUnicode] raise ValueError and return nullptr.
PY_EXPORT PyObject* PyUnicode_FromOrdinal(int ordinal) {
  Thread* thread = Thread::current();
  const bool in_range = ordinal >= 0 && ordinal <= kMaxUnicode;
  if (in_range) {
    return ApiHandle::newReference(thread->runtime(),
                                   SmallStr::fromCodePoint(ordinal));
  }
  thread->raiseWithFmt(LayoutId::kValueError,
                       "chr() arg not in range(0x110000)");
  return nullptr;
}
// Creates a str from a wchar_t buffer. A `size` of -1 selects
// newStrFromWideChar (no explicit length); any other size selects
// newStrFromWideCharWithLength. A null buffer with a non-zero size is a bad
// internal call. Returns nullptr when the helper reports an exception.
PY_EXPORT PyObject* PyUnicode_FromWideChar(const wchar_t* buffer,
                                           Py_ssize_t size) {
  Thread* thread = Thread::current();
  if (buffer == nullptr && size != 0) {
    thread->raiseBadInternalCall();
    return nullptr;
  }
  RawObject created = NoneType::object();
  if (size == -1) {
    created = newStrFromWideChar(thread, buffer);
  } else {
    created = newStrFromWideCharWithLength(thread, buffer, size);
  }
  if (created.isErrorException()) {
    return nullptr;
  }
  return ApiHandle::newReference(thread->runtime(), created);
}
// Returns the number of code points in `pyobj`. The argument is only checked
// to be a str instance by a DCHECK.
PY_EXPORT Py_ssize_t PyUnicode_GET_LENGTH_Func(PyObject* pyobj) {
  ApiHandle* handle = ApiHandle::fromPyObject(pyobj);
  RawObject obj = handle->asObjectNoImmediate();
  DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj),
         "non-str argument to PyUnicode_GET_LENGTH");
  const Py_ssize_t num_code_points = strUnderlying(obj).codePointLength();
  return num_code_points;
}
// Reports the default encoding name, which is the value of the global
// Py_FileSystemDefaultEncoding.
PY_EXPORT const char* PyUnicode_GetDefaultEncoding() {
  const char* encoding_name = Py_FileSystemDefaultEncoding;
  return encoding_name;
}
// Returns the number of code points in `pyobj`, or -1 after raiseBadArgument
// when `pyobj` is not a str instance.
PY_EXPORT Py_ssize_t PyUnicode_GetLength(PyObject* pyobj) {
  Thread* thread = Thread::current();
  RawObject obj = ApiHandle::fromPyObject(pyobj)->asObject();
  if (thread->runtime()->isInstanceOfStr(obj)) {
    return strUnderlying(obj).codePointLength();
  }
  thread->raiseBadArgument();
  return -1;
}
PY_EXPORT Py_ssize_t PyUnicode_GetSize(PyObject* pyobj) {
  // The contract of this function is to return the number of UTF-16 or UTF-32
  // code units, depending on the operating system's wchar_t size. On the
  // machines currently used for testing that equals the number of Unicode
  // code points, so the code point length is returned. This needs to change
  // once operating systems with a different wchar_t (e.g. Windows) are
  // supported.
  const Py_ssize_t num_code_points = PyUnicode_GetLength(pyobj);
  return num_code_points;
}
// Interns the NUL-terminated `c_str` via Runtime::internStrFromCStr and
// returns a new reference to the interned str.
PY_EXPORT PyObject* PyUnicode_InternFromString(const char* c_str) {
  DCHECK(c_str != nullptr, "c_str must not be nullptr");
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  RawObject interned = Runtime::internStrFromCStr(thread, c_str);
  return ApiHandle::newReference(runtime, interned);
}
// Stub for the C-API function PyUnicode_InternImmortal. Not implemented here:
// every call reaches the UNIMPLEMENTED macro.
PY_EXPORT void PyUnicode_InternImmortal(PyObject** /* p */) {
UNIMPLEMENTED("PyUnicode_InternImmortal");
}
// Replaces `*obj_ptr` with the interned version of the string it refers to.
// Nothing happens for a null pointee or for objects that are not LargeStr.
// When interning yields a different object, the old reference is released and
// a new reference to the interned object is stored.
PY_EXPORT void PyUnicode_InternInPlace(PyObject** obj_ptr) {
  PyObject* original = *obj_ptr;
  DCHECK(original != nullptr, "pobj should not be null");
  if (original == nullptr) {
    return;
  }
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object candidate(&scope, ApiHandle::fromPyObject(original)->asObject());
  if (!candidate.isLargeStr()) {
    return;
  }
  Object interned(&scope, Runtime::internStr(thread, candidate));
  const bool replaced = interned != candidate;
  if (!replaced) {
    return;
  }
  Py_DECREF(original);
  *obj_ptr = ApiHandle::newReference(thread->runtime(), *interned);
}
// Returns the value of str.isidentifier(str) as an int; the empty string is
// answered with 0 without making the call. An error result from the call
// trips a CHECK.
PY_EXPORT int PyUnicode_IsIdentifier(PyObject* str) {
  DCHECK(str != nullptr, "str must not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object candidate(&scope, ApiHandle::fromPyObject(str)->asObject());
  if (candidate == Str::empty()) {
    return 0;
  }
  Object verdict(&scope, thread->invokeMethodStatic1(
                             LayoutId::kStr, ID(isidentifier), candidate));
  DCHECK(!verdict.isErrorNotFound(), "could not call str.isidentifier");
  CHECK(!verdict.isError(), "this function should not error");
  return Bool::cast(*verdict).value();
}
// Joins the items of `seq` with the separator `sep` and returns a new
// reference to the result, or nullptr on error.
//
// `sep` must be a str instance (TypeError otherwise). strJoinWithTupleOrList
// is tried first; when it answers Unbound the generic str.join method is
// invoked instead.
PY_EXPORT PyObject* PyUnicode_Join(PyObject* sep, PyObject* seq) {
  DCHECK(sep != nullptr, "sep should not be null");
  DCHECK(seq != nullptr, "seq should not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object sep_obj(&scope, ApiHandle::fromPyObject(sep)->asObject());
  // An optimization to rule out non-str values here to use the further
  // optimization of `strJoinWithTupleOrList`.
  Runtime* runtime = thread->runtime();
  if (!runtime->isInstanceOfStr(*sep_obj)) {
    // The two literals are concatenated, so the space after the comma has to
    // be spelled out explicitly.
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "separator: expected str instance, "
                         "'%T' found",
                         &sep_obj);
    return nullptr;
  }
  Str sep_str(&scope, strUnderlying(*sep_obj));
  Object seq_obj(&scope, ApiHandle::fromPyObject(seq)->asObject());
  // An ad-hoc optimization for the case `seq_obj` is a `tuple` or `list`,
  // that can be removed without changing the correctness of PyUnicode_Join.
  Object result(&scope, strJoinWithTupleOrList(thread, sep_str, seq_obj));
  if (result.isUnbound()) {
    result =
        thread->invokeMethodStatic2(LayoutId::kStr, ID(join), sep_str, seq_obj);
  }
  if (result.isError()) {
    if (result.isErrorNotFound()) {
      thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.join");
    }
    return nullptr;
  }
  return ApiHandle::newReference(runtime, *result);
}
// Calls str.partition(str, sep) and returns a new reference to its result.
// Returns nullptr on error; TypeError is raised when the method could not be
// found.
PY_EXPORT PyObject* PyUnicode_Partition(PyObject* str, PyObject* sep) {
  DCHECK(str != nullptr, "str should not be null");
  DCHECK(sep != nullptr, "sep should not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object separator(&scope, ApiHandle::fromPyObject(sep)->asObject());
  Object parts(&scope, thread->invokeMethodStatic2(
                           LayoutId::kStr, ID(partition), text, separator));
  if (!parts.isError()) {
    return ApiHandle::newReference(thread->runtime(), *parts);
  }
  if (parts.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "could not call str.partition");
  }
  return nullptr;
}
// Calls str.rpartition(str, sep) and returns a new reference to its result.
// Returns nullptr on error; TypeError is raised when the method could not be
// found.
PY_EXPORT PyObject* PyUnicode_RPartition(PyObject* str, PyObject* sep) {
  DCHECK(str != nullptr, "str should not be null");
  DCHECK(sep != nullptr, "sep should not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object separator(&scope, ApiHandle::fromPyObject(sep)->asObject());
  Object parts(&scope, thread->invokeMethodStatic2(
                           LayoutId::kStr, ID(rpartition), text, separator));
  if (!parts.isError()) {
    return ApiHandle::newReference(thread->runtime(), *parts);
  }
  if (parts.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kTypeError,
                         "could not call str.rpartition");
  }
  return nullptr;
}
// Calls str.rsplit(str, sep, maxsplit) and returns a new reference to its
// result. Returns nullptr on error; TypeError is raised when the method could
// not be found.
PY_EXPORT PyObject* PyUnicode_RSplit(PyObject* str, PyObject* sep,
                                     Py_ssize_t maxsplit) {
  DCHECK(str != nullptr, "str must not be null");
  DCHECK(sep != nullptr, "sep must not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object separator(&scope, ApiHandle::fromPyObject(sep)->asObject());
  Runtime* runtime = thread->runtime();
  Object limit(&scope, runtime->newInt(maxsplit));
  Object pieces(&scope,
                thread->invokeMethodStatic3(LayoutId::kStr, ID(rsplit), text,
                                            separator, limit));
  if (!pieces.isError()) {
    return ApiHandle::newReference(runtime, *pieces);
  }
  if (pieces.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.rsplit");
  }
  return nullptr;
}
// Returns the code point at code-point position `index` of `obj`. A non-str
// argument triggers raiseBadArgument; a negative index, or one whose byte
// offset is not below the string's byte length, raises IndexError. Both error
// paths return -1 converted to Py_UCS4.
PY_EXPORT Py_UCS4 PyUnicode_ReadChar(PyObject* obj, Py_ssize_t index) {
  DCHECK(obj != nullptr, "obj must not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object str_obj(&scope, ApiHandle::fromPyObject(obj)->asObject());
  if (!runtime->isInstanceOfStr(*str_obj)) {
    thread->raiseBadArgument();
    return -1;
  }
  Str str(&scope, strUnderlying(*str_obj));
  // The byte offset is only computed for non-negative indices.
  word byte_offset = 0;
  bool in_range = index >= 0;
  if (in_range) {
    byte_offset = thread->strOffset(str, index);
    in_range = byte_offset < str.length();
  }
  if (!in_range) {
    thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
    return -1;
  }
  word unused_num_bytes;
  return str.codePointAt(byte_offset, &unused_num_bytes);
}
// Returns a new reference to the result of Runtime::strReplace applied to
// `str`, `substr`, `replstr` and `maxcount`. Each of the three objects must be
// a str instance; they are checked in argument order and the first offender
// raises TypeError.
PY_EXPORT PyObject* PyUnicode_Replace(PyObject* str, PyObject* substr,
                                      PyObject* replstr, Py_ssize_t maxcount) {
  DCHECK(str != nullptr, "str must not be null");
  DCHECK(substr != nullptr, "substr must not be null");
  DCHECK(replstr != nullptr, "replstr must not be null");
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);

  Object source_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  if (!runtime->isInstanceOfStr(*source_obj)) {
    thread->raiseWithFmt(LayoutId::kTypeError, "str must be str");
    return nullptr;
  }
  Object old_obj(&scope, ApiHandle::fromPyObject(substr)->asObject());
  if (!runtime->isInstanceOfStr(*old_obj)) {
    thread->raiseWithFmt(LayoutId::kTypeError, "substr must be str");
    return nullptr;
  }
  Object new_obj(&scope, ApiHandle::fromPyObject(replstr)->asObject());
  if (!runtime->isInstanceOfStr(*new_obj)) {
    thread->raiseWithFmt(LayoutId::kTypeError, "replstr must be str");
    return nullptr;
  }

  Str source(&scope, strUnderlying(*source_obj));
  Str old_str(&scope, strUnderlying(*old_obj));
  Str new_str(&scope, strUnderlying(*new_obj));
  return ApiHandle::newReference(
      runtime, runtime->strReplace(thread, source, old_str, new_str, maxcount));
}
// Stub for the C-API function PyUnicode_Resize. Not implemented here: every
// call reaches the UNIMPLEMENTED macro. Parameter names follow the CPython
// declaration.
PY_EXPORT int PyUnicode_Resize(PyObject** /* p_unicode */, Py_ssize_t /* length */) {
UNIMPLEMENTED("PyUnicode_Resize");
}
// Stub for the C-API function PyUnicode_RichCompare. Not implemented here:
// every call reaches the UNIMPLEMENTED macro. Parameter names follow the
// CPython declaration.
PY_EXPORT PyObject* PyUnicode_RichCompare(PyObject* /* left */, PyObject* /* right */,
int /* op */) {
UNIMPLEMENTED("PyUnicode_RichCompare");
}
// Calls str.split(str, sep, maxsplit) and returns a new reference to its
// result. Returns nullptr on error; TypeError is raised when the method could
// not be found.
PY_EXPORT PyObject* PyUnicode_Split(PyObject* str, PyObject* sep,
                                    Py_ssize_t maxsplit) {
  DCHECK(str != nullptr, "str must not be null");
  DCHECK(sep != nullptr, "sep must not be null");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object separator(&scope, ApiHandle::fromPyObject(sep)->asObject());
  Runtime* runtime = thread->runtime();
  Object limit(&scope, runtime->newInt(maxsplit));
  Object pieces(&scope,
                thread->invokeMethodStatic3(LayoutId::kStr, ID(split), text,
                                            separator, limit));
  if (!pieces.isError()) {
    return ApiHandle::newReference(runtime, *pieces);
  }
  if (pieces.isErrorNotFound()) {
    thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.split");
  }
  return nullptr;
}
// Returns a new reference to the result of strSplitlines for `str`, passing
// `keepends` through. A non-str argument raises TypeError.
PY_EXPORT PyObject* PyUnicode_Splitlines(PyObject* str, int keepends) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Runtime* runtime = thread->runtime();
  if (runtime->isInstanceOfStr(*text_obj)) {
    Str text(&scope, strUnderlying(*text_obj));
    return ApiHandle::newReference(runtime,
                                   strSplitlines(thread, text, keepends));
  }
  thread->raiseWithFmt(LayoutId::kTypeError, "must be str, not '%T'",
                       &text_obj);
  return nullptr;
}
// Returns the substring of `pyobj` between code-point positions `start` and
// `end`.
//
// Negative positions raise IndexError. An empty str is returned when
// `end <= start` or when `start` maps to the end of the string. When the
// requested range covers the whole string, `pyobj` itself is returned with an
// extra reference; otherwise a new str is produced by strSubstr.
PY_EXPORT PyObject* PyUnicode_Substring(PyObject* pyobj, Py_ssize_t start,
                                        Py_ssize_t end) {
  DCHECK(pyobj != nullptr, "null argument to PyUnicode_Substring");
  Thread* thread = Thread::current();
  if (!(start >= 0 && end >= 0)) {
    thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
    return nullptr;
  }
  Runtime* runtime = thread->runtime();
  if (end <= start) {
    return ApiHandle::newReference(runtime, Str::empty());
  }

  HandleScope scope(thread);
  ApiHandle* handle = ApiHandle::fromPyObject(pyobj);
  Object obj(&scope, handle->asObject());
  DCHECK(runtime->isInstanceOfStr(*obj),
         "PyUnicode_Substring requires a 'str' instance");
  Str self(&scope, strUnderlying(*obj));

  // Translate the code-point positions into byte offsets.
  const word byte_len = self.length();
  const word begin_offset = thread->strOffset(self, start);
  if (begin_offset == byte_len) {
    return ApiHandle::newReference(runtime, Str::empty());
  }
  const word end_offset = thread->strOffset(self, end);
  if (begin_offset == 0 && end_offset == byte_len) {
    handle->incref();
    return pyobj;
  }
  const word span = end_offset - begin_offset;
  return ApiHandle::newReference(runtime,
                                 strSubstr(thread, self, begin_offset, span));
}
// Tests whether `substr` matches `str` at one end of the code-point range
// [start, end), after that range has been adjusted by
// Slice::adjustSearchIndices. With `direction == 1` the comparison is anchored
// so that the match ends at `end`; otherwise it begins at `start`. Returns 1
// on a match, 0 on no match, and -1 after raiseBadArgument when either
// argument is not a str instance.
PY_EXPORT Py_ssize_t PyUnicode_Tailmatch(PyObject* str, PyObject* substr,
                                         Py_ssize_t start, Py_ssize_t end,
                                         int direction) {
  DCHECK(str != nullptr, "str must be non-null");
  DCHECK(substr != nullptr, "substr must be non-null");
  DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object text_obj(&scope, ApiHandle::fromPyObject(str)->asObject());
  Object pattern_obj(&scope, ApiHandle::fromPyObject(substr)->asObject());
  Runtime* runtime = thread->runtime();
  const bool both_str = runtime->isInstanceOfStr(*text_obj) &&
                        runtime->isInstanceOfStr(*pattern_obj);
  if (!both_str) {
    thread->raiseBadArgument();
    return -1;
  }
  Str text(&scope, strUnderlying(*text_obj));
  Str pattern(&scope, strUnderlying(*pattern_obj));

  const word text_code_points = text.codePointLength();
  Slice::adjustSearchIndices(&start, &end, text_code_points);
  const word pattern_code_points = pattern.codePointLength();
  if (start + pattern_code_points > end) {
    // The pattern has more code points than the search window.
    return 0;
  }

  // Byte offset in `text` at which the byte-wise comparison starts.
  word base_offset;
  if (direction == 1) {
    base_offset = text.offsetByCodePoints(0, end - pattern_code_points);
  } else {
    base_offset = text.offsetByCodePoints(0, start);
  }
  const word pattern_bytes = pattern.length();
  for (word k = 0; k < pattern_bytes; k++) {
    if (text.byteAt(base_offset + k) != pattern.byteAt(k)) {
      return 0;
    }
  }
  return 1;
}
// Stub for the C-API function PyUnicode_Translate. Not implemented here: every
// call reaches the UNIMPLEMENTED macro. Parameter names follow the CPython
// declaration.
PY_EXPORT PyObject* PyUnicode_Translate(PyObject* /* str */, PyObject* /* mapping */,
const char* /* errors */) {
UNIMPLEMENTED("PyUnicode_Translate");
}
// Returns the runtime's type registered under LayoutId::kStr, exposed to C
// callers as a PyTypeObject* obtained through ApiHandle::borrowedReference().
PY_EXPORT PyTypeObject* PyUnicode_Type_Ptr() {
  Runtime* runtime = Thread::current()->runtime();
  auto str_type_handle =
      ApiHandle::borrowedReference(runtime, runtime->typeAt(LayoutId::kStr));
  return reinterpret_cast<PyTypeObject*>(str_type_handle);
}
// Placeholder for the C-API entry point of the same name. No implementation
// exists here yet: the arguments are ignored and every call goes straight to
// UNIMPLEMENTED.
PY_EXPORT int PyUnicode_WriteChar(PyObject* /* e */, Py_ssize_t /* x */,
Py_UCS4 /* h */) {
UNIMPLEMENTED("PyUnicode_WriteChar");
}
// Placeholder for the C-API entry point of the same name. No implementation
// exists here yet: the argument is ignored and every call goes straight to
// UNIMPLEMENTED.
PY_EXPORT Py_UNICODE* PyUnicode_AsUnicode(PyObject* /* e */) {
UNIMPLEMENTED("PyUnicode_AsUnicode");
}
// Placeholder for the C-API entry point of the same name. No implementation
// exists here yet: both arguments are ignored (nothing is written through
// the size pointer) and every call goes straight to UNIMPLEMENTED.
PY_EXPORT Py_UNICODE* PyUnicode_AsUnicodeAndSize(PyObject* /* unicode */,
Py_ssize_t* /* size */) {
UNIMPLEMENTED("PyUnicode_AsUnicodeAndSize");
}
// Builds a new str from `size` code units of type T stored at `src` and
// returns it as a new reference. Each unit is appended as one code point.
//
// A single unit takes a shortcut through SmallStr::fromCodePoint(); longer
// inputs are accumulated in a StrArray first. `src` must be non-null.
template <typename T>
static PyObject* decodeUnicodeToString(Thread* thread, const void* src,
                                       word size) {
  DCHECK(src != nullptr, "Must pass in a non-null buffer");
  Runtime* runtime = thread->runtime();
  const T* units = static_cast<const T*>(src);
  if (size == 1) {
    return ApiHandle::newReference(runtime,
                                   SmallStr::fromCodePoint(units[0]));
  }

  HandleScope scope(thread);
  // TODO(T41785453): Remove the StrArray intermediary
  StrArray accumulator(&scope, runtime->newStrArray());
  runtime->strArrayEnsureCapacity(thread, accumulator, size);
  const T* const stop = units + size;
  for (const T* unit = units; unit != stop; ++unit) {
    runtime->strArrayAddCodePoint(thread, accumulator, *unit);
  }
  return ApiHandle::newReference(runtime,
                                 runtime->strFromStrArray(accumulator));
}
// Creates a str from `size` code units in `buffer`, whose unit width is
// selected by `kind` (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
// PyUnicode_4BYTE_KIND).
//
// Returns a new reference. A size of zero yields the empty string. Returns
// nullptr after raising ValueError for a negative size, or SystemError for an
// unrecognized kind.
PY_EXPORT PyObject* PyUnicode_FromKindAndData(int kind, const void* buffer,
                                              Py_ssize_t size) {
  Thread* thread = Thread::current();
  if (size <= 0) {
    if (size == 0) {
      return ApiHandle::newReference(thread->runtime(), Str::empty());
    }
    thread->raiseWithFmt(LayoutId::kValueError, "size must be positive");
    return nullptr;
  }

  if (kind == PyUnicode_1BYTE_KIND) {
    return decodeUnicodeToString<Py_UCS1>(thread, buffer, size);
  }
  if (kind == PyUnicode_2BYTE_KIND) {
    return decodeUnicodeToString<Py_UCS2>(thread, buffer, size);
  }
  if (kind == PyUnicode_4BYTE_KIND) {
    return decodeUnicodeToString<Py_UCS4>(thread, buffer, size);
  }

  thread->raiseWithFmt(LayoutId::kSystemError, "invalid kind");
  return nullptr;
}
// Creates a str from `size` wide characters starting at `code_units` by
// delegating to newStrFromWideCharWithLength().
//
// Returns a new reference, or nullptr when the helper reports an exception.
// A null `code_units` is not supported and ends in UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_FromUnicode(const Py_UNICODE* code_units,
                                          Py_ssize_t size) {
  if (code_units == nullptr) {
    // TODO(T36562134): Implement _PyUnicode_New
    UNIMPLEMENTED("_PyUnicode_New");
  }
  Thread* thread = Thread::current();
  RawObject decoded = newStrFromWideCharWithLength(thread, code_units, size);
  if (decoded.isErrorException()) {
    return nullptr;
  }
  return ApiHandle::newReference(thread->runtime(), decoded);
}
// Reports the storage kind of `obj`. Only ASCII strings are handled for now:
// the CHECK rejects any object for which PyUnicode_IS_ASCII_Func() returns 0,
// so the only value ever returned is PyUnicode_1BYTE_KIND.
PY_EXPORT int PyUnicode_KIND_Func(PyObject* obj) {
// TODO(T47682853): Introduce new PyUnicode_VARBYTE_KIND
CHECK(PyUnicode_IS_ASCII_Func(obj), "only ASCII allowed");
return PyUnicode_1BYTE_KIND;
}
// Returns a NUL-terminated copy of the bytes of `str`.
//
// NOTE: This will return a cached and managed C-string buffer that is a copy
// of the Str internal buffer. It is NOT a direct pointer into the string
// object, so writing into this buffer will do nothing. This is different
// behavior from CPython, where changing the data in the buffer changes the
// string object.
//
// The first call allocates the copy with std::malloc() and stores it in the
// handle's cache; later calls return that cached pointer. `str` must refer to
// a str instance.
PY_EXPORT void* PyUnicode_DATA_Func(PyObject* str) {
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  ApiHandle* handle = ApiHandle::fromPyObject(str);
  // Reuse the copy made by an earlier call, if there is one.
  if (void* cache = handle->cache(runtime)) {
    return cache;
  }
  HandleScope scope(thread);
  Object obj(&scope, handle->asObject());
  DCHECK(runtime->isInstanceOfStr(*obj), "str should be a str instance");
  Str str_obj(&scope, strUnderlying(*obj));
  word length = str_obj.length();
  // One extra byte for the terminating NUL.
  byte* result = static_cast<byte*>(std::malloc(length + 1));
  // std::malloc() may return null; fail the CHECK instead of writing through
  // a null pointer in copyTo() below.
  CHECK(result != nullptr, "out of memory");
  str_obj.copyTo(result, length);
  result[length] = '\0';
  handle->setCache(runtime, result);
  handle->setBorrowedNoImmediate();
  return result;
}
// Reads the code unit at position `index` from `data`, interpreting the
// buffer as an array of Py_UCS1, Py_UCS2 or Py_UCS4 elements according to
// `kind`. Any kind other than the 1- and 2-byte ones is treated as 4-byte
// (verified by a DCHECK).
PY_EXPORT Py_UCS4 PyUnicode_READ_Func(int kind, void* data, Py_ssize_t index) {
  switch (kind) {
    case PyUnicode_1BYTE_KIND:
      return static_cast<Py_UCS1*>(data)[index];
    case PyUnicode_2BYTE_KIND:
      return static_cast<Py_UCS2*>(data)[index];
    default:
      DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
      return static_cast<Py_UCS4*>(data)[index];
  }
}
// Returns the code point found at position `index` of the str `obj`.
// The index is converted to an offset with thread->strOffset(); when that
// offset equals the string's length, 0 is returned instead of reading.
PY_EXPORT Py_UCS4 PyUnicode_READ_CHAR_Func(PyObject* obj, Py_ssize_t index) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object str_obj(&scope, ApiHandle::fromPyObject(obj)->asObject());
  DCHECK(thread->runtime()->isInstanceOfStr(*str_obj),
         "PyUnicode_READ_CHAR must receive a unicode object");
  Str text(&scope, strUnderlying(*str_obj));
  word offset = thread->strOffset(text, index);
  if (offset != text.length()) {
    word width;  // filled in by codePointAt(); not needed here
    return static_cast<Py_UCS4>(text.codePointAt(offset, &width));
  }
  return Py_UCS4{0};
}
// Returns 1 when the str underlying `obj` reports isASCII(), 0 otherwise.
// `obj` must refer to a str instance.
PY_EXPORT int PyUnicode_IS_ASCII_Func(PyObject* obj) {
  Thread* thread = Thread::current();
  HandleScope scope(thread);
  Object str(&scope, ApiHandle::fromPyObject(obj)->asObject());
  DCHECK(thread->runtime()->isInstanceOfStr(*str),
         "strIsASCII must receive a unicode object");
  if (strUnderlying(*str).isASCII()) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isAlpha() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isAlpha().
PY_EXPORT int Py_UNICODE_ISALPHA_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isAlpha(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isDecimal() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isDecimal().
PY_EXPORT int Py_UNICODE_ISDECIMAL_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isDecimal(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isDigit() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isDigit().
PY_EXPORT int Py_UNICODE_ISDIGIT_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isDigit(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isLinebreak() accepts `code_point`, 0 otherwise.
// Values above kMaxUnicode yield 0 without calling Unicode::isLinebreak().
PY_EXPORT int Py_UNICODE_ISLINEBREAK_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isLinebreak(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isLower() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isLower().
PY_EXPORT int Py_UNICODE_ISLOWER_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isLower(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isNumeric() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isNumeric().
PY_EXPORT int Py_UNICODE_ISNUMERIC_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isNumeric(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isPrintable() accepts `code_point`, 0 otherwise.
// Values above kMaxUnicode yield 0 without calling Unicode::isPrintable().
PY_EXPORT int Py_UNICODE_ISPRINTABLE_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isPrintable(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isSpace() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isSpace().
PY_EXPORT int Py_UNICODE_ISSPACE_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isSpace(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isTitle() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isTitle().
PY_EXPORT int Py_UNICODE_ISTITLE_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isTitle(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns 1 if Unicode::isUpper() accepts `code_point`, 0 otherwise. Values
// above kMaxUnicode yield 0 without calling Unicode::isUpper().
PY_EXPORT int Py_UNICODE_ISUPPER_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode &&
      Unicode::isUpper(static_cast<int32_t>(code_point))) {
    return 1;
  }
  return 0;
}
// Returns the result of Unicode::toDecimal() for `code_point`, or -1 when
// the value lies above kMaxUnicode.
PY_EXPORT int Py_UNICODE_TODECIMAL_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    return Unicode::toDecimal(static_cast<int32_t>(code_point));
  }
  return -1;
}
// Returns the result of Unicode::toDigit() for `code_point`, or -1 when the
// value lies above kMaxUnicode.
PY_EXPORT int Py_UNICODE_TODIGIT_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    return Unicode::toDigit(static_cast<int32_t>(code_point));
  }
  return -1;
}
// Returns the first code point of the FullCasing produced by
// Unicode::toLower() for `code_point`. Values above kMaxUnicode are returned
// unchanged.
PY_EXPORT Py_UCS4 Py_UNICODE_TOLOWER_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    FullCasing casing = Unicode::toLower(static_cast<int32_t>(code_point));
    return casing.code_points[0];
  }
  return code_point;
}
// Returns the result of Unicode::toNumeric() for `code_point`, or -1.0 when
// the value lies above kMaxUnicode.
PY_EXPORT double Py_UNICODE_TONUMERIC_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    return Unicode::toNumeric(static_cast<int32_t>(code_point));
  }
  return -1.0;
}
// Returns the first code point of the FullCasing produced by
// Unicode::toTitle() for `code_point`. Values above kMaxUnicode are returned
// unchanged.
PY_EXPORT Py_UCS4 Py_UNICODE_TOTITLE_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    FullCasing casing = Unicode::toTitle(static_cast<int32_t>(code_point));
    return casing.code_points[0];
  }
  return code_point;
}
// Returns the first code point of the FullCasing produced by
// Unicode::toUpper() for `code_point`. Values above kMaxUnicode are returned
// unchanged.
PY_EXPORT Py_UCS4 Py_UNICODE_TOUPPER_Func(Py_UCS4 code_point) {
  if (code_point <= kMaxUnicode) {
    FullCasing casing = Unicode::toUpper(static_cast<int32_t>(code_point));
    return casing.code_points[0];
  }
  return code_point;
}
// Writes a normalized form of the NUL-terminated name `encoding` into the
// buffer `lower`, which has room for `lower_len` chars including the
// terminating NUL.
//
// Characters accepted by Py_ISALNUM() and '.' are copied after passing
// through Py_TOLOWER(). Every run of other characters between two copied
// characters becomes a single '_'; such runs at the very start or end of the
// name are dropped.
//
// Returns 1 on success. Returns 0 when the normalized name does not fit in
// `lower` (including `lower_len == 0`); the buffer contents are then
// unspecified and not NUL-terminated.
PY_EXPORT int _Py_normalize_encoding(const char* encoding, char* lower,
                                     size_t lower_len) {
  DCHECK(encoding != nullptr, "encoding must be non-null");
  if (lower_len == 0) {
    // There is no room even for the terminator, and `lower_len - 1` below
    // would wrap around and point far outside the buffer.
    return 0;
  }
  char* buffer = lower;
  // Last writable slot; reserved for the terminating NUL.
  const char* lower_end = &lower[lower_len - 1];
  bool has_punct = false;
  for (char ch = *encoding; ch != '\0'; ch = *++encoding) {
    if (Py_ISALNUM(ch) || ch == '.') {
      // Emit the pending separator, unless nothing has been written yet.
      if (has_punct && buffer != lower) {
        if (buffer == lower_end) {
          return 0;
        }
        *buffer++ = '_';
      }
      has_punct = false;
      if (buffer == lower_end) {
        return 0;
      }
      *buffer++ = Py_TOLOWER(ch);
    } else {
      // Remember the separator; it is only written if more text follows.
      has_punct = true;
    }
  }
  *buffer = '\0';
  return 1;
}
// Encodes the str `unicode` as UTF-8 and returns the resulting object as a
// new reference.
//
// Strings for which strHasSurrogate() is false are copied directly into a
// new bytes object. Otherwise the work is handed to _codecs.utf_8_encode
// together with the error handler named by `errors`, and the first element
// of the returned tuple is the result.
//
// Returns nullptr when `unicode` is not a str instance (after
// raiseBadArgument()) or when the codec call returns an error.
PY_EXPORT PyObject* _PyUnicode_AsUTF8String(PyObject* unicode,
                                            const char* errors) {
  DCHECK(unicode != nullptr, "unicode cannot be null");
  Thread* thread = Thread::current();
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  Object obj(&scope, ApiHandle::fromPyObject(unicode)->asObject());
  if (!runtime->isInstanceOfStr(*obj)) {
    thread->raiseBadArgument();
    return nullptr;
  }

  Str str(&scope, strUnderlying(*obj));
  if (strHasSurrogate(str)) {
    // Surrogates need the error handler, so go through the codec.
    Object handler(&scope, symbolFromError(thread, errors));
    Object encoded(&scope,
                   thread->invokeFunction2(ID(_codecs), ID(utf_8_encode), str,
                                           handler));
    if (encoded.isError()) {
      return nullptr;
    }
    Tuple pair(&scope, *encoded);
    return ApiHandle::newReference(runtime, pair.at(0));
  }

  // No surrogates: the string's bytes can be copied as they are.
  word num_bytes = str.length();
  MutableBytes bytes(&scope, runtime->newMutableBytesUninitialized(num_bytes));
  bytes.replaceFromWithStr(0, *str, num_bytes);
  return ApiHandle::newReference(runtime, bytes.becomeImmutable());
}
// Converts the first `size` bytes of `c_str` into a newly allocated,
// NUL-terminated wide character string obtained from PyMem_RawMalloc().
//
// Only 7-bit input is handled: any byte with the high bit set ends in
// UNIMPLEMENTED.
//
// On success returns the new string and, if `wlen` is non-null, stores the
// number of wide characters (excluding the terminator) in *wlen. On
// allocation failure returns nullptr and, if `wlen` is non-null, stores
// (size_t)-1 in *wlen.
PY_EXPORT wchar_t* _Py_DecodeUTF8_surrogateescape(const char* c_str,
                                                  Py_ssize_t size,
                                                  size_t* wlen) {
  DCHECK(c_str != nullptr, "c_str cannot be null");
  wchar_t* wc_str =
      static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(wchar_t)));
  if (wc_str == nullptr) {
    // Report the failure instead of writing through a null pointer below.
    if (wlen != nullptr) {
      *wlen = static_cast<size_t>(-1);
    }
    return nullptr;
  }
  for (Py_ssize_t i = 0; i < size; i++) {
    char ch = c_str[i];
    // TODO(T57811636): Support UTF-8 arguments on macOS.
    // We don't have UTF-8 decoding machinery that is decoupled from the
    // runtime
    if (ch & 0x80) {
      UNIMPLEMENTED("UTF-8 argument support unimplemented");
    }
    wc_str[i] = static_cast<wchar_t>(ch);
  }
  wc_str[size] = '\0';
  if (wlen != nullptr) {
    *wlen = size;
  }
  return wc_str;
}
// Converts the first `size` bytes of `c_str` into a newly allocated,
// NUL-terminated wide character string obtained from PyMem_RawMalloc() and
// stores it in *result.
//
// Only bytes up to kMaxASCII are handled; anything larger ends in
// UNIMPLEMENTED. The `reason` and error handler arguments are unused.
//
// Returns 0 on success, storing the character count (excluding the
// terminator) in *wlen when `wlen` is non-null. Returns -1 if the allocation
// fails, in which case *result and *wlen are left untouched.
PY_EXPORT int _Py_DecodeUTF8Ex(const char* c_str, Py_ssize_t size,
                               wchar_t** result, size_t* wlen,
                               const char** /* reason */,
                               _Py_error_handler /* surrogateescape */) {
  void* storage = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
  if (storage == nullptr) {
    return -1;
  }
  wchar_t* wide = static_cast<wchar_t*>(storage);
  for (Py_ssize_t pos = 0; pos < size; pos++) {
    byte unit = c_str[pos];
    // TODO(T57811636): Support UTF-8 decoding decoupled from the runtime.
    // We don't have UTF-8 decoding machinery that is decoupled from the
    // runtime
    if (unit > kMaxASCII) {
      UNIMPLEMENTED("UTF-8 argument support unimplemented");
    }
    wide[pos] = unit;
  }
  wide[size] = '\0';
  *result = wide;
  if (wlen != nullptr) {
    *wlen = size;
  }
  return 0;
}
// UTF-8 encoder for NUL-terminated wide character strings, with optional
// surrogateescape / surrogatepass handling selected by `errors`.
//
// On success, return 0 and write the newly allocated, NUL-terminated
// character string into *str. The buffer comes from PyMem_RawMalloc() when
// `raw_malloc` is non-zero (release it with PyMem_RawFree()) and from
// PyMem_Malloc() otherwise (release it with PyMem_Free()).
//
// On encoding failure, return -2 and write the position of the invalid
// surrogate character into *error_pos (if error_pos is set) and the encoding
// error message into *reason (if reason is set).
//
// On memory allocation failure, or if the input is too long for the
// worst-case buffer size to be computed, return -1.
//
// If `errors` is not one of _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or
// _Py_ERROR_SURROGATEPASS, return -3.
PY_EXPORT int _Py_EncodeUTF8Ex(const wchar_t* text, char** str,
                               size_t* error_pos, const char** reason,
                               int raw_malloc, _Py_error_handler errors) {
  const Py_ssize_t max_char_size = 4;
  Py_ssize_t len = std::wcslen(text);
  DCHECK(len >= 0, "len must be non-negative");

  if (errors != _Py_ERROR_STRICT && errors != _Py_ERROR_SURROGATEESCAPE &&
      errors != _Py_ERROR_SURROGATEPASS) {
    return -3;
  }
  const bool surrogateescape = errors == _Py_ERROR_SURROGATEESCAPE;
  const bool surrogatepass = errors == _Py_ERROR_SURROGATEPASS;

  // Guard the worst-case size computation below against overflow.
  if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
    return -1;
  }

  // Worst case: every character takes max_char_size bytes, plus terminator.
  const size_t capacity = (len + 1) * max_char_size;
  void* storage =
      raw_malloc ? PyMem_RawMalloc(capacity) : PyMem_Malloc(capacity);
  if (storage == nullptr) {
    return -1;
  }
  char* bytes = static_cast<char*>(storage);

  char* out = bytes;
  for (Py_ssize_t pos = 0; pos < len; pos++) {
    const Py_UCS4 ch = text[pos];

    if (ch < 0x80) {
      // Encode ASCII
      *out++ = static_cast<char>(ch);
      continue;
    }

    if (ch < 0x0800) {
      // Encode Latin-1
      *out++ = static_cast<char>(0xc0 | (ch >> 6));
      *out++ = static_cast<char>(0x80 | (ch & 0x3f));
      continue;
    }

    if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
      // surrogateescape error handler: only U+DC80..U+DCFF can be turned
      // back into a single byte.
      const bool escapable = surrogateescape && 0xDC80 <= ch && ch <= 0xDCFF;
      if (!escapable) {
        if (error_pos != nullptr) {
          *error_pos = static_cast<size_t>(pos);
        }
        if (reason != nullptr) {
          *reason = "encoding error";
        }
        if (raw_malloc) {
          PyMem_RawFree(bytes);
        } else {
          PyMem_Free(bytes);
        }
        return -2;
      }
      *out++ = static_cast<char>(ch & 0xff);
      continue;
    }

    if (ch < 0x10000) {
      // Three-byte sequence
      *out++ = static_cast<char>(0xe0 | (ch >> 12));
      *out++ = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
      *out++ = static_cast<char>(0x80 | (ch & 0x3f));
      continue;
    }

    // ch >= 0x10000
    DCHECK(ch <= kMaxUnicode, "ch must be a valid unicode code point");
    // Encode UCS4 Unicode ordinals
    *out++ = static_cast<char>(0xf0 | (ch >> 18));
    *out++ = static_cast<char>(0x80 | ((ch >> 12) & 0x3f));
    *out++ = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
    *out++ = static_cast<char>(0x80 | (ch & 0x3f));
  }
  *out++ = '\0';

  // Shrink the worst-case buffer down to what was actually written.
  const size_t final_size = out - bytes;
  void* shrunk = raw_malloc ? PyMem_RawRealloc(bytes, final_size)
                            : PyMem_Realloc(bytes, final_size);
  if (shrunk == nullptr) {
    if (error_pos != nullptr) {
      *error_pos = static_cast<size_t>(-1);
    }
    if (raw_malloc) {
      PyMem_RawFree(bytes);
    } else {
      PyMem_Free(bytes);
    }
    return -1;
  }
  *str = static_cast<char*>(shrunk);
  return 0;
}
} // namespace py