Skip to content

Commit

Permalink
Adds support for wchar_t/wchar_t*/std::wstring{_view} arguments to St…
Browse files Browse the repository at this point in the history
…rFormat().

This converts to UTF-8 regardless of locale.

PiperOrigin-RevId: 588186076
Change-Id: I2c9598279b413d460e13ad65da2ba421c0b40b83
  • Loading branch information
Abseil Team authored and copybara-github committed Dec 5, 2023
1 parent 3e6ecec commit 5dc2cc1
Show file tree
Hide file tree
Showing 14 changed files with 608 additions and 107 deletions.
12 changes: 12 additions & 0 deletions absl/strings/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,10 @@ cc_library(
linkopts = ABSL_DEFAULT_LINKOPTS,
deps = [
":str_format_internal",
":string_view",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/types:span",
],
)

Expand Down Expand Up @@ -1277,6 +1281,7 @@ cc_library(
":strings",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/container:fixed_array",
"//absl/container:inlined_vector",
"//absl/functional:function_ref",
"//absl/meta:type_traits",
Expand Down Expand Up @@ -1330,6 +1335,7 @@ cc_test(
deps = [
":str_format",
":str_format_internal",
"//absl/base:config",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
Expand Down Expand Up @@ -1366,12 +1372,16 @@ cc_test(
copts = ABSL_TEST_COPTS,
visibility = ["//visibility:private"],
deps = [
":str_format",
":str_format_internal",
":strings",
"//absl/base:config",
"//absl/base:core_headers",
"//absl/base:raw_logging_internal",
"//absl/log",
"//absl/numeric:int128",
"//absl/types:optional",
"//absl/types:span",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
],
Expand All @@ -1397,6 +1407,8 @@ cc_test(
visibility = ["//visibility:private"],
deps = [
":str_format_internal",
":string_view",
"//absl/base:config",
"//absl/base:core_headers",
"@com_google_googletest//:gtest",
"@com_google_googletest//:gtest_main",
Expand Down
17 changes: 14 additions & 3 deletions absl/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,11 @@ absl_cc_library(
COPTS
${ABSL_DEFAULT_COPTS}
DEPS
absl::config
absl::core_headers
absl::span
absl::str_format_internal
absl::string_view
PUBLIC
)

Expand Down Expand Up @@ -501,6 +505,7 @@ absl_cc_library(
absl::strings
absl::config
absl::core_headers
absl::fixed_array
absl::inlined_vector
absl::numeric_representation
absl::type_traits
Expand Down Expand Up @@ -548,6 +553,7 @@ absl_cc_test(
COPTS
${ABSL_TEST_COPTS}
DEPS
absl::config
absl::str_format
absl::str_format_internal
GTest::gmock_main
Expand Down Expand Up @@ -585,12 +591,15 @@ absl_cc_test(
COPTS
${ABSL_TEST_COPTS}
DEPS
absl::strings
absl::str_format_internal
absl::config
absl::core_headers
absl::int128
absl::log
absl::raw_logging_internal
absl::int128
absl::span
absl::str_format
absl::str_format_internal
absl::strings
GTest::gmock_main
)

Expand All @@ -616,6 +625,8 @@ absl_cc_test(
${ABSL_TEST_COPTS}
DEPS
absl::str_format_internal
absl::string_view
absl::config
absl::core_headers
GTest::gmock_main
)
Expand Down
153 changes: 145 additions & 8 deletions absl/strings/internal/str_format/arg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,28 @@
//
#include "absl/strings/internal/str_format/arg.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <string>
#include <type_traits>

#include "absl/base/port.h"
#include "absl/base/config.h"
#include "absl/base/optimization.h"
#include "absl/container/fixed_array.h"
#include "absl/numeric/int128.h"
#include "absl/strings/internal/str_format/extension.h"
#include "absl/strings/internal/str_format/float_conversion.h"
#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"

#if defined(ABSL_HAVE_STD_STRING_VIEW)
#include <string_view>
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
Expand Down Expand Up @@ -298,6 +311,83 @@ inline bool ConvertStringArg(string_view v, const FormatConversionSpecImpl conv,
conv.has_left_flag());
}

// Conversion state carried between successive WideToUtf8() calls so that a
// UTF-16 surrogate pair can be encoded across two calls.
struct ShiftState {
  // True once a high surrogate has been consumed and its matching low
  // surrogate has not been seen yet.
  bool saw_high_surrogate = false;
  // The two low-order bits of the pending high surrogate. They belong in the
  // third UTF-8 byte, which is only written when the low surrogate arrives.
  uint8_t bits = 0;
};

// Converts `wc` from UTF-16 or UTF-32 to UTF-8 and writes to `buf`. `buf` is
// assumed to have enough space for the output (at most 4 bytes per call). `s`
// is used to carry state between successive calls with a UTF-16 surrogate
// pair: the high surrogate writes the first two bytes of the four-byte
// sequence and the low surrogate writes the last two.
//
// Returns the number of chars written, or `static_cast<size_t>(-1)` on
// failure. Failure cases are:
//   * a value that is neither a valid scalar value nor a surrogate,
//   * a low surrogate that was not preceded by a high surrogate,
//   * anything other than a low surrogate following a high surrogate (in that
//     case `s` is reset to its initial state).
//
// This is basically std::wcrtomb(), but always outputting UTF-8 instead of
// respecting the current locale.
inline size_t WideToUtf8(wchar_t wc, char *buf, ShiftState &s) {
  const auto v = static_cast<uint32_t>(wc);

  // Unsigned subtraction wraps, so this is true only for 0xdc00..0xdfff.
  const bool is_low_surrogate = (v - 0xdc00) < 0x400;
  if (s.saw_high_surrogate && !is_low_surrogate) {
    // Half of a four-byte sequence has already been emitted. Encoding any
    // other unit now would produce ill-formed UTF-8, so report failure.
    s.saw_high_surrogate = false;
    s.bits = 0;
    return static_cast<size_t>(-1);
  }

  if (v < 0x80) {
    *buf = static_cast<char>(v);
    return 1;
  } else if (v < 0x800) {
    *buf++ = static_cast<char>(0xc0 | (v >> 6));
    *buf = static_cast<char>(0x80 | (v & 0x3f));
    return 2;
  } else if (v < 0xd800 || (v - 0xe000) < 0x2000) {
    // Three-byte range, excluding the surrogate block 0xd800..0xdfff.
    *buf++ = static_cast<char>(0xe0 | (v >> 12));
    *buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
    *buf = static_cast<char>(0x80 | (v & 0x3f));
    return 3;
  } else if ((v - 0x10000) < 0x100000) {
    // 0x10000..0x10ffff supplied directly (UTF-32 input).
    *buf++ = static_cast<char>(0xf0 | (v >> 18));
    *buf++ = static_cast<char>(0x80 | ((v >> 12) & 0x3f));
    *buf++ = static_cast<char>(0x80 | ((v >> 6) & 0x3f));
    *buf = static_cast<char>(0x80 | (v & 0x3f));
    return 4;
  } else if (v < 0xdc00) {
    // High surrogate: emit the first two bytes now and remember the two bits
    // that belong in the third byte.
    s.saw_high_surrogate = true;
    s.bits = static_cast<uint8_t>(v & 0x3);
    // The +1 accounts for the 0x10000 offset of the supplementary planes.
    const uint8_t high_bits = ((v >> 6) & 0xf) + 1;
    *buf++ = static_cast<char>(0xf0 | (high_bits >> 2));
    *buf =
        static_cast<char>(0x80 | static_cast<uint8_t>((high_bits & 0x3) << 4) |
                          static_cast<uint8_t>((v >> 2) & 0xf));
    return 2;
  } else if (v < 0xe000 && s.saw_high_surrogate) {
    // Low surrogate completing a pair: emit the last two bytes.
    *buf++ = static_cast<char>(0x80 | static_cast<uint8_t>(s.bits << 4) |
                               static_cast<uint8_t>((v >> 6) & 0xf));
    *buf = static_cast<char>(0x80 | (v & 0x3f));
    s.saw_high_surrogate = false;
    s.bits = 0;
    return 2;
  } else {
    // Unpaired low surrogate or a value above 0x10ffff.
    return static_cast<size_t>(-1);
  }
}

inline bool ConvertStringArg(const wchar_t *v,
size_t len,
const FormatConversionSpecImpl conv,
FormatSinkImpl *sink) {
FixedArray<char> mb(len * 4);
ShiftState s;
size_t chars_written = 0;
for (size_t i = 0; i < len; ++i) {
const size_t chars = WideToUtf8(v[i], &mb[chars_written], s);
if (chars == static_cast<size_t>(-1)) { return false; }
chars_written += chars;
}
return ConvertStringArg(string_view(mb.data(), chars_written), conv, sink);
}

bool ConvertWCharTImpl(wchar_t v, const FormatConversionSpecImpl conv,
FormatSinkImpl *sink) {
char mb[4];
ShiftState s;
const size_t chars_written = WideToUtf8(v, mb, s);
return chars_written != static_cast<size_t>(-1) && !s.saw_high_surrogate &&
ConvertStringArg(string_view(mb, chars_written), conv, sink);
}

} // namespace

bool ConvertBoolArg(bool v, FormatSinkImpl *sink) {
Expand All @@ -316,11 +406,14 @@ bool ConvertIntArg(T v, FormatConversionSpecImpl conv, FormatSinkImpl *sink) {

// This odd casting is due to a bug in -Wswitch behavior in gcc49 which causes
// it to complain about a switch/case type mismatch, even though both are
// FormatConverionChar. Likely this is because at this point
// FormatConversionChar. Likely this is because at this point
// FormatConversionChar is declared, but not defined.
switch (static_cast<uint8_t>(conv.conversion_char())) {
case static_cast<uint8_t>(FormatConversionCharInternal::c):
return ConvertCharImpl(static_cast<char>(v), conv, sink);
return (std::is_same<T, wchar_t>::value ||
(conv.length_mod() == LengthMod::l))
? ConvertWCharTImpl(static_cast<wchar_t>(v), conv, sink)
: ConvertCharImpl(static_cast<char>(v), conv, sink);

case static_cast<uint8_t>(FormatConversionCharInternal::o):
as_digits.PrintAsOct(static_cast<U>(v));
Expand Down Expand Up @@ -372,6 +465,8 @@ template bool ConvertIntArg<signed char>(signed char v,
// Explicit instantiation definitions of the ConvertIntArg template, one per
// argument type.
template bool ConvertIntArg<unsigned char>(unsigned char v,
FormatConversionSpecImpl conv,
FormatSinkImpl *sink);
// wchar_t instantiation; FormatConvertImpl(wchar_t, ...) in this file calls
// ConvertIntArg with a wchar_t argument.
template bool ConvertIntArg<wchar_t>(wchar_t v, FormatConversionSpecImpl conv,
FormatSinkImpl *sink);
template bool ConvertIntArg<short>(short v, // NOLINT
FormatConversionSpecImpl conv,
FormatSinkImpl *sink);
Expand Down Expand Up @@ -403,16 +498,29 @@ StringConvertResult FormatConvertImpl(const std::string &v,
return {ConvertStringArg(v, conv, sink)};
}

// Formats a std::wstring argument by passing its character data and length to
// the wide-character ConvertStringArg() overload.
StringConvertResult FormatConvertImpl(const std::wstring &v,
                                      const FormatConversionSpecImpl conv,
                                      FormatSinkImpl *sink) {
  const wchar_t *const chars = v.data();
  const size_t num_chars = v.size();
  const bool converted = ConvertStringArg(chars, num_chars, conv, sink);
  return {converted};
}

// Formats a string_view argument by delegating to ConvertStringArg().
StringConvertResult FormatConvertImpl(string_view v,
                                      const FormatConversionSpecImpl conv,
                                      FormatSinkImpl *sink) {
  const bool converted = ConvertStringArg(v, conv, sink);
  return {converted};
}

ArgConvertResult<FormatConversionCharSetUnion(
FormatConversionCharSetInternal::s, FormatConversionCharSetInternal::p)>
FormatConvertImpl(const char *v, const FormatConversionSpecImpl conv,
FormatSinkImpl *sink) {
#if defined(ABSL_HAVE_STD_STRING_VIEW)
// Formats a std::wstring_view argument by passing its character data and
// length to the wide-character ConvertStringArg() overload.
StringConvertResult FormatConvertImpl(std::wstring_view v,
                                      const FormatConversionSpecImpl conv,
                                      FormatSinkImpl* sink) {
  const wchar_t* const chars = v.data();
  const size_t num_chars = v.size();
  const bool converted = ConvertStringArg(chars, num_chars, conv, sink);
  return {converted};
}
#endif

StringPtrConvertResult FormatConvertImpl(const char* v,
const FormatConversionSpecImpl conv,
FormatSinkImpl* sink) {
if (conv.conversion_char() == FormatConversionCharInternal::p)
return {FormatConvertImpl(VoidPtr(v), conv, sink).value};
size_t len;
Expand All @@ -427,6 +535,30 @@ FormatConvertImpl(const char *v, const FormatConversionSpecImpl conv,
return {ConvertStringArg(string_view(v, len), conv, sink)};
}

// Formats a wide C string argument.
//
// With the `p` conversion the pointer value itself is formatted through the
// VoidPtr overload. Otherwise the string length is determined first: a null
// pointer is treated as an empty string, a negative precision means the
// string is measured with std::wcslen(), and a non-negative precision limits
// the search for the NUL terminator to that many wide characters.
StringPtrConvertResult FormatConvertImpl(const wchar_t* v,
                                         const FormatConversionSpecImpl conv,
                                         FormatSinkImpl* sink) {
  const bool wants_pointer =
      conv.conversion_char() == FormatConversionCharInternal::p;
  if (wants_pointer) {
    return {FormatConvertImpl(VoidPtr(v), conv, sink).value};
  }

  size_t len = 0;
  if (v != nullptr) {
    if (conv.precision() < 0) {
      len = std::wcslen(v);
    } else {
      // Never read past `precision` characters; the string may not be
      // NUL-terminated within that range.
      const wchar_t* const limit = v + conv.precision();
      const wchar_t* const terminator = std::find(v, limit, L'\0');
      len = static_cast<size_t>(terminator - v);
    }
  }
  return {ConvertStringArg(v, len, conv, sink)};
}

// Formats a literal `nullptr` argument exactly as a null `const char*` would
// be formatted.
StringPtrConvertResult FormatConvertImpl(std::nullptr_t,
                                         const FormatConversionSpecImpl conv,
                                         FormatSinkImpl* sink) {
  const char* const null_string = nullptr;
  return FormatConvertImpl(null_string, conv, sink);
}

// ==================== Raw pointers ====================
ArgConvertResult<FormatConversionCharSetInternal::p> FormatConvertImpl(
VoidPtr v, const FormatConversionSpecImpl conv, FormatSinkImpl *sink) {
Expand Down Expand Up @@ -461,6 +593,11 @@ CharConvertResult FormatConvertImpl(char v, const FormatConversionSpecImpl conv,
FormatSinkImpl *sink) {
return {ConvertIntArg(v, conv, sink)};
}
// Formats a wchar_t argument by delegating to ConvertIntArg().
CharConvertResult FormatConvertImpl(wchar_t v,
                                    const FormatConversionSpecImpl conv,
                                    FormatSinkImpl* sink) {
  const bool converted = ConvertIntArg(v, conv, sink);
  return {converted};
}

// ==================== Ints ====================
IntegralConvertResult FormatConvertImpl(signed char v,
Expand Down
Loading

0 comments on commit 5dc2cc1

Please sign in to comment.