5#ifndef INCLUDE_GUARD_ARENE_BASE_ARENE_BASE_STRINGS_UTF_UTF8_VALIDATION_HPP_
6#define INCLUDE_GUARD_ARENE_BASE_ARENE_BASE_STRINGS_UTF_UTF8_VALIDATION_HPP_
9#include "arene/base/byte/byte.hpp"
10#include "arene/base/contracts/contract.hpp"
11#include "arene/base/stdlib_choice/cstdint.hpp"
12#include "arene/base/strings/string_view.hpp"
20namespace utf8_validation_detail {
// Byte-value thresholds used by the comparisons in check_initial_byte and
// check_next_byte. NOTE(review): the values match the well-formed UTF-8 byte
// ranges (RFC 3629); confirm against the original header's own documentation.
//
// 0x80: lower bound of the range check applied to every byte consumed outside
// the initial state; also the lowest threshold tested for an initial byte.
25 static constexpr byte min_continuation_byte{to_byte(std::uint8_t{0X80})};
// 0xBF: upper bound of that same range check.
29 static constexpr byte max_continuation_byte{to_byte(std::uint8_t{0XBF})};
// 0xC2: initial bytes at or above this (and below 0xE0) expect exactly one more byte.
33 static constexpr byte two_byte_initial_smallest{to_byte(std::uint8_t{0XC2})};
// 0xE0: initial byte that selects the three_byte_second_low state.
36 static constexpr byte three_byte_min_initial_byte{to_byte(std::uint8_t{0XE0})};
// 0xE1: initial bytes at or above this (other than 0xED, and below 0xF0) select
// the three_byte_second_high state.
40 static constexpr byte three_byte_min_initial_not_overlong{to_byte(std::uint8_t{0XE1})};
// 0xED: initial byte that selects the three_byte_second_surrogate state.
44 static constexpr byte three_byte_initial_surrogate_byte{to_byte(std::uint8_t{0XED})};
// 0xA0: threshold the second byte is compared against (value < 0xA0) in the
// three_byte_second_low state.
48 static constexpr byte three_byte_min_valid_second_not_overlong_byte{to_byte(std::uint8_t{0XA0})};
// 0xA0: threshold the second byte is compared against (value >= 0xA0) in the
// three_byte_second_surrogate state. Same numeric value as the constant above.
52 static constexpr byte three_byte_min_second_surrogate_byte{to_byte(std::uint8_t{0XA0})};
// 0xF0: initial bytes at or above this are handled as four-byte leads. The exact
// value 0xF0 is caught earlier by the equality test on four_byte_initial_low_marker.
55 static constexpr byte four_byte_initial_marker{to_byte(std::uint8_t{0XF0})};
// 0xF0: initial byte that selects the four_byte_second_low state.
57 static constexpr byte four_byte_initial_low_marker{to_byte(std::uint8_t{0XF0})};
// 0xF4: initial byte that selects the four_byte_second_high state.
59 static constexpr byte four_byte_initial_high_byte{to_byte(std::uint8_t{0XF4})};
// 0xF4: initial bytes strictly above this take the first branch of
// check_initial_byte. Same numeric value as four_byte_initial_high_byte.
61 static constexpr byte maximum_initial_byte{to_byte(std::uint8_t{0XF4})};
// 0x90: threshold for the second byte of a four-byte sequence; compared with >=
// in the four_byte_second_high state and with < in the four_byte_second_low state.
64 static constexpr byte four_byte_low_min_valid_second_byte{to_byte(std::uint8_t{0X90})};
// States of the validator; check_next_byte dispatches on the current one.
// NOTE(review): this listing shows only four enumerators and no closing brace,
// yet initial_byte, final_byte, four_byte_second_low, four_byte_second and
// four_byte_third are referenced elsewhere in this file. Lines appear to be
// missing from the listing here; restore them from the original header.
68 enum class state_type : std::uint8_t {
// Second byte expected after a three-byte lead at or above 0xE1 (other than 0xED).
71 three_byte_second_high,
// Second byte expected after the 0xED lead.
72 three_byte_second_surrogate,
// Second byte expected after the 0xE0 lead.
73 three_byte_second_low,
// Second byte expected after the 0xF4 lead.
76 four_byte_second_high,
// Current state of the validator; a fresh validator starts in initial_byte.
81 state_type state_{state_type::initial_byte};
/// Classify `value` as the first byte of a sequence and set state_ to record
/// which kind of byte must follow. Defined later in this file.
/// NOTE(review): the return statements are not visible in this listing;
/// presumably the result is true when the byte is accepted. Confirm.
89 constexpr auto check_initial_byte(byte
const value)
noexcept ->
bool;
/// Feed one byte to the validator. In the initial_byte state the byte is passed
/// to check_initial_byte; in any other state it is range-checked against
/// [min_continuation_byte, max_continuation_byte] and then handled per state.
/// Defined later in this file.
/// NOTE(review): the return statements are not visible in this listing;
/// presumably the result is true when the byte is accepted. Confirm.
98 constexpr auto check_next_byte(byte
const value)
noexcept ->
bool;
/// Report whether the validator is currently in the initial_byte state, which
/// is the state it starts in and returns to after a final_byte is consumed.
/// is_valid_utf8 uses this as its result once every byte has been accepted, so
/// input that stops partway through a multi-byte sequence yields false.
109 constexpr auto is_valid_end()
const noexcept ->
bool {
return state_ == state_type::initial_byte; }
// Classifies the first byte of a sequence by testing it against the threshold
// constants from highest to lowest, and records in state_ what the next byte
// must be.
// NOTE(review): this listing omits lines (the embedded numbers jump 122 -> 125
// and 139 -> 152), so the bodies of the first and last branches, every return
// statement and the closing braces are not visible. Restore them from the
// original header before relying on this text.
121constexpr auto validator::check_initial_byte(byte
const value)
noexcept ->
bool {
// Bytes above 0xF4: branch body not shown (presumably rejects the byte).
122 if (value > maximum_initial_byte) {
// Exactly 0xF4: the second byte is checked in the four_byte_second_high state.
125 if (value == four_byte_initial_high_byte) {
126 state_ = state_type::four_byte_second_high;
127 }
// Exactly 0xF0: the second byte is checked in the four_byte_second_low state.
else if (value == four_byte_initial_low_marker) {
128 state_ = state_type::four_byte_second_low;
129 }
// Remaining values at or above 0xF0 (0xF0 and 0xF4 were handled above).
else if (value >= four_byte_initial_marker) {
130 state_ = state_type::four_byte_second;
131 }
// Exactly 0xED: tested before the general >= 0xE1 case so it gets its own state.
else if (value == three_byte_initial_surrogate_byte) {
132 state_ = state_type::three_byte_second_surrogate;
133 }
// Remaining values at or above 0xE1.
else if (value >= three_byte_min_initial_not_overlong) {
134 state_ = state_type::three_byte_second_high;
135 }
// Only 0xE0 reaches this branch, since everything at or above 0xE1 was handled.
else if (value >= three_byte_min_initial_byte) {
136 state_ = state_type::three_byte_second_low;
137 }
// 0xC2 and above (below 0xE0): exactly one more byte is expected.
else if (value >= two_byte_initial_smallest) {
138 state_ = state_type::final_byte;
139 }
// 0x80 and above (below 0xC2): branch body not shown (presumably rejects the byte).
else if (value >= min_continuation_byte) {
// Consumes one byte and advances the state machine according to state_.
// NOTE(review): this listing omits lines (the embedded numbers skip 154-164 and
// several others), so the switch statement itself, the bodies of the `if`
// checks, the return statements, any fallthrough markers and the closing
// braces are not visible. The comments below describe only what is shown.
152constexpr auto validator::check_next_byte(byte
const value)
noexcept ->
bool {
// Outside the initial state, a byte outside [0x80, 0xBF] takes this branch
// (body not shown; presumably rejects the byte).
153 if ((state_ != state_type::initial_byte) && ((value < min_continuation_byte) || (value > max_continuation_byte))) {
// Start of a new sequence: hand the byte to check_initial_byte.
165 case state_type::initial_byte:
166 return check_initial_byte(value);
// After a 0xE0 lead: a second byte below 0xA0 takes the `if` branch (body not
// shown); the visible assignment then expects one final byte.
167 case state_type::three_byte_second_low:
168 if (value < three_byte_min_valid_second_not_overlong_byte) {
171 state_ = state_type::final_byte;
// After a 0xED lead: a second byte at or above 0xA0 takes the `if` branch (body
// not shown). How control continues from here is not visible in this listing.
173 case state_type::three_byte_second_surrogate:
174 if (value >= three_byte_min_second_surrogate_byte) {
// These two states share one action: one final byte is expected next.
178 case state_type::three_byte_second_high:
179 case state_type::four_byte_third:
180 state_ = state_type::final_byte;
// After a 0xF4 lead: a second byte at or above 0x90 takes the `if` branch (body
// not shown); the visible assignment then expects the third byte.
182 case state_type::four_byte_second_high:
183 if (value >= four_byte_low_min_valid_second_byte) {
186 state_ = state_type::four_byte_third;
// After a 0xF0 lead: a second byte below 0x90 takes the `if` branch (body not
// shown). How control continues from here is not visible in this listing.
188 case state_type::four_byte_second_low:
189 if (value < four_byte_low_min_valid_second_byte) {
// No state-specific check is visible for this state: the third byte is expected next.
193 case state_type::four_byte_second:
194 state_ = state_type::four_byte_third;
// Last byte of a sequence consumed: go back to expecting an initial byte.
196 case state_type::final_byte:
197 state_ = state_type::initial_byte;
// Contract macro invoked with a diagnostic message; presumably reached only
// when state_ holds a value no case handles.
200 ARENE_INVARIANT_UNREACHABLE(
"Corrupt UTF-8 validator");
// Body of the string-level check. NOTE(review): the function signature is not
// visible in this listing; `str` is declared outside the shown lines.
// A fresh validator starts in the initial_byte state.
224 utf8_validation_detail::validator validator;
// Feed every character of `str` to the validator, one byte at a time.
226 for (
char const chr : str) {
// Each char is converted to unsigned char before to_byte. When
// check_next_byte returns false the branch below is taken (body not shown;
// presumably returns false).
227 if (!validator.check_next_byte(to_byte(
static_cast<
unsigned char>(chr)))) {
// All bytes accepted: the result is whether the validator is back in the
// initial_byte state.
232 return validator.is_valid_end();
Definition array_exceptions_disabled.cpp:11
constexpr auto is_valid_utf8(string_view str) noexcept -> bool
Check if a string is valid UTF-8.
Definition utf8_validation.hpp:223
Copyright 2026, Toyota Motor Corporation.
Definition array_exceptions_disabled.cpp:10