Arene Base
Fundamental Utilities For Safety Critical C++
Loading...
Searching...
No Matches
utf8_validation.hpp
Go to the documentation of this file.
1// Copyright 2024, Toyota Motor Corporation
2//
3// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
5#ifndef INCLUDE_GUARD_ARENE_BASE_ARENE_BASE_STRINGS_UTF_UTF8_VALIDATION_HPP_
6#define INCLUDE_GUARD_ARENE_BASE_ARENE_BASE_STRINGS_UTF_UTF8_VALIDATION_HPP_
7
8// parasoft-begin-suppress AUTOSAR-A16_2_2-a-2 "Arene Base aggregate headers permitted by A16-2-2 Permit #1"
9#include "arene/base/byte/byte.hpp"
10#include "arene/base/contracts/contract.hpp"
11#include "arene/base/stdlib_choice/cstdint.hpp"
12#include "arene/base/strings/string_view.hpp"
13// parasoft-end-suppress AUTOSAR-A16_2_2-a-2
14
15// parasoft-begin-suppress AUTOSAR-A7_1_5-a-2 "Trailing return syntax permitted by A7-1-5 Permit #1 v1.0.0"
16
17namespace arene {
18namespace base {
19
20namespace utf8_validation_detail {
21/// @brief Class to implement a state machine for UTF-8 validation
22class validator {
23 // parasoft-begin-suppress AUTOSAR-A5_1_1-a "False positive: these literals constants are used"
24 /// @brief The minimum value of a UTF-8 continuation byte
25 static constexpr byte min_continuation_byte{to_byte(std::uint8_t{0X80})};
26
27 // parasoft-begin-suppress AUTOSAR-M0_1_3-c "False positive: 'max_continuation_byte' is used"
28 /// @brief The maximum value of a UTF-8 continuation byte
29 static constexpr byte max_continuation_byte{to_byte(std::uint8_t{0XBF})};
30 // parasoft-end-suppress AUTOSAR-M0_1_3-c
31
32 /// @brief The minimum value of a UTF-8 lead byte indicating a valid 2-byte sequence
33 static constexpr byte two_byte_initial_smallest{to_byte(std::uint8_t{0XC2})};
34
35 /// @brief The minimum value of a UTF-8 lead byte indicating a 3-byte sequence
36 static constexpr byte three_byte_min_initial_byte{to_byte(std::uint8_t{0XE0})};
37
38 /// @brief The minimum value of a UTF-8 lead byte indicating a 3-byte sequence high enough to not be an over-long
39 /// encoding
40 static constexpr byte three_byte_min_initial_not_overlong{to_byte(std::uint8_t{0XE1})};
41
42 /// @brief The minimum value of a UTF-8 lead byte indicating a 3-byte sequence that could be the start of an encoding
43 /// of a UTF-16 surrogate pair (which is invalid UTF-8)
44 static constexpr byte three_byte_initial_surrogate_byte{to_byte(std::uint8_t{0XED})};
45
46 /// @brief The minimum value of the second byte of a 3-byte UTF-8 encoding starting with 0XE0 for it not to be an
47 /// over-long encoding
48 static constexpr byte three_byte_min_valid_second_not_overlong_byte{to_byte(std::uint8_t{0XA0})};
49
50 /// @brief The minimum valid of the second byte of a 3-byte UTF-8 encoding starting with 0XED that would indicate this
51 /// is part of an encoding of a UTF-16 surrogate pair (which is invalid UTF-8)
52 static constexpr byte three_byte_min_second_surrogate_byte{to_byte(std::uint8_t{0XA0})};
53
54 /// @brief The minumum value of a UTF-8 lead byte indicating a 4-byte sequence
55 static constexpr byte four_byte_initial_marker{to_byte(std::uint8_t{0XF0})};
56 /// @brief The minumum value of a UTF-8 lead byte indicating a 4-byte sequence
57 static constexpr byte four_byte_initial_low_marker{to_byte(std::uint8_t{0XF0})};
58 /// @brief The highest valid value of a UTF-8 lead byte for a 4-byte sequence
59 static constexpr byte four_byte_initial_high_byte{to_byte(std::uint8_t{0XF4})};
60 /// @brief The highest valid value of a UTF-8 lead byte
61 static constexpr byte maximum_initial_byte{to_byte(std::uint8_t{0XF4})};
62 /// @brief The minimum valid value of the second byte of a UTF-8 4-byte sequence starting with 0XF0 for it not to be
63 /// the start of an over-long encoding
64 static constexpr byte four_byte_low_min_valid_second_byte{to_byte(std::uint8_t{0X90})};
65 // parasoft-end-suppress AUTOSAR-A5_1_1-a
66
67 /// @brief The internal states of the state machine
68 enum class state_type : std::uint8_t {
69 initial_byte,
70 final_byte,
71 three_byte_second_high,
72 three_byte_second_surrogate,
73 three_byte_second_low,
74 four_byte_second,
75 four_byte_second_low,
76 four_byte_second_high,
77 four_byte_third,
78 };
79
80 /// @brief The current state of the state machine
81 state_type state_{state_type::initial_byte};
82
83 /// @brief Check the supplied byte for a valid initial byte in an encoded sequence, and update the state machine
84 /// accordingly.
85 /// @param value The byte to check
86 /// @return true if @c value is valid as an initial byte
87 /// @return false otherwise.
88 // NOLINTNEXTLINE(readability-avoid-const-params-in-decls) : AUTOSAR M3-9-1 requires this to match
89 constexpr auto check_initial_byte(byte const value) noexcept -> bool;
90
91 public:
92 /// @brief Check the supplied byte to see if it is the valid next byte in the UTF-8 encoded sequence being checked,
93 /// and update the state machine accordingly.
94 /// @param value The byte to check
95 /// @return true if @c value is valid as the next byte
96 /// @return false otherwise.
97 // NOLINTNEXTLINE(readability-avoid-const-params-in-decls) : AUTOSAR M3-9-1 requires this to match
98 constexpr auto check_next_byte(byte const value) noexcept -> bool;
99
100 // parasoft-begin-suppress AUTOSAR-A8_4_2-a "False positive: does return a value"
101 // parasoft-begin-suppress CERT_C-MSC37-a "False positive: does return a value"
102 // parasoft-begin-suppress CERT_CPP-MSC52-a "False positive: does return a value"
103 // parasoft-begin-suppress AUTOSAR-M9_3_3-a "False positive: cannot be 'static', uses non-static data member"
104 /// @brief Check if it's OK to end the UTF-8 encoded string after the current
105 /// set of checked bytes.
106 /// @return @c true if all bytes processed so far are valid, and the sequence
107 /// of validated bytes is not part way through a multi-byte UTF-8 encoded
108 /// charpoint, @c false otherwise
109 constexpr auto is_valid_end() const noexcept -> bool { return state_ == state_type::initial_byte; }
110 // parasoft-end-suppress AUTOSAR-M9_3_3-a
111 // parasoft-end-suppress CERT_CPP-MSC52-a
112 // parasoft-end-suppress CERT_C-MSC37-a
113 // parasoft-end-suppress AUTOSAR-A8_4_2-a
114};
115
116/// @brief Check the supplied byte for a valid initial byte in an encoded sequence, and update the state machine
117/// accordingly.
118/// @param value The byte to check
119/// @return true if @c value is valid as an initial byte
120/// @return false otherwise.
121constexpr auto validator::check_initial_byte(byte const value) noexcept -> bool {
122 if (value > maximum_initial_byte) {
123 return false;
124 }
125 if (value == four_byte_initial_high_byte) {
126 state_ = state_type::four_byte_second_high;
127 } else if (value == four_byte_initial_low_marker) {
128 state_ = state_type::four_byte_second_low;
129 } else if (value >= four_byte_initial_marker) {
130 state_ = state_type::four_byte_second;
131 } else if (value == three_byte_initial_surrogate_byte) {
132 state_ = state_type::three_byte_second_surrogate;
133 } else if (value >= three_byte_min_initial_not_overlong) {
134 state_ = state_type::three_byte_second_high;
135 } else if (value >= three_byte_min_initial_byte) {
136 state_ = state_type::three_byte_second_low;
137 } else if (value >= two_byte_initial_smallest) {
138 state_ = state_type::final_byte;
139 } else if (value >= min_continuation_byte) {
140 return false;
141 } else {
142 ; // no-op
143 }
144 return true;
145}
146
147/// @brief Check the supplied byte to see if it is the valid next byte in the UTF-8 encoded sequence being checked,
148/// and update the state machine accordingly.
149/// @param value The byte to check
150/// @return true if @c value is valid as the next byte
151/// @return false otherwise.
152constexpr auto validator::check_next_byte(byte const value) noexcept -> bool {
153 if ((state_ != state_type::initial_byte) && ((value < min_continuation_byte) || (value > max_continuation_byte))) {
154 return false;
155 }
156
157 // parasoft-begin-suppress AUTOSAR-M0_1_1-g "False positive: no statement or expression is placed outside case and
158 // default body"
159 //
160 // parasoft-begin-suppress AUTOSAR-M6_4_3-d "False positive: there is a case clause in switch statement"
161 // parasoft-begin-suppress AUTOSAR-A6_4_1-a "False positive: there are more than two case labels"
162 // parasoft-begin-suppress AUTOSAR-M6_4_3-a-2 "All branches terminated as permitted by M6-4-3 Permit #1"
163 // parasoft-begin-suppress AUTOSAR-M6_4_5-a-2 "All branches terminated as permitted by M6-4-5 Permit #1"
164 switch (state_) {
165 case state_type::initial_byte:
166 return check_initial_byte(value);
167 case state_type::three_byte_second_low:
168 if (value < three_byte_min_valid_second_not_overlong_byte) {
169 return false;
170 }
171 state_ = state_type::final_byte;
172 break;
173 case state_type::three_byte_second_surrogate:
174 if (value >= three_byte_min_second_surrogate_byte) {
175 return false;
176 }
177 // fall through
178 case state_type::three_byte_second_high:
179 case state_type::four_byte_third:
180 state_ = state_type::final_byte;
181 break;
182 case state_type::four_byte_second_high:
183 if (value >= four_byte_low_min_valid_second_byte) {
184 return false;
185 }
186 state_ = state_type::four_byte_third;
187 break;
188 case state_type::four_byte_second_low:
189 if (value < four_byte_low_min_valid_second_byte) {
190 return false;
191 }
192 // fall through
193 case state_type::four_byte_second:
194 state_ = state_type::four_byte_third;
195 break;
196 case state_type::final_byte:
197 state_ = state_type::initial_byte;
198 break;
199 default:
200 ARENE_INVARIANT_UNREACHABLE("Corrupt UTF-8 validator");
201 }
202 // parasoft-end-suppress AUTOSAR-M6_4_5-a-2
203 // parasoft-end-suppress AUTOSAR-M6_4_3-a-2
204 // parasoft-end-suppress AUTOSAR-A6_4_1-a
205 // parasoft-end-suppress AUTOSAR-M6_4_3-d
206 // parasoft-end-suppress AUTOSAR-M0_1_1-g
207 return true;
208}
209} // namespace utf8_validation_detail
210
211// parasoft-begin-suppress AUTOSAR-A8_4_2-a "False positive: does return a value"
212// parasoft-begin-suppress CERT_C-MSC37-a "False positive: does return a value"
213// parasoft-begin-suppress CERT_CPP-MSC52-a "False positive: does return a value"
214/// @brief Check if a string is valid UTF-8.
215///
216/// This rejects any incomplete encoding sequences, any overlong encodings (e.g. 7 bit encoded in 4 bytes), any
217/// encodings of UTF-16 surrogate pair code points, and any encoding that would decode to a codepoint beyond the max
218/// Unicode codepoint of 0X10FFFF.
219///
220/// @param str The string to check.
221/// @return true if the string is valid UTF-8
222/// @return false otherwise
223inline constexpr auto is_valid_utf8(string_view str) noexcept -> bool {
224 utf8_validation_detail::validator validator;
225 // parasoft-begin-suppress AUTOSAR-A3_9_1-b "False positive: This represents a character not a number"
226 for (char const chr : str) {
227 if (!validator.check_next_byte(to_byte(static_cast<unsigned char>(chr)))) {
228 return false;
229 }
230 }
231 // parasoft-end-suppress AUTOSAR-A3_9_1-b
232 return validator.is_valid_end();
233}
234// parasoft-end-suppress AUTOSAR-A8_4_2-a
235// parasoft-end-suppress CERT_C-MSC37-a
236// parasoft-end-suppress CERT_CPP-MSC52-a
237
238} // namespace base
239} // namespace arene
240
241#endif // INCLUDE_GUARD_ARENE_BASE_ARENE_BASE_STRINGS_UTF_UTF8_VALIDATION_HPP_
Definition array_exceptions_disabled.cpp:11
constexpr auto is_valid_utf8(string_view str) noexcept -> bool
Check if a string is valid UTF-8.
Definition utf8_validation.hpp:223
Copyright 2026, Toyota Motor Corporation.
Definition array_exceptions_disabled.cpp:10