Unicode defines properties for each code point, including a general category, and a technical report lists regular-expression classifications (alpha, graph, etc.). The Unicode print classification includes tabs, whereas std::isprint (using the C locale) does not. print includes letters, marks, numbers, punctuation, symbols, spaces, and formatting code points. Formatting code points do not include CR or LF, but do include code points that affect the appearance of adjacent characters. I believe this is exactly what you wanted (except for the tab); the specification was carefully designed to support these character properties.
Most classification functions, such as std::isprint, can only be given one scalar value at a time, so UTF32 is the obvious choice of encoding. Unfortunately, there is no guarantee that your system supports a UTF32 locale, and wchar_t is not guaranteed to have the 21 bits needed to store every Unicode code point. So I would consider using boost::spirit::char_encoding::unicode for classification, if you can. It has an internal table of all Unicode categories and implements the classifications listed in the regular-expression technical report. It appears to use the older Unicode 5.2 database, but the C++ code used to generate the tables is provided and can be applied to newer files.
The multibyte UTF8 sequence must still be converted to individual code points (UTF32), and you specifically mentioned the ability to skip past invalid UTF8 sequences. Since I am a C++ programmer, I decided to spam your screen unnecessarily and implement a constexpr UTF8->UTF32 function:
#include <cstdint> #include <iomanip> #include <iostream> #include <iterator> #include <boost/range/iterator_range.hpp> #include <boost/spirit/home/support/char_encoding/unicode.hpp> namespace { struct multi_byte_info { std::uint8_t id_mask; std::uint8_t id_matcher; std::uint8_t data_mask; }; constexpr const std::uint8_t multi_byte_id_mask = 0xC0; constexpr const std::uint8_t multi_byte_id_matcher = 0x80; constexpr const std::uint8_t multi_byte_data_mask = 0x3F; constexpr const std::uint8_t multi_byte_bits = 6; constexpr const multi_byte_info multi_byte_infos[] = { // skip 1 byte info {0xE0, 0xC0, 0x1F}, {0xF0, 0xE0, 0x0F}, {0xF8, 0xF0, 0x07}}; constexpr const unsigned max_length = (sizeof(multi_byte_infos) / sizeof(multi_byte_info)); constexpr const std::uint32_t overlong[] = {0x80, 0x800, 0x10000}; constexpr const std::uint32_t max_code_point = 0x10FFFF; } enum class extraction : std::uint8_t { success, failure }; struct extraction_attempt { std::uint32_t code_point; std::uint8_t bytes_processed; extraction status; }; template <typename Iterator> constexpr extraction_attempt next_code_point(Iterator position, const Iterator &end) { static_assert( std::is_same<typename std::iterator_traits<Iterator>::iterator_category, std::random_access_iterator_tag>{}, "bad iterator type"); extraction_attempt result{0, 0, extraction::failure}; if (end - position) { result.code_point = std::uint8_t(*position); ++position; ++result.bytes_processed; if (0x7F < result.code_point) { unsigned expected_length = 1; for (const auto info : multi_byte_infos) { if ((result.code_point & info.id_mask) == info.id_matcher) { result.code_point &= info.data_mask; break; } ++expected_length; } if (max_length < expected_length || (end - position) < expected_length) { return result; } for (unsigned byte = 0; byte < expected_length; ++byte) { const std::uint8_t next_byte = *(position + byte); if ((next_byte & multi_byte_id_mask) != multi_byte_id_matcher) { return result; } result.code_point <<= 
multi_byte_bits; result.code_point |= (next_byte & multi_byte_data_mask); ++result.bytes_processed; } if (max_code_point < result.code_point) { return result; } if (overlong[expected_length - 1] > result.code_point) { return result; } } result.status = extraction::success; } // end multi-byte processing return result; } template <typename Range> constexpr extraction_attempt next_code_point(const Range &range) { return next_code_point(std::begin(range), std::end(range)); } template <typename T> boost::iterator_range<T> next_character_bytes(const boost::iterator_range<T> &range, const extraction_attempt result) { return boost::make_iterator_range(range.begin(), range.begin() + result.bytes_processed); } template <std::size_t Length> constexpr bool test(const char (&range)[Length], const extraction expected_status, const std::uint32_t expected_code_point, const std::uint8_t expected_bytes_processed) { const extraction_attempt result = next_code_point(std::begin(range), std::end(range) - 1); switch (expected_status) { case extraction::success: return result.status == extraction::success && result.bytes_processed == expected_bytes_processed && result.code_point == expected_code_point; case extraction::failure: return result.status == extraction::failure && result.bytes_processed == expected_bytes_processed; default: return false; } } int main() { static_assert(test("F", extraction::success, 'F', 1), ""); static_assert(test("\0", extraction::success, 0, 1), ""); static_assert(test("\x7F", extraction::success, 0x7F, 1), ""); static_assert(test("\xFF\xFF", extraction::failure, 0, 1), ""); static_assert(test("\xDF", extraction::failure, 0, 1), ""); static_assert(test("\xDF\xFF", extraction::failure, 0, 1), ""); static_assert(test("\xC1\xBF", extraction::failure, 0, 2), ""); static_assert(test("\xC2\x80", extraction::success, 0x80, 2), ""); static_assert(test("\xDF\xBF", extraction::success, 0x07FF, 2), ""); static_assert(test("\xEF\xBF", extraction::failure, 0, 1), ""); 
static_assert(test("\xEF\xBF\xFF", extraction::failure, 0, 2), ""); static_assert(test("\xE0\x9F\xBF", extraction::failure, 0, 3), ""); static_assert(test("\xE0\xA0\x80", extraction::success, 0x800, 3), ""); static_assert(test("\xEF\xBF\xBF", extraction::success, 0xFFFF, 3), ""); static_assert(test("\xF7\xBF\xBF", extraction::failure, 0, 1), ""); static_assert(test("\xF7\xBF\xBF\xFF", extraction::failure, 0, 3), ""); static_assert(test("\xF0\x8F\xBF\xBF", extraction::failure, 0, 4), ""); static_assert(test("\xF0\x90\x80\x80", extraction::success, 0x10000, 4), ""); static_assert(test("\xF4\x8F\xBF\xBF", extraction::success, 0x10FFFF, 4), ""); static_assert(test("\xF7\xBF\xBF\xBF", extraction::failure, 0, 4), ""); static_assert(test("π«", extraction::success, 0x1D56B, 4), ""); constexpr const static char text[] = "Hello γγ«γΎ β¦ π π«βπ’ \x02\x01\b \xff\xff\xff "; std::cout << text << std::endl; auto data = boost::make_iterator_range(text); while (!data.empty()) { const extraction_attempt result = next_code_point(data); switch (result.status) { case extraction::success: if (boost::spirit::char_encoding::unicode::isprint(result.code_point)) { std::cout << next_character_bytes(data, result); break; } default: case extraction::failure: std::cout << "["; std::cout << std::hex << std::setw(2) << std::setfill('0'); for (const auto byte : next_character_bytes(data, result)) { std::cout << int(std::uint8_t(byte)); } std::cout << "]"; break; } data.advance_begin(result.bytes_processed); } return 0; }
Output:
Hello γγ«γΎ β¦ π π«βπ’ Hello γγ«γΎ β¦ π π«βπ’ [02][01][08] [ff][ff][ff] [00]
If my implementation of UTF8->UTF32 scares you, or you need user locale support, consider these alternatives:
std::mbrtoc32 - Surprisingly, although it is the most obvious choice, it is not implemented in libstdc++ or libc++ (perhaps only in development builds?)
- Not reentrant (it depends on the current locale, which had better not change abruptly elsewhere)
- Iterators provided by Boost.
- Throws on invalid sequences, making it unusable here (you cannot advance past a failed sequence).
boost::locale::conv and C++11 std::codecvt - Designed for converting whole ranges between encodings.
- You must either output UTF32 to the console (changing the locale) or convert one character at a time in order to match the original byte(s) with the UTF32 value.
- UTF8-CPP
utf8::next (and the non-throwing utf8::internal::validate_next).
it: a reference to an iterator pointing to the start of a UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
The documentation does not mention side effects on exceptions (there definitely are some).