In UTF-8, each code point (= logical symbol) is represented by one or more code units (= char); for example, for ɑftənun:
ch| c.p. | c.u.
--+------+-------
ɑ | 0251 | c9 91
f | 0066 | 66
t | 0074 | 74
ə | 0259 | c9 99
n | 006e | 6e
u | 0075 | 75
n | 006e | 6e
(ch = character, c.p. = code point, c.u. = the UTF-8 code units that encode that code point)
, , ; :
- 0x7f ; ;
- 0x80 ; ;
- , ; , , .
, UTF-8 , .
c9 0a
( , ), , UTF-8, c9 , cu ; , .. , cu , ə.
, ( ), std::string - std::string , std::vector<char>, ; , , / , .
, , ; utf8-cpp , ; utf8::next :
// For every (word, transcription) pair, print one line per code point of the
// transcription: the word, the literal "is ", then the bytes of that code point.
while (source >> word >> word_ipa) {
    auto cp_begin = word_ipa.begin();
    const auto ipa_end = word_ipa.end();
    auto cp_end = cp_begin;
    while (cp_begin != ipa_end) {
        // utf8::next moves cp_end just past the code point starting at cp_begin.
        utf8::next(cp_end, ipa_end);
        myfile << word << "is ";
        // Emit the code units of this code point one by one.
        while (cp_begin != cp_end) {
            myfile << *cp_begin;
            ++cp_begin;
        }
        myfile << "\n";
        cp_begin = cp_end;
    }
}
utf8::next , , ; , , .
, , UTF-8 (. ):
/// Moves `it` forward by `n` positions, refusing to step past `end`.
///
/// @param it   iterator to advance (modified in place on success)
/// @param n    number of positions to move forward
/// @param end  one-past-the-last valid position
/// @throws std::logic_error if fewer than `n` elements remain in [it, end);
///         `it` is left unchanged in that case.
template<typename ItT>
void safe_advance(ItT &it, size_t n, ItT end) {
    // Number of elements still available before hitting `end`.
    const size_t remaining = std::distance(it, end);
    if (remaining < n) {
        throw std::logic_error("Truncated UTF-8 sequence");
    }
    std::advance(it, n);
}
/// Advances `it` past one UTF-8 encoded code point.
///
/// The sequence length is derived from the lead byte only; the continuation
/// bytes themselves are not inspected.
///
/// @param it   iterator positioned on the lead byte (modified in place)
/// @param end  one-past-the-last code unit of the input
/// @throws std::logic_error if [it, end) is empty, if the byte at `it` is not
///         a valid lead byte, or if the sequence would run past `end`.
template<typename ItT>
void my_next(ItT &it, ItT end) {
    // Dereferencing `end` is undefined behaviour, so reject an empty range first.
    if (it == end) throw std::logic_error("Truncated UTF-8 sequence");

    const uint8_t b = static_cast<uint8_t>(*it);
    size_t len = 0;
    if (b >> 7 == 0) len = 1;        // 0xxxxxxx: single-byte (ASCII)
    else if (b >> 5 == 6) len = 2;   // 110xxxxx: two-byte sequence
    else if (b >> 4 == 14) len = 3;  // 1110xxxx: three-byte sequence
    else if (b >> 3 == 30) len = 4;  // 11110xxx: four-byte sequence
    else throw std::logic_error("Invalid UTF-8 sequence");

    safe_advance(it, len, end);
}
, , .
( , UTF-8 UTF-8, , , , )
OTOH, , , :
// For every (word, transcription) pair, print one line per "symbol" of the
// transcription: the word, the literal "is ", the lead byte, and then any
// bytes that follow it whose top two bits are 10.
while (source >> word >> word_ipa) {
    auto pos = word_ipa.begin();
    const auto last = word_ipa.end();
    while (pos != last) {
        const auto lead = *pos;
        ++pos;
        myfile << word << "is " << lead;
        // Only a byte with the high bit set can be followed by continuation bytes.
        if ((uint8_t(lead) & 0x80) != 0) {
            while (pos != last && (uint8_t(*pos) & 0xC0) == 0x80) {
                myfile << *pos;
                ++pos;
            }
        }
        myfile << "\n";
    }
}
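Here a "symbol" is simply a lead code unit followed by every cu whose two most significant bits are 10 (binary, AKA 2 in decimal), i.e. a "continuation cu" in UTF-8 terms.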