From any UTF-16 offset, find the corresponding String.Index that lies on a Character boundary

My goal: given an arbitrary UTF-16 position in a String, find the corresponding String.Index that represents the Character (i.e. the extended grapheme cluster) that the specified UTF-16 code unit is part of.

Example:

(I put the code in a Gist for easy copying and pasting.)

This is my test string:

// Test string: one Character (man firefighter with medium-dark skin tone) made of the
// scalars U+1F468 U+1F3FE U+200D U+1F692, i.e. four Unicode scalars / seven UTF-16 code units.
let str = "👨🏾‍🚒"

(Note: to see the string as a single character, you need to read this on a fairly recent OS/browser combination that can handle the profession emoji with skin tones introduced in Unicode 9.)

This is a single Character (grapheme cluster) that consists of four Unicode scalars, or 7 UTF-16 code units:

// Dump the Unicode scalar values of the string in hexadecimal.
print(str.unicodeScalars.map { "0x\(String($0.value, radix: 16))" })
// → ["0x1f468", "0x1f3fe", "0x200d", "0x1f692"]
// Dump the UTF-16 code units of the string in hexadecimal.
print(str.utf16.map { "0x\(String($0, radix: 16))" })
// → ["0xd83d", "0xdc68", "0xd83c", "0xdffe", "0x200d", "0xd83d", "0xde92"]
print(str.utf16.count)
// → 7

Given an arbitrary UTF-16 offset (say, 2), I can create a corresponding String.Index:

// An arbitrary UTF-16 offset; the surrounding text notes that the cluster starts at offset 0, not 2.
let utf16Offset = 2
// Build a String.Index directly from the raw encoded offset.
let utf16Index = String.Index(encodedOffset: utf16Offset)

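I can use this index to subscript the string, but if the index does not fall on a Character boundary, the Character returned by the subscript may not cover the entire grapheme cluster: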

// Subscripting with the mid-cluster index yields only the tail of the cluster.
let char = str[utf16Index]
print(char)
// → 🏾‍🚒
print(char.unicodeScalars.map { "0x\(String($0.value, radix: 16))" })
// → ["0x1f3fe", "0x200d", "0x1f692"]

Or the subscript operation may even trap (I am not sure whether this is intended behavior):

// Offset 1 falls between the two surrogates (0xd83d, 0xdc68) of the first scalar.
let trappingIndex = String.Index(encodedOffset: 1)
str[trappingIndex]
// fatal error: Can't form a Character from a String containing more than one extended grapheme cluster

You can test whether an index falls on a Character boundary:

extension String.Index {
    /// Reports whether `String.Index(_:within:)` accepts this index for `str`.
    ///
    /// NOTE: on Swift 4.0 this check was observed to return `true` for an index in the
    /// middle of a grapheme cluster (reported as SR-5992), so a `true` result is not a
    /// reliable Character-boundary test on that toolchain.
    ///
    /// - Parameter str: The string this index refers into.
    /// - Returns: `true` when the failable conversion succeeds, `false` otherwise.
    func isOnCharacterBoundary(in str: String) -> Bool {
        guard String.Index(self, within: str) != nil else {
            return false
        }
        return true
    }
}

// Check both hand-built indices against the boundary test.
trappingIndex.isOnCharacterBoundary(in: str)
// → false (as expected)
utf16Index.isOnCharacterBoundary(in: str)
// → true (WTF!)

The problem:

I think the problem is that this last expression returns true. The documentation for String.Index.init(_:within:) says:

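If the index passed as sourcePosition represents the start of an extended grapheme cluster - the element type of a string - then the initializer succeeds.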

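Here, utf16Index does not represent the start of an extended grapheme cluster - the cluster starts at offset 0, not at offset 2. Yet the initializer succeeds.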

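As a result, all my attempts to find the start of the grapheme cluster by repeatedly decrementing the index's encodedOffset and testing isOnCharacterBoundary fail.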

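Am I overlooking something? Is there another way to test whether an index falls on the start of a Character? Is this a bug in Swift?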

My environment: Swift 4.0/Xcode 9.0 on macOS 10.13.

Update: there is a related discussion of this question on Twitter.

Update: I reported the behavior of String.Index.init?(_:within:) in Swift 4.0 as a bug: SR-5992.

+4
1

One possible solution, using the rangeOfComposedCharacterSequence(at:) method:

extension String {
    /// Returns the index of the `Character` that contains the given UTF-16 code unit.
    ///
    /// The offset is resolved by walking the string's UTF-16 view rather than through
    /// `String.Index(encodedOffset:)`: that initializer is deprecated, and on Swift 5+
    /// it is not a UTF-16 offset for natively stored (UTF-8) strings, so it yields the
    /// wrong position for non-ASCII content.
    ///
    /// - Parameter utf16Offset: Zero-based offset into `utf16`.
    /// - Returns: The start of the composed character sequence covering that code unit,
    ///   or `nil` when the offset is negative or not less than `utf16.count`.
    func index(utf16Offset: Int) -> String.Index? {
        guard utf16Offset >= 0,
            let position = utf16.index(utf16.startIndex, offsetBy: utf16Offset, limitedBy: utf16.endIndex),
            position != utf16.endIndex
        else { return nil }
        // Snap the (possibly mid-cluster) position back to the start of its grapheme cluster.
        return rangeOfComposedCharacterSequence(at: position).lowerBound
    }
}

Example:

// Mixed test string: ASCII letters separating a profession emoji with skin tone,
// a flag, a simple emoji and a family ZWJ sequence (29 UTF-16 code units in total).
let str = "a👨🏾‍🚒b🇩🇪c😀d👩‍👩‍👧‍👧e"
// For every UTF-16 offset, print the Character that the offset falls inside.
for utf16Offset in 0..<str.utf16.count {
    if let idx = str.index(utf16Offset: utf16Offset) {
        print(utf16Offset, str[idx])
    }
}

Output:

0 a
1 👨🏾‍🚒
2 👨🏾‍🚒
3 👨🏾‍🚒
4 👨🏾‍🚒
5 👨🏾‍🚒
6 👨🏾‍🚒
7 👨🏾‍🚒
8 b
9 🇩🇪
10 🇩🇪
11 🇩🇪
12 🇩🇪
13 c
14 😀
15 😀
16 d
17 👩‍👩‍👧‍👧
18 👩‍👩‍👧‍👧
19 👩‍👩‍👧‍👧
20 👩‍👩‍👧‍👧
21 👩‍👩‍👧‍👧
22 👩‍👩‍👧‍👧
23 👩‍👩‍👧‍👧
24 👩‍👩‍👧‍👧
25 👩‍👩‍👧‍👧
26 👩‍👩‍👧‍👧
27 👩‍👩‍👧‍👧
28 e
+2

Source: https://habr.com/ru/post/1686350/


All Articles