Go: How do I know the properties of a Unicode rune?

I want to know the properties of the Unicode runes, especially the value of the script property. Unicode has this to say (at http://www.unicode.org/reports/tr24/ Section 1.5):

The script property assigns a single value to each character, either
explicitly associating it with a particular script, or assigning one
of several specail [sic] values.

Go's unicode package offers me a way to ask, “Is this rune in script x?”, but no way to ask, “Which script is this rune in?”. I could obviously iterate over all the scripts, but that would be wasteful. Is there a smarter way to find a rune's script? (I could always implement a self-organizing list, but I'm looking for something in the standard Go libraries that already does what I want and that I have overlooked.)

Thanks everyone!

+4
source share
2 answers

For example, build a rune-to-script lookup table once at start-up:

package main

import (
    "fmt"
    "unicode"
)

// runeScript maps every rune listed in unicode.Scripts to the name of its
// script. It is populated by init; runes without a script have no entry.
var runeScript map[rune]string

// init expands every range table in unicode.Scripts into runeScript, adding
// one entry per code point.
func init() {
    // nChar is only a capacity hint (padded by 25%); the map still grows on
    // demand if a newer Unicode version defines more characters.
    const nChar = 128172 // Version 9.0.0
    runeScript = make(map[rune]string, nChar*125/100)
    for s, rt := range unicode.Scripts {
        // Count with a rune (int32) instead of the range's own integer type.
        // A uint16 counter wraps to a small value when Hi+Stride exceeds
        // 0xFFFF, so `i <= r.Hi` would stay true and the loop would never end.
        for _, r := range rt.R16 {
            for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
                runeScript[c] = s
            }
        }
        // Code points never exceed unicode.MaxRune, so they fit in a rune.
        for _, r := range rt.R32 {
            for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
                runeScript[c] = s
            }
        }
    }
}

// script reports the script name recorded for r in runeScript, or the empty
// string when the table has no entry for r.
func script(r rune) string {
    if name, ok := runeScript[r]; ok {
        return name
    }
    return ""
}

func main() {
    chars := []rune{' ', '0', 'a', 'α', '', 'ㄱ'}
    for _, c := range chars {
        s := script(c)
        fmt.Printf("%q %s\n", c, s)
    }
}

Output:

$ go run script.go
' ' Common
'0' Common
'a' Latin
'α' Greek
'ж' Cyrillic
'ㄱ' Hangul
$ 
+5

PeterSO

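PeterSO's answer is nice and clear. However, it is not frugal with memory: it stores more than a hundred thousand entries in a map whose values are of type string. Even though a string value is just a small header holding a pointer and a length (see reflect.StringHeader), that many of them in a map still adds up to several megabytes (around 6 MB)!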

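Since the number of distinct string values (the script names) is small (137), we can use byte as the map's value type: it is just an index into a slice that holds the actual script names.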

This is how it could look:

// runeScript maps every rune listed in unicode.Scripts to the index of its
// script name in names. Runes without a script have no entry, so a lookup
// yields the zero index.
var runeScript map[rune]byte

// names holds the script names addressed by the indices in runeScript. Index
// 0 is reserved for the empty string, the answer for runes without a script.
var names = []string{""}

// init assigns each script in unicode.Scripts the next free index in names
// and records that index for every code point of the script.
func init() {
    // nChar is only a capacity hint (padded by 25%); the map still grows on
    // demand if a newer Unicode version defines more characters.
    const nChar = 128172 // Version 9.0.0
    runeScript = make(map[rune]byte, nChar*125/100)
    for s, rt := range unicode.Scripts {
        // A byte can address indices 0..255 only. Converting a larger length
        // would silently wrap and make distinct scripts share an index, so
        // treat that as a broken invariant instead of returning wrong names.
        if len(names) > 255 {
            panic("script: more than 255 scripts cannot be indexed by a byte")
        }
        idx := byte(len(names))
        names = append(names, s)
        // Count with a rune (int32) instead of the range's own integer type.
        // A uint16 counter wraps to a small value when Hi+Stride exceeds
        // 0xFFFF, so `i <= r.Hi` would stay true and the loop would never end.
        for _, r := range rt.R16 {
            for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
                runeScript[c] = idx
            }
        }
        // Code points never exceed unicode.MaxRune, so they fit in a rune.
        for _, r := range rt.R32 {
            for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
                runeScript[c] = idx
            }
        }
    }
}

func script(r rune) string {
    return names[runeScript[r]]
}

func main() {
    chars := []rune{' ', '0', 'a', 'α', '', 'ㄱ'}
    for _, c := range chars {
        s := script(c)
        fmt.Printf("%q %s\n", c, s)
    }
}

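This simple improvement needs only about a third of the memory used by map[rune]string. The output is the same (try it on the Go Playground):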

' ' Common
'0' Common
'a' Latin
'α' Greek
'ж' Cyrillic
'ㄱ' Hangul

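Using map[rune]byte still costs around 2 MB of RAM, and building the map takes some time, which may or may not be acceptable.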

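There is another approach/solution. Instead of building a map of "all" runes, we can store just a slice of all the ranges (actually 2 slices: one for the 16-bit Unicode values, the other for the 32-bit Unicode code points).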

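The gain comes from the fact that there are far fewer ranges than runes: only 852 (versus more than 100,000 runes). The memory used by 2 slices with 852 elements in total is negligible compared with solution #1.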

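In our script() function we locate the range with a binary search. That is slower than the map lookup of solution #1, but with only 852 ranges the search needs very few steps.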

, (~ 400 , : 7 max, : 15 ).

Now for the implementation. Wrapper types that pair a range with its script name:

// myR16 pairs one 16-bit Unicode range with the name of the script it
// belongs to.
type myR16 struct {
    // r16 is the range, copied from a script's R16 table.
    r16    unicode.Range16
    // script is the key of the unicode.Scripts entry the range came from.
    script string
}

// myR32 pairs one 32-bit Unicode range with the name of the script it
// belongs to.
type myR32 struct {
    // r32 is the range, copied from a script's R32 table.
    r32    unicode.Range32
    // script is the key of the unicode.Scripts entry the range came from.
    script string
}

The package-level slices that hold them:

// allR16 holds the 16-bit ranges of every script; init appends to it and then
// sorts it by the ranges' Lo value.
var allR16 = []*myR16{}
// allR32 holds the 32-bit ranges of every script; init appends to it and then
// sorts it by the ranges' Lo value.
var allR32 = []*myR32{}

Building/initializing them:

func init() {
    for script, rt := range unicode.Scripts {
        for _, r16 := range rt.R16 {
            allR16 = append(allR16, &myR16{r16, script})
        }
        for _, r32 := range rt.R32 {
            allR32 = append(allR32, &myR32{r32, script})
        }
    }

    // sort
    sort.Slice(allR16, func(i int, j int) bool {
        return allR16[i].r16.Lo < allR16[j].r16.Lo
    })
    sort.Slice(allR32, func(i int, j int) bool {
        return allR32[i].r32.Lo < allR32[j].r32.Lo
    })
}

And finally the lookup, which uses binary search:

// script returns the name of the script whose range contains r, or "" when
// no range in allR16 / allR32 covers r.
//
// The lookup is a binary search, so both slices must be sorted in ascending
// order (init sorts them by Lo) and are assumed to hold non-overlapping
// ranges. Stride is not consulted: every range is treated as contiguous.
func script(r rune) string {
    // Negative values are not code points. Without this guard uint16(r) keeps
    // only the low 16 bits, so e.g. -0x10000+'a' would be searched as 'a' and
    // wrongly report a script.
    if r < 0 {
        return ""
    }

    // binary search over ranges
    if r <= 0xffff {
        r16 := uint16(r)
        i := sort.Search(len(allR16), func(i int) bool {
            return allR16[i].r16.Hi >= r16
        })

        // sort.Search already guarantees Hi >= r16 for any index it returns
        // below len, so only the lower bound is left to check.
        if i < len(allR16) && allR16[i].r16.Lo <= r16 {
            return allR16[i].script
        }
        // unicode.Range32 bounds are always >= 1<<16, so a rune that fits in
        // 16 bits cannot be found in allR32; skip the second search.
        return ""
    }

    r32 := uint32(r)
    i := sort.Search(len(allR32), func(i int) bool {
        return allR32[i].r32.Hi >= r32
    })

    if i < len(allR32) && allR32[i].r32.Lo <= r32 {
        return allR32[i].script
    }

    return ""
}

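Note: Stride is always 1 for every script in the unicode package, so I took advantage of this (and left it out of the algorithm).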

The output is the same, of course. Try it on the Go Playground.

+3

Source: https://habr.com/ru/post/1673295/


All Articles