How to reverse a string containing surrogate pairs

I wrote this method to reverse a string

/// <summary>
/// Reverses <paramref name="s"/> one text element at a time, so that the
/// UTF-16 code units belonging to a single text element (as reported by
/// <see cref="StringInfo.GetTextElementEnumerator(string)"/>) keep their
/// original internal order.
/// </summary>
/// <param name="s">The string to reverse; may be null or empty.</param>
/// <returns>
/// <paramref name="s"/> itself when it is null or empty; otherwise a new
/// string whose text elements appear in the opposite order.
/// </returns>
public string Reverse(string s)
        {
            // Nothing to reverse: hand null / empty input straight back.
            if (string.IsNullOrEmpty(s))
            {
                return s;
            }

            // Keep every text element as its own string. Reversing the list of
            // strings flips the element order without disturbing the code
            // units inside any element.
            var textElements = new List<string>();
            TextElementEnumerator walker = StringInfo.GetTextElementEnumerator(s);
            while (walker.MoveNext())
            {
                textElements.Add(walker.GetTextElement());
            }

            textElements.Reverse();
            return string.Concat(textElements);
        }

Now I do not want to start a discussion about how this code can be made more efficient or how there is one liner that I could use instead. I know that you can do Xors and all sorts of other things to potentially improve this code. If I want to reorganize the code later, I could do it easily, since I have unit tests.

Currently, this correctly reverses BMP strings (including strings with accents, such as "Les Misérables") and strings containing combining characters, such as "Les Mise\u0301rables".

My test containing surrogate pairs works if they are expressed as follows

Assert.AreEqual("𠈓", _stringOperations.Reverse("𠈓"));

But if I express surrogate pairs like this

Assert.AreEqual("\u10000", _stringOperations.Reverse("\u10000"));

then the test fails. Is there an airtight implementation that also supports surrogate pairs?

- , , , .

+4
4

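"\u10000" is a string of two characters: က (code point U+1000) followed by the digit 0 (you can confirm this by inspecting the value of s inside your method). Reversing those two characters gives a string that no longer matches the input.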

What you want, it seems, is the Unicode Character 'LINEAR B SYLLABLE B008 A' (U+10000), whose hexadecimal code is 10000. The Unicode escape sequences are defined on MSDN as:

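\u hex-digit hex-digit hex-digit hex-digit

\U hex-digit hex-digit hex-digit hex-digit hex-digit hex-digit hex-digit hex-digit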

So an escape must have either exactly four or exactly eight hex digits.

Use \U00010000 (note the capital U) or \uD800\uDC00 instead of \u10000.

+5

Necromancing.

To fix this, use List<string>.Reverse instead of List<char>.Reverse:

// using System.Globalization;

// Each list entry is one complete text element, so reversing the list never
// separates a base character from the combining mark that follows it.
var elements = new List<string>();

for (TextElementEnumerator enumerator = StringInfo.GetTextElementEnumerator("Les Mise\u0301rables");
     enumerator.MoveNext();)
{
    elements.Add(enumerator.GetTextElement());
}

elements.Reverse();
string reversed = string.Concat(elements);  // selbarésiM seL

Jon Skeet pony : https://vimeo.com/7403673

(, ):

/// <summary>
/// Demonstrates reversing a string by text element rather than by UTF-16
/// code unit.
/// </summary>
public static class Test
{

    /// <summary>
    /// Splits <paramref name="s"/> into the text elements reported by
    /// <see cref="System.Globalization.StringInfo.GetTextElementEnumerator(string)"/>.
    /// </summary>
    /// <param name="s">The string to split.</param>
    /// <returns>The text elements of <paramref name="s"/>, in their original order.</returns>
    private static System.Collections.Generic.List<string> GraphemeClusters(string s)
    {
        var clusters = new System.Collections.Generic.List<string>();

        for (System.Globalization.TextElementEnumerator cursor = System.Globalization.StringInfo.GetTextElementEnumerator(s);
             cursor.MoveNext();)
        {
            // Current is typed as object; the enumerator yields strings.
            clusters.Add((string)cursor.Current);
        }

        return clusters;
    }


    /// <summary>
    /// Returns <paramref name="s"/> with the order of its text elements reversed.
    /// </summary>
    /// <param name="s">The string to reverse; may be null or empty.</param>
    /// <returns>
    /// <paramref name="s"/> itself when it is null, empty, or a single char;
    /// otherwise the text elements of <paramref name="s"/> joined in reverse order.
    /// </returns>
    private static string ReverseGraphemeClusters(string s)
    {
        // Only strings with at least two chars can change when reversed.
        if (!string.IsNullOrEmpty(s) && s.Length != 1)
        {
            System.Collections.Generic.List<string> clusters = GraphemeClusters(s);
            clusters.Reverse();

            return string.Concat(clusters.ToArray());
        }

        return s;
    }

    /// <summary>
    /// Reverses a sample string containing a combining acute accent and writes
    /// the result to the console.
    /// </summary>
    public static void TestMe()
    {
        string sample = "Les Mise\u0301rables";
        string output = ReverseGraphemeClusters(sample);

        // Reversing the raw char[] instead (ToCharArray + System.Array.Reverse
        // + new string) would be wrong: the combining accent U+0301 would end
        // up in front of its base letter 'e' instead of after it.

        System.Console.WriteLine(output);
    }
}

,
-
- (8 ) /​​ (32 )
- GraphemeCluster [32+ ] ( Grapheme/Glyph)

:

- , .

- . . - , Unicode.

- , , . , a, ä , (, ä , a, ; , , ). - (, ).

- , ( ), . , , ä , , , . OTF GSUB GPOS , . .

+1

. , , , , .

/// <summary>
/// Reverses a string by UTF-16 code unit while keeping every surrogate pair
/// (high surrogate followed by low surrogate) in its original order.
/// </summary>
/// <param name="stringToReverse">The string to reverse; may be null or empty.</param>
/// <returns>
/// <paramref name="stringToReverse"/> itself when it is null or empty;
/// otherwise the reversed string. Combining character sequences are NOT kept
/// together; only surrogate pairs are.
/// </returns>
internal static string ReverseItWithSurrogate(string stringToReverse)
{
    if (string.IsNullOrEmpty(stringToReverse))
    {
        return stringToReverse;
    }

    var sb = new StringBuilder(stringToReverse.Length);

    // Walk from the back. Whether the current position ends a surrogate pair
    // is decided afresh for every index, so no state leaks from one iteration
    // into the next (a carried-over flag previously caused the character at
    // index 0 to be dropped when a pair occupied indices 1 and 2).
    for (int index = stringToReverse.Length - 1; index >= 0; index--)
    {
        char current = stringToReverse[index];

        bool endsSurrogatePair =
            index > 0 &&
            char.IsLowSurrogate(current) &&
            char.IsHighSurrogate(stringToReverse[index - 1]);

        if (endsSurrogatePair)
        {
            // Emit high then low so the pair stays well-formed, and skip the
            // high surrogate on the next iteration.
            sb.Append(stringToReverse[index - 1]);
            sb.Append(current);
            index--;
        }
        else
        {
            sb.Append(current);
        }
    }

    return sb.ToString();
}
0

Chrome!

using System.Linq;
using System.Collections.Generic;
using System;
using System.Globalization;
using System.Diagnostics;
using System.Collections;
namespace OrisNumbers
{
    /// <summary>
    /// Extension methods for the non-generic <see cref="IEnumerator"/>.
    /// </summary>
    public static class IEnumeratorExtensions
    {
        /// <summary>
        /// Exposes the remaining items of a non-generic enumerator as a lazily
        /// evaluated <see cref="IEnumerable{T}"/>, casting each item to
        /// <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">The type each item is cast to.</typeparam>
        /// <param name="iterator">The enumerator to drain.</param>
        /// <returns>A sequence of the enumerator's items cast to <typeparamref name="T"/>.</returns>
        public static IEnumerable<T> AsIEnumerable<T>(this IEnumerator iterator)
        {
            for (; iterator.MoveNext();)
            {
                object item = iterator.Current;
                yield return (T)item;
            }
        }
    }

    /// <summary>
    /// Writes a sample string and its text-element-wise reversal to the debug output.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string sample = "foo 𝌆 bar mañana mañana";
            Debug.WriteLine(sample);

            // Normalize first, then reverse the order of the text elements.
            TextElementEnumerator textElements = StringInfo.GetTextElementEnumerator(sample.Normalize());
            IEnumerable<string> backwards = textElements.AsIEnumerable<string>().Reverse();
            Debug.WriteLine(string.Join("", backwards));

            // Wait for input before the process exits.
            Console.Read();
        }
    }
}
0
source

Source: https://habr.com/ru/post/1529551/


All Articles