Character set expansion algorithm?

Are there any ready-made functions for expanding a C# regex-style character set?

For example, expand("a-z1") returns a string containing all characters from a to z, followed by the digit 1.


Here is what I have so far:

/// <summary>
/// Expands regex-style ranges (for example <c>a-f</c>) found in <paramref name="set"/> into the
/// individual characters they cover. Characters that are not part of a range are copied through.
/// </summary>
/// <param name="set">Character set body without brackets, e.g. <c>"a-fA-F0-9"</c>.</param>
/// <returns>
/// The expanded characters in the order they appear in <paramref name="set"/>. A range whose upper
/// bound is smaller than its lower bound contributes no characters.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
public static string ExpandCharacterSet(string set)
{
    if (set == null)
        throw new ArgumentNullException("set");

    var sb = new StringBuilder();
    int start = 0;
    while (start < set.Length - 1)
    {
        // Searching from start + 1 means a dash in the first position is kept as a literal.
        int dash = set.IndexOf('-', start + 1);

        // No further dash, or a trailing dash without an upper bound: the remainder is literal.
        if (dash <= 0 || dash >= set.Length - 1)
            break;

        // Copy the literals that precede the lower bound of the range.
        sb.Append(set.Substring(start, dash - start - 1));

        char a = set[dash - 1];
        char z = set[dash + 1];

        // Count with an int: a char counter wraps from char.MaxValue back to 0, so the
        // loop would never terminate when the upper bound is char.MaxValue.
        for (int i = a; i <= z; ++i)
            sb.Append((char)i);

        start = dash + 2;
    }

    sb.Append(set.Substring(start));
    return sb.ToString();
}

Is there anything I am missing?

PS: Negated character sets can be ignored for now.


I thought my example was perfectly clear... let me try again. This is what I want:

ExpandCharacterSet("a-fA-F0-9") == "abcdefABCDEF0123456789"
+3
source share
4 answers
// Every UTF-16 code unit that is not a control character, in ascending order.
// This is the universe a negated set ([^...]) is subtracted from; materialized once.
private static readonly IEnumerable<char> CharacterSet = Enumerable.Range(0, char.MaxValue + 1).Select(i => (char)i).Where(c => !char.IsControl(c)).ToArray();

/// <summary>
/// Expands a regex-style character set such as <c>"[a-fA-F0-9]"</c> into the characters it covers.
/// Surrounding brackets are optional, and a leading <c>^</c> negates the set.
/// </summary>
/// <param name="set">The character set, with or without enclosing <c>[</c> and <c>]</c>.</param>
/// <returns>
/// For a plain set, the expanded characters in input order. For a negated set, every
/// non-control character that is not in the expanded set, in ascending order.
/// An empty set (<c>""</c> or <c>"[]"</c>) yields an empty string.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
public static string ExpandCharacterSet(string set)
{
    if (set == null)
        throw new ArgumentNullException("set");

    var sb = new StringBuilder();
    int start = 0;
    bool invertSet = false;

    if (set.Length == 0)
        return "";
    if (set[0] == '[' && set[set.Length - 1] == ']')
        set = set.Substring(1, set.Length - 2);

    // "[]" leaves an empty string here, so the length must be checked before indexing.
    if (set.Length > 0 && set[0] == '^')
    {
        invertSet = true;
        set = set.Substring(1);
    }

    while (start < set.Length - 1)
    {
        // Searching from start + 1 means a dash in the first position is kept as a literal.
        int dash = set.IndexOf('-', start + 1);

        // No further dash, or a trailing dash without an upper bound: the remainder is literal.
        if (dash <= 0 || dash >= set.Length - 1)
            break;

        // Copy the literals that precede the lower bound of the range.
        sb.Append(set.Substring(start, dash - start - 1));

        char a = set[dash - 1];
        char z = set[dash + 1];

        // Count with an int: a char counter wraps from char.MaxValue back to 0, so the
        // loop would never terminate when the upper bound is char.MaxValue.
        for (int i = a; i <= z; ++i)
            sb.Append((char)i);

        start = dash + 2;
    }

    sb.Append(set.Substring(start));

    if (!invertSet) return sb.ToString();

    // Filter the ordered universe instead of enumerating a HashSet, so the result order
    // does not depend on HashSet implementation details.
    var excluded = new HashSet<char>(sb.ToString());
    return new string(CharacterSet.Where(c => !excluded.Contains(c)).ToArray());
}
0
source

, , , . , , . . , . , - . , , , . , . .NET Framework 4.0:

/// <summary>
/// Expands a regex character class by driving the regex library's own non-public parser
/// through reflection, then enumerating the ranges it produced.
/// </summary>
/// <remarks>
/// NOTE(review): every type and member below is looked up by name with
/// <c>BindingFlags.NonPublic</c>; none of the lookups is null-checked. The surrounding text says
/// this targets .NET Framework 4.0 - confirm the members exist on the runtime you deploy to.
/// </remarks>
public static class RegexHelper
{
    /// <summary>
    /// Parses <paramref name="charClass"/> with the internal parser and returns the characters
    /// of every parsed range except the first, concatenated in range order.
    /// </summary>
    /// <param name="charClass">The character class text handed to the parser, e.g. <c>[\-a-zA-F1 5-9]</c>.</param>
    /// <returns>The concatenated expansion of ranges 1..count-1.</returns>
    public static string ExpandCharClass(string charClass)
    {
        var regexParser = new RegexParser(CultureInfo.CurrentCulture);
        regexParser.SetPattern(charClass);
        // false = not case-insensitive (see the wrapper's parameter name).
        var regexCharClass = regexParser.ScanCharClass(false);
        int count = regexCharClass.RangeCount();
        List<string> ranges = new List<string>();
        // range 0 can be skipped
        // NOTE(review): presumably range 0 is the leading '[' of the input - confirm against the parser.
        for (int i = 1; i < count; i++)
        {
            var range = regexCharClass.GetRangeAt(i);
            ranges.Add(ExpandRange(range));
        }
        return String.Concat(ranges);
    }

    /// <summary>
    /// Returns every character from <c>range._first</c> through <c>range._last</c> inclusive.
    /// </summary>
    static string ExpandRange(SingleRange range)
    {
        char first = range._first;
        char last = range._last;
        // Range takes (start, count); the +1 makes the upper bound inclusive.
        return String.Concat(Enumerable.Range(first, last - first + 1).Select(i => (char)i));
    }

    /// <summary>
    /// Reflection wrapper around the non-public type
    /// <c>System.Text.RegularExpressions.RegexParser</c> from the assembly that contains <c>Regex</c>.
    /// </summary>
    internal class RegexParser
    {
        static readonly Type RegexParserType;
        static readonly ConstructorInfo RegexParser_Ctor;
        static readonly MethodInfo RegexParser_SetPattern;
        static readonly MethodInfo RegexParser_ScanCharClass;

        // Resolves the wrapped type and its members once, by name and parameter signature.
        static RegexParser()
        {
            RegexParserType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexParser");

            var flags = BindingFlags.NonPublic | BindingFlags.Instance;
            RegexParser_Ctor = RegexParserType.GetConstructor(flags, null, new[] { typeof(CultureInfo) }, null);
            RegexParser_SetPattern = RegexParserType.GetMethod("SetPattern", flags, null, new[] { typeof(String) }, null);
            RegexParser_ScanCharClass = RegexParserType.GetMethod("ScanCharClass", flags, null, new[] { typeof(Boolean) }, null);
        }

        // The wrapped internal parser object.
        private readonly object instance;

        /// <summary>Creates the wrapped parser by invoking its constructor with <paramref name="culture"/>.</summary>
        internal RegexParser(CultureInfo culture)
        {
            instance = RegexParser_Ctor.Invoke(new object[] { culture });
        }

        /// <summary>Forwards <paramref name="pattern"/> to the wrapped parser's <c>SetPattern</c>.</summary>
        internal void SetPattern(string pattern)
        {
            RegexParser_SetPattern.Invoke(instance, new object[] { pattern });
        }

        /// <summary>
        /// Invokes the wrapped parser's <c>ScanCharClass</c> and wraps the object it returns.
        /// </summary>
        internal RegexCharClass ScanCharClass(bool caseInsensitive)
        {
            return new RegexCharClass(RegexParser_ScanCharClass.Invoke(instance, new object[] { caseInsensitive }));
        }
    }

    /// <summary>
    /// Reflection wrapper around the non-public type
    /// <c>System.Text.RegularExpressions.RegexCharClass</c>.
    /// </summary>
    internal class RegexCharClass
    {
        static readonly Type RegexCharClassType;
        static readonly MethodInfo RegexCharClass_RangeCount;
        static readonly MethodInfo RegexCharClass_GetRangeAt;

        // Resolves the wrapped type and its two accessor methods once.
        static RegexCharClass()
        {
            RegexCharClassType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexCharClass");

            var flags = BindingFlags.NonPublic | BindingFlags.Instance;
            RegexCharClass_RangeCount = RegexCharClassType.GetMethod("RangeCount", flags, null, new Type[] { }, null);
            RegexCharClass_GetRangeAt = RegexCharClassType.GetMethod("GetRangeAt", flags, null, new[] { typeof(Int32) }, null);
        }

        // The wrapped internal character-class object.
        private readonly object instance;

        /// <summary>Wraps <paramref name="regexCharClass"/> after checking it is exactly the expected internal type.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="regexCharClass"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="regexCharClass"/> is of a different type.</exception>
        internal RegexCharClass(object regexCharClass)
        {
            if (regexCharClass == null)
                throw new ArgumentNullException("regexCharClass");
            if (regexCharClass.GetType() != RegexCharClassType)
                throw new ArgumentException("not an instance of a RegexCharClass object", "regexCharClass");
            instance = regexCharClass;
        }

        /// <summary>Returns the result of the wrapped object's <c>RangeCount()</c>.</summary>
        internal int RangeCount()
        {
            return (int)RegexCharClass_RangeCount.Invoke(instance, new object[] { });
        }

        /// <summary>Returns the wrapped object's range at index <paramref name="i"/>, copied into a <see cref="SingleRange"/>.</summary>
        internal SingleRange GetRangeAt(int i)
        {
            return new SingleRange(RegexCharClass_GetRangeAt.Invoke(instance, new object[] { i }));
        }
    }

    /// <summary>
    /// Copy of the <c>_first</c>/<c>_last</c> fields of the non-public nested type
    /// <c>System.Text.RegularExpressions.RegexCharClass+SingleRange</c>.
    /// </summary>
    internal struct SingleRange
    {
        static readonly Type RegexCharClassSingleRangeType;
        static readonly FieldInfo SingleRange_first;
        static readonly FieldInfo SingleRange_last;

        // Resolves the nested type ('+' separates a nested type name) and its two fields once.
        static SingleRange()
        {
            RegexCharClassSingleRangeType = Assembly.GetAssembly(typeof(Regex)).GetType("System.Text.RegularExpressions.RegexCharClass+SingleRange");

            var flags = BindingFlags.NonPublic | BindingFlags.Instance;
            SingleRange_first = RegexCharClassSingleRangeType.GetField("_first", flags);
            SingleRange_last = RegexCharClassSingleRangeType.GetField("_last", flags);
        }

        // First and last character of the range; ExpandRange treats both as inclusive.
        internal char _first;
        internal char _last;

        /// <summary>Reads <c>_first</c> and <c>_last</c> from <paramref name="singleRange"/> via reflection.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="singleRange"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="singleRange"/> is of a different type.</exception>
        internal SingleRange(object singleRange)
        {
            if (singleRange == null)
                throw new ArgumentNullException("singleRange");
            if (singleRange.GetType() != RegexCharClassSingleRangeType)
                throw new ArgumentException("not an instance of a SingleRange object", "singleRange");
            _first = (char)SingleRange_first.GetValue(singleRange);
            _last = (char)SingleRange_last.GetValue(singleRange);
        }
    }
}

// Usage: a bracketed class with an escaped dash, three ranges and the literals '1' and ' '.
RegexHelper.ExpandCharClass(@"[\-a-zA-F1 5-9]");
// Result reported by the author: "-abcdefghijklmnopqrstuvwxyzABCDEF1 56789"
+2

, 96 , ( ), :

/// <summary>
/// Returns every candidate character that, taken on its own as a one-character string,
/// is matched by the regular expression <paramref name="input_re"/>.
/// </summary>
/// <param name="input_re">Regular expression to test, e.g. <c>[^A-B]|[0123a-cg-h]</c>.</param>
/// <param name="candidates">
/// Characters to test, in the order they should appear in the result. When omitted or
/// <c>null</c>, the ASCII letters (upper case, then lower case) followed by the digits are used.
/// </param>
/// <returns>The matching candidates, in candidate order.</returns>
public static string expando(string input_re, string candidates = null) {

    // Default alphabet; pass candidates to cover more chars, such as ,.?/|=+_-éñ etc.
    string s = candidates ?? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

    // Fully qualified so the snippet does not depend on a using directive for System.Text.
    var output = new System.Text.StringBuilder(s.Length);

    Regex exp = new Regex(input_re);

    foreach (char candidate in s) {
        if (exp.IsMatch(candidate.ToString())) {
            output.Append(candidate);
        }
    }

    return output.ToString();
}

, , , [^A-B]|[0123a-cg-h].

+1
source

Something like that?

var input = "a-fA-F0-9!";
var list = new StringBuilder();

// Tokenize: each match is either a three-character range "x-y" or one literal character.
foreach (Match token in Regex.Matches(input, @".-.|."))
{
    var text = token.Value;

    if (text.Length != 1)
    {
        // Range token: text[0] is the lower bound, text[2] the upper bound.
        char low = text[0];
        char high = text[2];
        if (high < low) throw new ArgumentException("invalid format"); // or switch, if you want.

        char current = low;
        while (current <= high)
        {
            list.Append(current);
            current++;
        }

        continue;
    }

    // Literal token: copy it through unchanged.
    list.Append(text);
}

Console.WriteLine(list);

Output:

abcdefABCDEF0123456789!

The moral, of course, is to solve your regular expression problems with lots of regular expressions!


Here is a version with support for escape characters. It all depends on how robust you would like it to be... for example, I'm not doing anything special here to handle surrogates, so that probably won't work. Also, if you are trying to match the performance of the current regex engine exactly, you need to know precisely what all the parameters are, which would be quite a lot of work.

// Expands the character set in `input` (ranges and escapes included) and prints the result.
void Main()
{
    // Equivalent spellings of the same set: @"\65-\70\65" (decimal escapes) and "A-FA" (literals).
    var input = @"\x41-\0x46\u41";

    // match hex as \0x123 or \x123 or \u123, or decimal \412, or the escapes \n\t\r, or any character
    var charRegex = @"(\\(0?x|u)[0-9a-fA-F]+|\\[0-9]+|\\[ntr]|.)";
    var matches = Regex.Matches(input, charRegex + "-" + charRegex + "|" + charRegex);

    var list = new StringBuilder();

    foreach (Match m in matches)
    {
        var dashIndex = m.Value.IndexOf('-', 1); //don't look at 0 (in case it is a dash)
        if (dashIndex > 0) // this means we have two items: a range
        {
            var charLeft = Decode(m.Value.Substring(0, dashIndex));
            var charRight = Decode(m.Value.Substring(dashIndex + 1));
            if (charRight < charLeft) throw new ArgumentException("invalid format (left bigger than right)"); // or switch, if you want.

            // Count with an int: a char counter wraps from char.MaxValue back to 0, so a range
            // ending at \xFFFF would never terminate.
            for (int code = charLeft; code <= charRight; code++)
                list.Append((char)code);
        }
        else // just one item
        {
            list.Append(Decode(m.Value));
        }
    }

    Console.WriteLine(list);
}

// Converts one token (a literal character or a backslash escape) into the character it denotes.
// Accepted escapes: \n \t \r, hex as \xNN, \uNN or \0xNN, and decimal as \NNN.
char Decode(string s)
{
    // A single character stands for itself.
    if (s.Length == 1)
    {
        return s[0];
    }

    // From here on s[0] is the backslash (guaranteed by the tokenizing regex).
    char marker = s[1];

    if (s.Length == 2)
    {
        // incomplete; add more named escapes as wished
        if (marker == 'n') return '\n';
        if (marker == 't') return '\t';
        if (marker == 'r') return '\r';
    }

    bool shortHexPrefix = marker == 'u' || marker == 'x';
    if (shortHexPrefix)
    {
        return (char)Convert.ToUInt16(s.Substring(2), 16);
    }

    bool longHexPrefix = s.Length > 2 && marker == '0' && s[2] == 'x';
    return longHexPrefix
        ? (char)Convert.ToUInt16(s.Substring(3), 16)
        : (char)Convert.ToUInt16(s.Substring(1)); // will fail from here if invalid escape (e.g. \g)
}
+1
source

Source: https://habr.com/ru/post/1776801/


All Articles