2

I want to parse a string that contains a number in either decimal, octal or hex formats, and I want to do so with zero heap allocations. Therefore Regex and substrings are out.

The base is determined via a prefix:

  • 1234 is decimal
  • 01234 is octal due to leading zero (668 decimal)
  • 0x1234 is hex due to leading 0x (4660 decimal)

Hex values may be prefixed with 0x, 0X or 0h. The A-F characters may be in upper or lower case.

The method should have signature:

bool TryParse(string s, out int i)

The method should stop parsing once it reaches the end of s, or once a space is reached. That is, all of the following should produce the same integer value of 1234:

"1234"
"1234  "
"0X4D2"
"0h4D2"
"0x4d2"
"0x4D2  "
"02322"
"02322  "

I'm going to set about coding this myself, but wanted to put it out there in case someone can share an existing solution. If not, I'll share my solution once it's done.

Drew Noakes
  • 300,895
  • 165
  • 679
  • 742

1 Answer

2

I ended up writing a state machine to parse such values in a single pass, and without any heap allocation (as far as I can tell).

Here's the method:

/// <summary>
/// Parses an integer written in decimal, octal or hexadecimal notation in a single pass
/// over the characters of <paramref name="s"/>, without creating substrings or using Regex.
/// </summary>
/// <remarks>
/// The base is selected by the prefix: a leading <c>0x</c>, <c>0X</c> or <c>0h</c> means
/// hexadecimal, any other leading <c>0</c> means octal, anything else is decimal.
/// Leading white space is skipped, a single <c>-</c> may precede the number, and parsing
/// stops at the first white-space character after the number (whatever follows is ignored).
/// Only ASCII digits and the letters A-F / a-f are accepted as digits.
/// </remarks>
/// <param name="s">The text to parse. <c>null</c> is treated as unparsable.</param>
/// <param name="i">The parsed value on success; <c>0</c> on failure.</param>
/// <returns>
/// <c>true</c> when a well-formed number that fits in an <see cref="int"/> was read;
/// otherwise <c>false</c> (malformed text, empty input, or a value outside the int range).
/// </returns>
public static bool TryParse(string s, out int i)
{
    const int starting = 0;     // only leading white space seen so far
    const int negative = 1;     // a single '-' has been consumed
    const int leadingZero = 2;  // a '0' has been consumed; the base is not known yet
    const int dec = 3;
    const int oct = 4;
    const int startingHex = 5;  // hex prefix consumed, but no hex digit yet
    const int hex = 6;

    // Largest magnitude a negative result may have (-int.MinValue does not fit in an int).
    const long maxNegativeMagnitude = 2147483648L;

    i = 0;

    if (s == null)
    {
        return false;
    }

    var state = starting;
    var isNegative = false;

    // The magnitude is accumulated in a long so that exceeding the int range can be
    // detected instead of silently wrapping around.
    long value = 0;

    for (var idx = 0; idx < s.Length; idx++)
    {
        var c = s[idx];

        switch (state)
        {
            case starting:
            case negative:
            {
                if (c == '0')
                {
                    state = leadingZero;
                }
                else if (c >= '1' && c <= '9')
                {
                    // Explicit ASCII range: char.IsDigit would also accept other Unicode
                    // digits, for which "c - '0'" is meaningless.
                    value = c - '0';
                    state = dec;
                }
                else if (state == starting && c == '-')
                {
                    isNegative = true;
                    state = negative;
                }
                else if (state == starting && char.IsWhiteSpace(c))
                {
                    // Leading white space is skipped; it is not allowed after the sign.
                }
                else
                {
                    return false;
                }
                break;
            }
            case leadingZero:
            {
                if (c == 'x' || c == 'X' || c == 'h')
                {
                    state = startingHex;
                }
                else if (c >= '0' && c <= '7')
                {
                    // Only octal digits may follow the leading zero ("09" is malformed).
                    value = c - '0';
                    state = oct;
                }
                else
                {
                    // White space terminates the number "0"; anything else is malformed.
                    // i is already 0 in both cases.
                    return char.IsWhiteSpace(c);
                }
                break;
            }
            default: // dec, oct, startingHex, hex: accumulate digits in the selected base
            {
                var radix = state == dec ? 10 : state == oct ? 8 : 16;

                int digit;
                if (c >= '0' && c <= '9')
                {
                    digit = c - '0';
                }
                else if (c >= 'a' && c <= 'f')
                {
                    digit = c - 'a' + 10;
                }
                else if (c >= 'A' && c <= 'F')
                {
                    digit = c - 'A' + 10;
                }
                else
                {
                    digit = -1;
                }

                if (digit >= 0 && digit < radix)
                {
                    value = value * radix + digit;

                    // value never exceeds ~2^31 before this check, so the long itself
                    // cannot overflow here.
                    if (value > (isNegative ? maxNegativeMagnitude : int.MaxValue))
                    {
                        return false;
                    }

                    if (state == startingHex)
                    {
                        state = hex;
                    }
                    break;
                }

                // White space ends the number, but a hex prefix needs at least one digit.
                if (state != startingHex && char.IsWhiteSpace(c))
                {
                    i = (int)(isNegative ? -value : value);
                    return true;
                }

                return false;
            }
        }
    }

    // End of input: succeed only if at least one digit was read.
    switch (state)
    {
        case dec:
        case oct:
        case hex:
            i = (int)(isNegative ? -value : value);
            return true;
        case leadingZero:
            return true; // the number is just "0"; i is already 0
        default:
            return false;
    }
}

And an xUnit test:

[Fact]
public void Parse()
{
    // Asserts that the text is accepted and yields exactly the expected integer.
    void AssertParses(string text, int expectedValue)
    {
        var succeeded = ParseUtil.TryParse(text, out var parsed);
        Assert.True(succeeded);
        Assert.Equal(expectedValue, parsed);
    }

    // Asserts that the text is rejected.
    void AssertRejected(string text)
    {
        Assert.False(ParseUtil.TryParse(text, out _));
    }

    // Decimal, hex and octal spellings of 1234, with optional trailing spaces.
    AssertParses("1234", 1234);
    AssertParses("1234 ", 1234);
    AssertParses("1234  ", 1234);
    AssertParses("0X4D2", 1234);
    AssertParses("0h4D2", 1234);
    AssertParses("0x4d2", 1234);
    AssertParses("0x4D2  ", 1234);
    AssertParses("02322", 1234);
    AssertParses("02322  ", 1234);
    AssertParses("002322  ", 1234);
    AssertParses("0002322  ", 1234);

    // The same spellings, negated.
    AssertParses("-1234", -1234);
    AssertParses("-1234 ", -1234);
    AssertParses("-1234  ", -1234);
    AssertParses("-0X4D2", -1234);
    AssertParses("-0h4D2", -1234);
    AssertParses("-0x4d2", -1234);
    AssertParses("-0x4D2  ", -1234);
    AssertParses("-02322", -1234);
    AssertParses("-02322  ", -1234);
    AssertParses("-002322  ", -1234);
    AssertParses("-0002322  ", -1234);

    // Leading white space is tolerated.
    AssertParses(" 1234", 1234);
    AssertParses(" 1234 ", 1234);
    AssertParses("  1234  ", 1234);
    AssertParses(" 0X4D2", 1234);
    AssertParses(" 0h4D2", 1234);
    AssertParses(" 0x4d2", 1234);
    AssertParses(" 0x4D2  ", 1234);
    AssertParses(" 02322", 1234);
    AssertParses(" 02322  ", 1234);
    AssertParses(" 002322  ", 1234);
    AssertParses(" 0002322  ", 1234);

    // Zero in every notation.
    AssertParses("0", 0);
    AssertParses("00", 0);
    AssertParses("000", 0);
    AssertParses("0x0", 0);
    AssertParses(" 0 ", 0);
    AssertParses(" 00 ", 0);
    AssertParses(" 000 ", 0);
    AssertParses(" 0x0 ", 0);

    // Malformed input.
    AssertRejected("Hello");
    AssertRejected("0Hello");
    AssertRejected("0xx1234");
    AssertRejected("04D2");
    AssertRejected("4D2");
    AssertRejected("098LKJ");
    AssertRejected("0x");
    AssertRejected("0x ");
    AssertRejected(" 0x ");
    AssertRejected("- 123");
    AssertRejected("--123");
}

Hope it can help someone else out.

Drew Noakes
  • 300,895
  • 165
  • 679
  • 742
  • You should have a check for the input length. `TestFails("0xfffffffff") // 9 * f, should pass because the number is too large` does not pass, because the value overflows but the method still reports success. – RedX Sep 13 '17 at 12:55