// Boost C++ Libraries
//
// "...one of the most highly regarded and expertly designed C++ library projects in the world."
// -- Herb Sutter and Andrei Alexandrescu, C++ Coding Standards
//
// boost/beast/http/detail/basic_parser.ipp

//
// Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/beast
//

#ifndef BOOST_BEAST_HTTP_DETAIL_BASIC_PARSER_IPP
#define BOOST_BEAST_HTTP_DETAIL_BASIC_PARSER_IPP

#include <boost/beast/http/detail/basic_parser.hpp>
#include <limits>

namespace boost {
namespace beast {
namespace http {
namespace detail {

// Skip leading spaces and horizontal tabs.
//
// Returns a pointer to the first character in [it, end) that is
// neither ' ' nor '\t', or `end` when the whole range is blank.
char const*
basic_parser_base::
trim_front(char const* it, char const* end)
{
    for(; it != end; ++it)
    {
        char const ch = *it;
        bool const blank = (ch == ' ' || ch == '\t');
        if(! blank)
            return it;
    }
    return end;
}

// Strip trailing spaces and horizontal tabs.
//
// `it` is the one-past-the-end position of the range and `first` is
// its beginning. Returns the new one-past-the-end position after
// removing trailing ' ' and '\t' characters; returns `first` when
// the whole range is blank.
char const*
basic_parser_base::
trim_back(
    char const* it, char const* first)
{
    for(; it != first; --it)
    {
        // inspect the character just before the current end
        char const prev = *(it - 1);
        if(! (prev == ' ' || prev == '\t'))
            break;
    }
    return it;
}

// Returns `true` if `c` may appear in a request-target.
//
// Every octet is accepted except the control characters 0x00-0x1F,
// the space character 0x20 and DEL 0x7F. Octets 0x80-0xFF are
// accepted.
bool
basic_parser_base::
is_pathchar(char c)
{
    // Work on the octet value so that the result does not depend
    // on whether plain `char` is signed.
    auto const octet = static_cast<unsigned char>(c);
    return octet > 0x20 && octet != 0x7f;
}

// Convert one hexadecimal digit to its numeric value.
//
// On success stores 0..15 in `d` and returns `true`. Otherwise
// stores 0xFF (the value of static_cast<unsigned char>(-1)) in `d`
// and returns `false`. Both upper and lower case digits are accepted.
bool
basic_parser_base::
unhex(unsigned char& d, char c)
{
    auto const invalid = static_cast<unsigned char>(-1);
    auto const octet = static_cast<unsigned char>(c);

    if(octet >= 0x30 && octet <= 0x39)          // '0'..'9'
        d = static_cast<unsigned char>(octet - 0x30);
    else if(octet >= 0x41 && octet <= 0x46)     // 'A'..'F'
        d = static_cast<unsigned char>(octet - 0x41 + 10);
    else if(octet >= 0x61 && octet <= 0x66)     // 'a'..'f'
        d = static_cast<unsigned char>(octet - 0x61 + 10);
    else
        d = invalid;

    return d != invalid;
}

//--------------------------------------------------------------------------

// Placeholder for an accelerated character-range search.
//
// This implementation performs no search at all: it ignores the
// end of the buffer and the range table, and always reports
// "not found" at the unchanged starting position `buf`.
std::pair<char const*, bool>
basic_parser_base::
find_fast(
    char const* buf,
    char const* buf_end,
    char const* ranges,
    size_t ranges_size)
{
    boost::ignore_unused(buf_end, ranges, ranges_size);
    return std::pair<char const*, bool>(buf, false);
}

// Locate the end of the current line.
//
// Searches [it, last) for the first CR. Returns a pointer just past
// the LF which follows it and clears `ec`. If no CR is present, or
// the CR is the final octet of the range, returns nullptr with `ec`
// cleared. If the CR is followed by anything other than LF, returns
// nullptr and sets `ec` to error::bad_line_ending.
//
// VFALCO Can SIMD help this?
// VFALCO Should we handle the legacy case
//        for lines terminated with a single '\n'?
char const*
basic_parser_base::
find_eol(
    char const* it, char const* last,
        error_code& ec)
{
    // advance to the first CR, if any
    while(it != last && *it != '\r')
        ++it;
    if(it == last)
    {
        ec = {};
        return nullptr;
    }

    // step over the CR; the LF may not have arrived yet
    ++it;
    if(it == last)
    {
        ec = {};
        return nullptr;
    }

    if(*it == '\n')
    {
        ec = {};
        return it + 1;
    }

    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_line_ending);
        return nullptr;
    }
}

// Parse an unsigned decimal integer.
//
// The whole of `s` must consist of decimal digits. Returns `true`
// and stores the value in `v` on success. Returns `false`, leaving
// `v` untouched, if `s` is empty, contains a non-digit, or the value
// does not fit in 64 bits.
bool
basic_parser_base::
parse_dec(
    string_view s,
    std::uint64_t& v)
{
    auto const limit = (std::numeric_limits<std::uint64_t>::max)();
    char const* cur = s.data();
    char const* const end = cur + s.size();
    if(cur == end)
        return false;

    std::uint64_t acc = 0;
    for(; cur != end; ++cur)
    {
        if(! is_digit(*cur))
            return false;
        // multiplying by ten would overflow
        if(acc > limit / 10)
            return false;
        acc *= 10;
        std::uint64_t const digit = *cur - '0';
        // adding the digit would overflow
        if(limit - acc < digit)
            return false;
        acc += digit;
    }
    v = acc;
    return true;
}

// Parse an unsigned hexadecimal integer.
//
// Consumes hex digits starting at `it` until a non-hex character is
// read. There is no end pointer: the character which stops the scan
// is dereferenced, so the input must contain one.
//
// Returns `true`, stores the value in `v`, and leaves `it` at the
// first non-hex character on success. Returns `false` without
// modifying `v` if `*it` is not a hex digit (`it` unchanged) or if
// the value does not fit in 64 bits (`it` left at the digit which
// caused the overflow).
bool
basic_parser_base::
parse_hex(char const*& it, std::uint64_t& v)
{
    auto const limit = (std::numeric_limits<std::uint64_t>::max)();
    unsigned char digit;
    if(! unhex(digit, *it))
        return false;

    std::uint64_t acc = 0;
    for(;;)
    {
        // shifting in another nibble would overflow
        if(acc > limit / 16)
            return false;
        acc *= 16;
        if(limit - acc < digit)
            return false;
        acc += digit;

        ++it;
        if(! unhex(digit, *it))
            break;
    }
    v = acc;
    return true;
}

// Find the end of the message header.
//
// Searches [p, last) for the four octet sequence CR LF CR LF and
// returns a pointer just past it, or nullptr if the sequence is not
// present in the range.
//
// The window p[0..3] is inspected from its last octet backwards so
// that several octets can be skipped at once when a match at the
// current position (and at some of the following positions) is
// impossible.
char const*
basic_parser_base::
find_eom(char const* p, char const* last)
{
    // Compare distances instead of forming `p + 4`: that pointer
    // would lie beyond one-past-the-end of the buffer whenever fewer
    // than four octets remain, which is undefined behavior.
    while(last - p >= 4)
    {
        if(p[3] != '\n')
        {
            // A CR at p[3] may be the 1st or 3rd octet of a match
            // starting at p+3 or p+1, so only advance by one.
            // Any other octet cannot be part of a match at all.
            if(p[3] == '\r')
                ++p;
            else
                p += 4;
        }
        else if(p[2] != '\r')
        {
            // LF not preceded by CR: no match can include p[3]
            p += 4;
        }
        else if(p[1] != '\n')
        {
            // p[2..3] is CRLF; a match could start at p+2
            p += 2;
        }
        else if(p[0] != '\r')
        {
            // p[1..3] is LF CR LF; a match could start at p+2
            p += 2;
        }
        else
        {
            return p + 4;
        }
    }
    return nullptr;
}

//--------------------------------------------------------------------------

// Scan text up to and including the terminating CRLF.
//
// Starting at `p`, skips every octet which is not a control
// character. HTAB and octets 0x80-0xFF are skipped as well. The
// first control character found (0x00-0x1F except HTAB, or 0x7F)
// must begin a CRLF.
//
// Outcomes:
//   - CRLF found: `token_last` is set to the CR and the position
//     just past the LF is returned; `ec` is not modified.
//   - input exhausted while scanning: `ec` = error::need_more and
//     the current position is returned.
//   - CR is the last available octet: `ec` = error::need_more and
//     `last` is returned.
//   - CR followed by something other than LF:
//     `ec` = error::bad_line_ending and `last` is returned.
//   - any other control character: nullptr is returned and `ec`
//     is not modified.
//
// A bare LF is not accepted as a line terminator.
char const*
basic_parser_base::
parse_token_to_eol(
    char const* p,
    char const* last,
    char const*& token_last,
    error_code& ec)
{
    for(;;)
    {
        if(p >= last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return p;
        }
        if(BOOST_UNLIKELY(! is_print(*p)))
        {
            auto const octet = static_cast<unsigned char>(*p);
            bool const ctl_not_tab = octet < 0x20 && octet != 0x09;
            if(ctl_not_tab || octet == 0x7f)
                break;
        }
        ++p;
    }

    if(BOOST_UNLIKELY(*p != '\r'))
    {
        // invalid character
        return nullptr;
    }

    ++p; // skip CR
    if(p >= last)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
        return last;
    }
    if(*p != '\n')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_line_ending);
        return last;
    }
    ++p; // skip LF
    token_last = p - 2;
    return p;
}

// Consume a CRLF at `it`.
//
// Returns `true` and advances `it` by two if it[0..1] is CR LF,
// otherwise returns `false` and leaves `it` unchanged. There is no
// bounds check: it[0] is always read, and it[1] is read when it[0]
// is CR.
bool
basic_parser_base::
parse_crlf(char const*& it)
{
    bool const matched = it[0] == '\r' && it[1] == '\n';
    if(matched)
        it += 2;
    return matched;
}

// Parse the request method: token SP
//
// On success `result` refers to the token, `it` is advanced past the
// single space which follows it, and `ec` is not modified.
//
// Errors:
//   error::need_more   input ended before the delimiter was seen
//   error::bad_method  the token is empty, or it is terminated by
//                      something other than a space
void
basic_parser_base::
parse_method(
    char const*& it, char const* last,
    string_view& result, error_code& ec)
{
    auto const first = it;
    for(;; ++it)
    {
        // `it >= last` is the same test as `it + 1 > last` but does
        // not form a pointer beyond one-past-the-end of the buffer.
        if(it >= last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        if(! detail::is_token_char(*it))
            break;
    }
    // The loop only exits through `break`, so `it < last` holds here
    // and `*it` is readable; no second bounds check is needed.
    if(*it != ' ')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_method);
        return;
    }
    if(it == first)
    {
        // cannot be empty
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_method);
        return;
    }
    result = make_string(first, it++);
}

// Parse the request target: 1*pathchar SP
//
// On success `result` refers to the target, `it` is advanced past
// the single space which follows it, and `ec` is not modified.
//
// Errors:
//   error::need_more   input ended before the delimiter was seen
//   error::bad_target  the target is empty, or it is terminated by
//                      something other than a space
void
basic_parser_base::
parse_target(
    char const*& it, char const* last,
    string_view& result, error_code& ec)
{
    auto const first = it;
    for(;; ++it)
    {
        // `it >= last` is the same test as `it + 1 > last` but does
        // not form a pointer beyond one-past-the-end of the buffer.
        if(it >= last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        if(! is_pathchar(*it))
            break;
    }
    // The loop only exits through `break`, so `it < last` holds here
    // and `*it` is readable; no second bounds check is needed.
    if(*it != ' ')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_target);
        return;
    }
    if(it == first)
    {
        // cannot be empty
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_target);
        return;
    }
    result = make_string(first, it++);
}

// Parse the HTTP version: "HTTP/" DIGIT "." DIGIT
//
// On success `result` is 10 * major + minor (11 for "HTTP/1.1"),
// `it` is advanced past the eight octets of the version, and `ec`
// is not modified.
//
// Errors:
//   error::need_more    fewer than eight octets are available
//   error::bad_version  the input does not match the grammar
//
// On error::bad_version `it` and `result` may have been partially
// updated.
void
basic_parser_base::
parse_version(
    char const*& it, char const* last,
    int& result, error_code& ec)
{
    // Compare distances instead of forming `it + 8`: that pointer
    // would lie beyond one-past-the-end of the buffer whenever fewer
    // than eight octets remain, which is undefined behavior.
    if(last - it < 8)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
        return;
    }
    // literal prefix
    char const prefix[] = "HTTP/";
    for(char const* want = prefix; *want != '\0'; ++want)
    {
        if(*it++ != *want)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::bad_version);
            return;
        }
    }
    // major version digit
    if(! is_digit(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_version);
        return;
    }
    result = 10 * (*it++ - '0');
    if(*it++ != '.')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_version);
        return;
    }
    // minor version digit
    if(! is_digit(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_version);
        return;
    }
    result += *it++ - '0';
}

// Parse the status code: 3DIGIT SP
//
// On success `result` holds the three digit code, `it` is advanced
// past the space which follows it, and `ec` is not modified.
//
// Errors:
//   error::need_more   fewer than four octets are available
//   error::bad_status  a non-digit was found where a digit is
//                      required, or the code is not followed by a
//                      space
//
// On error::bad_status `it` and `result` may have been partially
// updated.
void
basic_parser_base::
parse_status(
    char const*& it, char const* last,
    unsigned short& result, error_code& ec)
{
    // Compare distances instead of forming `it + 4`: that pointer
    // would lie beyond one-past-the-end of the buffer whenever fewer
    // than four octets remain, which is undefined behavior.
    if(last - it < 4)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
        return;
    }
    // hundreds
    if(! is_digit(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_status);
        return;
    }
    result = 100 * (*it++ - '0');
    // tens
    if(! is_digit(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_status);
        return;
    }
    result += 10 * (*it++ - '0');
    // ones
    if(! is_digit(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_status);
        return;
    }
    result += *it++ - '0';
    // delimiter
    if(*it++ != ' ')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_status);
        return;
    }
}

// Parse the reason phrase, which runs to the end of the line.
//
// On success `result` refers to the text before the CRLF and `it`
// is advanced past the CRLF. If parse_token_to_eol() sets `ec`, the
// function returns with that error and leaves `it` and `result`
// unchanged. If it reports an invalid character instead, `ec` is
// set to error::bad_reason.
void
basic_parser_base::
parse_reason(
    char const*& it, char const* last,
    string_view& result, error_code& ec)
{
    char const* const begin = it;
    char const* end = nullptr;
    char const* const next =
        parse_token_to_eol(begin, last, end, ec);
    if(ec)
        return;
    if(next == nullptr)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_reason);
        return;
    }
    result = make_string(begin, end);
    it = next;
}

// Parse one header field line, including any obs-fold continuations.
//
// On success `name` refers to the field name inside the input, `p`
// is positioned just past the CRLF which ends the field, and `value`
// is the field value with leading and trailing SP / HTAB removed.
// When the field is not folded `value` refers to the input; when it
// is folded the pieces are joined with a single space inside `buf`
// and `value` refers to `buf`.
//
// One octet past the final CRLF must be available, because it is
// inspected to find out whether the next line continues this field.
//
// Errors:
//   error::need_more     the input ended before the field was complete
//   error::bad_field     the name is empty or contains a non-token octet
//   error::bad_value     the value contains an invalid control character
//   error::header_limit  the folded value does not fit in `buf`
//   plus anything reported by parse_token_to_eol()
void
basic_parser_base::
parse_field(
    char const*& p,
    char const* last,
    string_view& name,
    string_view& value,
    beast::detail::char_buffer<max_obs_fold>& buf,
    error_code& ec)
{
/*  header-field    = field-name ":" OWS field-value OWS

    field-name      = token
    field-value     = *( field-content / obs-fold )
    field-content   = field-vchar [ 1*( SP / HTAB ) field-vchar ]
    field-vchar     = VCHAR / obs-text

    obs-fold        = CRLF 1*( SP / HTAB )
                    ; obsolete line folding
                    ; see Section 3.2.4

    token           = 1*<any CHAR except CTLs or separators>
    CHAR            = <any US-ASCII character (octets 0 - 127)>
    sep             = "(" | ")" | "<" | ">" | "@"
                    | "," | ";" | ":" | "\" | <">
                    | "/" | "[" | "]" | "?" | "="
                    | "{" | "}" | SP | HT
*/
    // Lookup table indexed by octet value: non-zero for octets
    // which may appear in a token. Each string piece covers 32 octets.
    static char const* is_token =
        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
        "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0"
        "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1"
        "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0"
        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
        "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";

    // name
    // Pairs of inclusive [low, high] octet ranges handed to find_fast()
    BOOST_ALIGNMENT(16) static const char ranges1[] =
        "\x00 "  /* control chars and up to SP */
        "\"\""   /* 0x22 */
        "()"     /* 0x28,0x29 */
        ",,"     /* 0x2c */
        "//"     /* 0x2f */
        ":@"     /* 0x3a-0x40 */
        "[]"     /* 0x5b-0x5d */
        "{\377"; /* 0x7b-0xff */
    auto first = p;
    bool found;
    std::tie(p, found) = find_fast(
        p, last, ranges1, sizeof(ranges1)-1);
    if(! found && p >= last)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
        return;
    }
    // Scan octet by octet from wherever find_fast() stopped until
    // the ':' which ends the name.
    for(;;)
    {
        if(*p == ':')
            break;
        if(! is_token[static_cast<
            unsigned char>(*p)])
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::bad_field);
            return;
        }
        ++p;
        if(p >= last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
    }
    if(p == first)
    {
        // empty name
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_field);
        return;
    }
    name = make_string(first, p);
    ++p; // eat ':'
    char const* token_last = nullptr;
    // First phase: no folding seen yet. Lines which are empty after
    // trimming are skipped, so this loop runs until the first line
    // that carries content. If that line is also the last one, the
    // value is returned as a view into the input.
    for(;;)
    {
        // eat leading ' ' and '\t'
        for(;;++p)
        {
            if(p + 1 > last)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                return;
            }
            if(! (*p == ' ' || *p == '\t'))
                break;
        }
        // parse to CRLF
        first = p;
        p = parse_token_to_eol(p, last, token_last, ec);
        if(ec)
            return;
        if(! p)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::bad_value);
            return;
        }
        // Look 1 char past the CRLF to handle obs-fold.
        if(p + 1 > last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        token_last =
            trim_back(token_last, first);
        if(*p != ' ' && *p != '\t')
        {
            // next line does not continue this field: done
            value = make_string(first, token_last);
            return;
        }
        ++p;
        // A non-empty line followed by a continuation: switch to
        // the buffered phase below.
        if(token_last != first)
            break;
    }
    // Second phase: the value is folded. Copy the first piece into
    // `buf`, then append each further non-empty piece.
    buf.clear();
    if (!buf.try_append(first, token_last))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::header_limit);
        return;
    }

    BOOST_ASSERT(! buf.empty());
    for(;;)
    {
        // eat leading ' ' and '\t'
        for(;;++p)
        {
            if(p + 1 > last)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                return;
            }
            if(! (*p == ' ' || *p == '\t'))
                break;
        }
        // parse to CRLF
        first = p;
        p = parse_token_to_eol(p, last, token_last, ec);
        if(ec)
            return;
        if(! p)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::bad_value);
            return;
        }
        // Look 1 char past the CRLF to handle obs-fold.
        if(p + 1 > last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        token_last = trim_back(token_last, first);
        if(first != token_last)
        {
            // join pieces with exactly one space
            if (!buf.try_push_back(' ') ||
                !buf.try_append(first, token_last))
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::header_limit);
                return;
            }
        }
        if(*p != ' ' && *p != '\t')
        {
            // no further continuation: the value lives in `buf`
            value = {buf.data(), buf.size()};
            return;
        }
        ++p;
    }
}


// Validate and skip over the chunk extensions which follow a chunk size.
//
// Parsing starts at `it`. The function returns without touching `ec`
// as soon as the current octet cannot begin (or continue) an
// extension; `it` is then left on that octet, which the caller is
// expected to check. The extensions themselves are only validated,
// not stored.
//
// Errors:
//   error::need_more            the input ended inside the extensions
//   error::bad_chunk_extension  the input does not match the grammar
//
// Inside a quoted-string a backslash causes the following octet to
// be skipped; the remaining octets are not validated individually.
//
// Every error is assigned through BOOST_BEAST_ASSIGN_EC so that all
// failures in this function are reported the same way as in the
// other parsing functions of this file.
void
basic_parser_base::
parse_chunk_extensions(
    char const*& it,
    char const* last,
    error_code& ec)
{
/*
    chunk-ext       = *( BWS  ";" BWS chunk-ext-name [ BWS  "=" BWS chunk-ext-val ] )
    BWS             = *( SP / HTAB ) ; "Bad White Space"
    chunk-ext-name  = token
    chunk-ext-val   = token / quoted-string
    token           = 1*tchar
    quoted-string   = DQUOTE *( qdtext / quoted-pair ) DQUOTE
    qdtext          = HTAB / SP / "!" / %x23-5B ; '#'-'[' / %x5D-7E ; ']'-'~' / obs-text
    quoted-pair     = "\" ( HTAB / SP / VCHAR / obs-text )
    obs-text        = %x80-FF

    https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4667
*/
loop:
    if(it == last)
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
        return;
    }
    // Anything other than BWS or ';' ends the extension list
    if(*it != ' ' && *it != '\t' && *it != ';')
        return;
    // BWS
    if(*it == ' ' || *it == '\t')
    {
        for(;;)
        {
            ++it;
            if(it == last)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                return;
            }
            if(*it != ' ' && *it != '\t')
                break;
        }
    }
    // ';'
    if(*it != ';')
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_chunk_extension);
        return;
    }
semi:
    ++it; // skip ';'
    // BWS
    for(;;)
    {
        if(it == last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        if(*it != ' ' && *it != '\t')
            break;
        ++it;
    }
    // chunk-ext-name
    if(! detail::is_token_char(*it))
    {
        BOOST_BEAST_ASSIGN_EC(ec, error::bad_chunk_extension);
        return;
    }
    for(;;)
    {
        ++it;
        if(it == last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        if(! detail::is_token_char(*it))
            break;
    }
    // BWS [ ";" / "=" ]
    {
        bool bws;
        if(*it == ' ' || *it == '\t')
        {
            for(;;)
            {
                ++it;
                if(it == last)
                {
                    BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                    return;
                }
                if(*it != ' ' && *it != '\t')
                    break;
            }
            bws = true;
        }
        else
        {
            bws = false;
        }
        if(*it == ';')
            goto semi;
        if(*it != '=')
        {
            // A name without a value ends the list here. Whitespace
            // after the name is only valid when ';' or '=' follows.
            if(bws)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::bad_chunk_extension);
            }
            return;
        }
        ++it; // skip '='
    }
    // BWS
    for(;;)
    {
        if(it == last)
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
            return;
        }
        if(*it != ' ' && *it != '\t')
            break;
        ++it;
    }
    // chunk-ext-val
    if(*it != '"')
    {
        // token
        if(! detail::is_token_char(*it))
        {
            BOOST_BEAST_ASSIGN_EC(ec, error::bad_chunk_extension);
            return;
        }
        for(;;)
        {
            ++it;
            if(it == last)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                return;
            }
            if(! detail::is_token_char(*it))
                break;
        }
    }
    else
    {
        // quoted-string
        for(;;)
        {
            ++it;
            if(it == last)
            {
                BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                return;
            }
            if(*it == '"')
                break;
            if(*it == '\\')
            {
                // quoted-pair: skip the escaped octet
                ++it;
                if(it == last)
                {
                    BOOST_BEAST_ASSIGN_EC(ec, error::need_more);
                    return;
                }
            }
        }
        ++it; // skip closing DQUOTE
    }
    goto loop;
}

} // detail
} // http
} // beast
} // boost

#endif