[squid-dev] [PATCH] Parser-NG: Http1::Tokenizer

Amos Jeffries squid3 at treenet.co.nz
Tue Jun 9 14:50:38 UTC 2015


This adds a class Http1::Tokenizer, which inherits from
::Parser::Tokenizer and provides additional HTTP-specific token parsing
methods.

At present it adds parsing for the quoted-string, 1#( token / quoted-string ),
and qdtext constructs from RFC 7230 and RFC 1945.
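
As an illustration only (not part of the patch), here is a minimal sketch
of how a caller might drive the new quotedStringOrToken() method to read a
( token / quoted-string ) value, in the same style as the TeChunkedParser
changes below. The parseExtValue() helper name is made up:

    #include "http/one/Tokenizer.h"
    #include "http/one/forward.h"
    #include "SBuf.h"

    // read one extension value which may be a token or a quoted-string
    static bool
    parseExtValue(SBuf &input, SBuf &value)
    {
        Http1::Tokenizer tok(input);
        // returns false without consuming anything on incomplete or
        // invalid input; quoted-pair sequences are un-escaped on output
        if (!tok.quotedStringOrToken(value))
            return false;
        input = tok.remaining(); // parse checkpoint, as the patched callers do
        return true;
    }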

It also copes with the charset and quoted-pair escaping differences in
qdtext between RFC 1945 and RFC 7230; the un-escaped form of the token is
returned.
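
To make that concrete, a second hedged sketch (also not in the patch) of
the http1p0 flag; demoQuoting() and the literal inputs are invented for
illustration:

    #include "http/one/Tokenizer.h"
    #include "http/one/forward.h"
    #include "SBuf.h"

    static void
    demoQuoting()
    {
        // RFC 7230 (default): quoted-pair is recognised and un-escaped
        SBuf in11("\"one \\\"two\\\" three\"");
        SBuf v11;
        Http1::Tokenizer t11(in11);
        t11.quotedString(v11);       // v11 becomes: one "two" three

        // RFC 1945 (http1p0 == true): no quoted-pair escaping, but qdtext
        // may include CR and LF as part of LWS
        SBuf in10("\"line one\r\n line two\"");
        SBuf v10;
        Http1::Tokenizer t10(in10);
        t10.quotedString(v10, true); // v10 keeps the raw text between DQUOTEs
    }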

Amos
-------------- next part --------------
=== modified file 'src/http/one/Makefile.am'
--- src/http/one/Makefile.am	2015-06-01 21:41:37 +0000
+++ src/http/one/Makefile.am	2015-06-09 01:57:52 +0000
@@ -2,21 +2,23 @@
 ##
 ## Squid software is distributed under GPLv2+ license and includes
 ## contributions from numerous individuals and organizations.
 ## Please see the COPYING and CONTRIBUTORS files for details.
 ##
 
 include $(top_srcdir)/src/Common.am
 include $(top_srcdir)/src/TestHeaders.am
 
 noinst_LTLIBRARIES = libhttp1.la
 
 libhttp1_la_SOURCES = \
 	forward.h \
 	Parser.cc \
 	Parser.h \
 	RequestParser.cc \
 	RequestParser.h \
 	ResponseParser.cc \
 	ResponseParser.h \
 	TeChunkedParser.cc \
-	TeChunkedParser.h
+	TeChunkedParser.h \
+	Tokenizer.cc \
+	Tokenizer.h

=== modified file 'src/http/one/Parser.cc'
--- src/http/one/Parser.cc	2015-03-05 10:19:47 +0000
+++ src/http/one/Parser.cc	2015-04-10 09:05:02 +0000
@@ -1,49 +1,49 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #include "squid.h"
 #include "Debug.h"
 #include "http/one/Parser.h"
+#include "http/one/Tokenizer.h"
 #include "mime_header.h"
-#include "parser/Tokenizer.h"
 #include "SquidConfig.h"
 
 /// RFC 7230 section 2.6 - 7 magic octets
 const SBuf Http::One::Parser::Http1magic("HTTP/1.");
 
 void
 Http::One::Parser::clear()
 {
     parsingStage_ = HTTP_PARSE_NONE;
     buf_ = NULL;
     msgProtocol_ = AnyP::ProtocolVersion();
     mimeHeaderBlock_.clear();
 }
 
 bool
-Http::One::Parser::skipLineTerminator(::Parser::Tokenizer &tok) const
+Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
 {
     static const SBuf crlf("\r\n");
     if (tok.skip(crlf))
         return true;
 
     if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
         return true;
 
     return false;
 }
 
 bool
 Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
 {
     // MIME headers block exist in (only) HTTP/1.x and ICY
     const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) ||
                             msgProtocol_.protocol == AnyP::PROTO_ICY;
 
     if (expectMime) {
         /* NOTE: HTTP/0.9 messages do not have a mime header block.
@@ -85,59 +85,59 @@
 }
 
 // arbitrary maximum-length for headers which can be found by Http1Parser::getHeaderField()
 #define GET_HDR_SZ  1024
 
 // BUG: returns only the first header line with given name,
 //      ignores multi-line headers and obs-fold headers
 char *
 Http::One::Parser::getHeaderField(const char *name)
 {
     if (!headerBlockSize() || !name)
         return NULL;
 
     LOCAL_ARRAY(char, header, GET_HDR_SZ);
     const int namelen = strlen(name);
 
     debugs(25, 5, "looking for " << name);
 
     // while we can find more LF in the SBuf
     static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF);
-    ::Parser::Tokenizer tok(mimeHeaderBlock_);
+    Http1::Tokenizer tok(mimeHeaderBlock_);
     SBuf p;
     static const SBuf crlf("\r\n");
 
     while (tok.prefix(p, iso8859Line)) {
         if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
             break; // error. reached invalid octet or end of buffer instead of an LF ??
 
         // header lines must start with the name (case insensitive)
         if (p.substr(0, namelen).caseCmp(name, namelen))
             continue;
 
         // then a COLON
         if (p[namelen] != ':')
             continue;
 
         // drop any trailing *CR sequence
         p.trim(crlf, false, true);
 
         debugs(25, 5, "checking " << p);
         p.consume(namelen + 1);
 
         // TODO: optimize SBuf::trim to take CharacterSet directly
-        ::Parser::Tokenizer t(p);
+        Http1::Tokenizer t(p);
         t.skipAll(CharacterSet::WSP);
         p = t.remaining();
 
         // prevent buffer overrun on char header[];
         p.chop(0, sizeof(header)-1);
 
         // return the header field-value
         xstrncpy(header, p.rawContent(), p.length()+1);
         debugs(25, 5, "returning " << header);
         return header;
     }
 
     return NULL;
 }
 

=== modified file 'src/http/one/Parser.h'
--- src/http/one/Parser.h	2015-03-29 14:11:36 +0000
+++ src/http/one/Parser.h	2015-06-09 01:57:21 +0000
@@ -1,40 +1,36 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #ifndef _SQUID_SRC_HTTP_ONE_PARSER_H
 #define _SQUID_SRC_HTTP_ONE_PARSER_H
 
 #include "anyp/ProtocolVersion.h"
 #include "http/one/forward.h"
 #include "http/StatusCode.h"
 #include "SBuf.h"
 
-namespace Parser {
-class Tokenizer;
-}
-
 namespace Http {
 namespace One {
 
 // Parser states
 enum ParseState {
     HTTP_PARSE_NONE,      ///< initialized, but nothing usefully parsed yet
     HTTP_PARSE_FIRST,     ///< HTTP/1 message first-line
     HTTP_PARSE_CHUNK_SZ,  ///< HTTP/1.1 chunked encoding chunk-size
     HTTP_PARSE_CHUNK_EXT, ///< HTTP/1.1 chunked encoding chunk-ext
     HTTP_PARSE_CHUNK,     ///< HTTP/1.1 chunked encoding chunk-data
     HTTP_PARSE_MIME,      ///< HTTP/1 mime-header block
     HTTP_PARSE_DONE       ///< parsed a message header, or reached a terminal syntax error
 };
 
 /** HTTP/1.x protocol parser
  *
  * Works on a raw character I/O buffer and tokenizes the content into
  * the major CRLF delimited segments of an HTTP/1 protocol message:
  *
  * \item first-line (request-line / simple-request / status-line)
@@ -91,41 +87,41 @@
      * \return A pointer to a field-value of the first matching field-name, or NULL.
      */
     char *getHeaderField(const char *name);
 
     /// the remaining unprocessed section of buffer
     const SBuf &remaining() const {return buf_;}
 
     /**
      * HTTP status code resulting from the parse process.
      * to be used for invalid message handling.
      *
      * Http::scNone indicates incomplete parse,
      * Http::scOkay indicates no error,
      * other codes represent a parse error.
      */
     Http::StatusCode parseStatusCode;
 
 protected:
     /// detect and skip the CRLF or (if tolerant) LF line terminator
     /// consume from the tokenizer and return true only if found
-    bool skipLineTerminator(::Parser::Tokenizer &tok) const;
+    bool skipLineTerminator(Http1::Tokenizer &tok) const;
 
     /**
      * Scan to find the mime headers block for current message.
      *
      * \retval true   If mime block (or a blocks non-existence) has been
      *                identified accurately within limit characters.
      *                mimeHeaderBlock_ has been updated and buf_ consumed.
      *
      * \retval false  An error occurred, or no mime terminator found within limit.
      */
     bool grabMimeBlock(const char *which, const size_t limit);
 
     /// RFC 7230 section 2.6 - 7 magic octets
     static const SBuf Http1magic;
 
     /// bytes remaining to be parsed
     SBuf buf_;
 
     /// what stage the parser is currently up to
     ParseState parsingStage_;

=== modified file 'src/http/one/RequestParser.cc'
--- src/http/one/RequestParser.cc	2015-02-20 03:25:12 +0000
+++ src/http/one/RequestParser.cc	2015-04-10 09:05:05 +0000
@@ -1,33 +1,33 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #include "squid.h"
 #include "Debug.h"
 #include "http/one/RequestParser.h"
+#include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
-#include "parser/Tokenizer.h"
 #include "profiler/Profiler.h"
 #include "SquidConfig.h"
 
 Http::One::RequestParser::RequestParser() :
     Parser(),
     firstLineGarbage_(0)
 {}
 
 Http1::Parser::size_type
 Http::One::RequestParser::firstLineSize() const
 {
     // RFC 7230 section 2.6
     /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
     return method_.image().length() + uri_.length() + 12;
 }
 
 /**
  * Attempt to parse the first line of a new request message.
  *
  * Governed by RFC 7230 section 3.5
@@ -55,41 +55,41 @@
         }
     }
 }
 
 /**
  * Attempt to parse the method field out of an HTTP message request-line.
  *
  * Governed by:
  *  RFC 1945 section 5.1
  *  RFC 7230 section 2.6, 3.1 and 3.5
  *
  * Parsing state is stored between calls. The current implementation uses
  * checkpoints after each successful request-line field.
  * The return value tells you whether the parsing is completed or not.
  *
  * \retval -1  an error occurred. parseStatusCode indicates HTTP status result.
  * \retval  1  successful parse. method_ is filled and buffer consumed including first delimiter.
  * \retval  0  more data is needed to complete the parse
  */
 int
-Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
 {
     // scan for up to 16 valid method characters.
     static const size_t maxMethodLength = 16; // TODO: make this configurable?
 
     // method field is a sequence of TCHAR.
     SBuf methodFound;
     if (tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength) && tok.skipOne(WspDelim)) {
 
         method_ = HttpRequestMethod(methodFound);
         buf_ = tok.remaining(); // incremental parse checkpoint
         return 1;
 
     } else if (tok.atEnd()) {
         debugs(74, 5, "Parser needs more data to find method");
         return 0;
 
     } // else error(s)
 
     // non-delimiter found after accepted method bytes means ...
     if (methodFound.length() == maxMethodLength) {
@@ -115,41 +115,41 @@
      * "
      *   A URI is composed from a limited set of characters consisting of
      *   digits, letters, and a few graphic symbols.
      * "
      */
     // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
     UriChars.add('%');
     UriChars += CharacterSet::HEXDIG;
     // RFC 3986 section 2.2 - reserved characters
     UriChars += CharacterSet("gen-delims", ":/?#[]@");
     UriChars += CharacterSet("sub-delims", "!$&'()*+,;=");
     // RFC 3986 section 2.3 - unreserved characters
     UriChars += CharacterSet::ALPHA;
     UriChars += CharacterSet::DIGIT;
     UriChars += CharacterSet("unreserved", "-._~");
 
     return UriChars;
 }
 
 int
-Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok)
+Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
 {
     // URI field is a sequence of ... what? segments all have different valid charset
     // go with non-whitespace non-binary characters for now
     static CharacterSet UriChars = uriValidCharacters();
 
     /* Arbitrary 64KB URI upper length limit.
      *
      * Not quite as arbitrary as it seems though. Old SquidString objects
      * cannot store strings larger than 64KB, so we must limit until they
      * have all been replaced with SBuf.
      *
      * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
      * at least 8000 octets for the whole line, including method and version.
      */
     const size_t maxUriLength = min(static_cast<size_t>(Config.maxRequestHeaderSize) - firstLineSize(),
                                     static_cast<size_t>((64*1024)-1));
 
     SBuf uriFound;
 
     // RFC 7230 HTTP/1.x URI are followed by at least one whitespace delimiter
@@ -170,41 +170,41 @@
     } else if (tok.atEnd()) {
         debugs(74, 5, "Parser needs more data to find URI");
         return 0;
     }
 
     // else errors...
 
     if (uriFound.length() == maxUriLength) {
         // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
         parseStatusCode = Http::scUriTooLong;
         debugs(33, 5, "invalid request-line. URI longer than " << maxUriLength << " bytes");
     } else {
         // RFC 7230 section 3.1.1 required (SHOULD) 400 response
         parseStatusCode = Http::scBadRequest;
         debugs(33, 5, "invalid request-line. missing URI delimiter");
     }
     return -1;
 }
 
 int
-Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok)
+Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
 {
     // partial match of HTTP/1 magic prefix
     if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) {
         debugs(74, 5, "Parser needs more data to find version");
         return 0;
     }
 
     if (!tok.skip(Http1magic)) {
         debugs(74, 5, "invalid request-line. not HTTP/1 protocol");
         parseStatusCode = Http::scHttpVersionNotSupported;
         return -1;
     }
 
     if (tok.atEnd()) {
         debugs(74, 5, "Parser needs more data to find version");
         return 0;
     }
 
     // get the version minor DIGIT
     SBuf digit;
@@ -229,41 +229,41 @@
 }
 
 /**
  * Attempt to parse the first line of a new request message.
  *
  * Governed by:
  *  RFC 1945 section 5.1
  *  RFC 7230 section 2.6, 3.1 and 3.5
  *
  * Parsing state is stored between calls. The current implementation uses
  * checkpoints after each successful request-line field.
  * The return value tells you whether the parsing is completed or not.
  *
  * \retval -1  an error occurred. parseStatusCode indicates HTTP status result.
  * \retval  1  successful parse. member fields contain the request-line items
  * \retval  0  more data is needed to complete the parse
  */
 int
 Http::One::RequestParser::parseRequestFirstLine()
 {
-    ::Parser::Tokenizer tok(buf_);
+    Http1::Tokenizer tok(buf_);
 
     debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
     debugs(74, DBG_DATA, buf_);
 
     // NP: would be static, except it needs to change with reconfigure
     CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
 
     if (Config.onoff.relaxed_header_parser) {
         // RFC 7230 section 3.5
         // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
         // as whitespace between request-line fields
         WspDelim += CharacterSet::HTAB
                     + CharacterSet("VT,FF","\x0B\x0C")
                     + CharacterSet::CR;
     }
 
     // only search for method if we have not yet found one
     if (method_ == Http::METHOD_NONE) {
         const int res = parseMethodField(tok, WspDelim);
         if (res < 1)
@@ -280,41 +280,41 @@
         }
     }
     if (tok.atEnd()) {
         debugs(74, 5, "Parser needs more data");
         return 0;
     }
 
     // from here on, we have two possible parse paths: whitespace tolerant, and strict
     if (Config.onoff.relaxed_header_parser) {
         // whitespace tolerant
 
         // NOTES:
         // * this would be static, except WspDelim changes with reconfigure
         // * HTTP-version charset is included by uriValidCharacters()
         // * terminal CR is included by WspDelim here in relaxed parsing
         CharacterSet LfDelim = uriValidCharacters() + WspDelim;
 
         // seek the LF character, then tokenize the line in reverse
         SBuf line;
         if (tok.prefix(line, LfDelim) && tok.skip('\n')) {
-            ::Parser::Tokenizer rTok(line);
+            Http1::Tokenizer rTok(line);
             SBuf nil;
             (void)rTok.suffix(nil,CharacterSet::CR); // optional CR in terminator
             SBuf digit;
             if (rTok.suffix(digit,CharacterSet::DIGIT) && rTok.skipSuffix(Http1magic) && rTok.suffix(nil,WspDelim)) {
                 uri_ = rTok.remaining();
                 msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
                 if (uri_.isEmpty()) {
                     debugs(33, 5, "invalid request-line. missing URL");
                     parseStatusCode = Http::scBadRequest;
                     return -1;
                 }
 
                 parseStatusCode = Http::scOkay;
                 buf_ = tok.remaining(); // incremental parse checkpoint
                 return 1;
 
             } else if (method_ == Http::METHOD_GET) {
                 // RFC 1945 - for GET the line terminator may follow URL instead of a delimiter
                 debugs(33, 5, "HTTP/0.9 syntax request-line detected");
                 msgProtocol_ = Http::ProtocolVersion(0,9);

=== modified file 'src/http/one/RequestParser.h'
--- src/http/one/RequestParser.h	2015-02-20 03:25:12 +0000
+++ src/http/one/RequestParser.h	2015-04-10 09:05:06 +0000
@@ -30,40 +30,40 @@
 class RequestParser : public Http1::Parser
 {
 public:
     RequestParser();
     virtual ~RequestParser() {}
 
     /* Http::One::Parser API */
     virtual void clear() {*this = RequestParser();}
     virtual Http1::Parser::size_type firstLineSize() const;
     virtual bool parse(const SBuf &aBuf);
 
     /// the HTTP method if this is a request message
     const HttpRequestMethod & method() const {return method_;}
 
     /// the request-line URI if this is a request message, or an empty string.
     const SBuf &requestUri() const {return uri_;}
 
 private:
     void skipGarbageLines();
     int parseRequestFirstLine();
-    int parseMethodField(::Parser::Tokenizer &, const CharacterSet &);
-    int parseUriField(::Parser::Tokenizer &);
-    int parseHttpVersionField(::Parser::Tokenizer &);
+    int parseMethodField(Http1::Tokenizer &, const CharacterSet &);
+    int parseUriField(Http1::Tokenizer &);
+    int parseHttpVersionField(Http1::Tokenizer &);
 
     /// what request method has been found on the first line
     HttpRequestMethod method_;
 
     /// raw copy of the original client request-line URI field
     SBuf uri_;
 
     /// amount of garbage bytes tolerantly skipped inside the request-line
     /// may be -1 if sender only omitted CR on terminator
     int64_t firstLineGarbage_;
 };
 
 } // namespace One
 } // namespace Http
 
 #endif /*  _SQUID_SRC_HTTP_ONE_REQUESTPARSER_H */
 

=== modified file 'src/http/one/ResponseParser.cc'
--- src/http/one/ResponseParser.cc	2015-03-01 08:37:07 +0000
+++ src/http/one/ResponseParser.cc	2015-04-10 09:05:08 +0000
@@ -1,70 +1,70 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #include "squid.h"
 #include "Debug.h"
 #include "http/one/ResponseParser.h"
+#include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
-#include "parser/Tokenizer.h"
 #include "profiler/Profiler.h"
 #include "SquidConfig.h"
 
 const SBuf Http::One::ResponseParser::IcyMagic("ICY ");
 
 Http1::Parser::size_type
 Http::One::ResponseParser::firstLineSize() const
 {
     Http1::Parser::size_type result = 0;
 
     switch (msgProtocol_.protocol)
     {
     case AnyP::PROTO_HTTP:
         result += Http1magic.length();
         break;
     case AnyP::PROTO_ICY:
         result += IcyMagic.length();
         break;
     default: // no other protocols supported
         return result;
     }
     // NP: the parser does not accept >2 DIGIT for version numbers
     if (msgProtocol_.minor > 9)
         result += 2;
     else
         result += 1;
 
     result += 5; /* 5 octets in: SP status SP */
     result += reasonPhrase_.length();
     result += 2; /* CRLF terminator */
     return result;
 }
 
 // NP: we found the protocol version and consumed it already.
 // just need the status code and reason phrase
 int
-Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
 {
     if (!completedStatus_) {
         debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "...");
         /* RFC 7230 section 3.1.2 - status code is 3 DIGIT octets.
          * There is no limit on what those octets may be.
          * 000 through 999 are all valid.
          */
         int64_t statusValue;
         if (tok.int64(statusValue, 10, false, 3) && tok.skipOne(WspDelim)) {
 
             debugs(74, 6, "found int64 status-code=" << statusValue);
             statusCode_ = static_cast<Http::StatusCode>(statusValue);
 
             buf_ = tok.remaining(); // resume checkpoint
             completedStatus_ = true;
 
         } else if (tok.atEnd()) {
             debugs(74, 6, "Parser needs more data");
             return 0; // need more to be sure we have it all
 
@@ -104,41 +104,41 @@
 }
 
 /**
  * Attempt to parse the method field out of an HTTP message status-line.
  *
  * Governed by:
  *  RFC 1945 section 6.1
  *  RFC 7230 section 2.6, 3.1 and 3.5
  *
  * Parsing state is stored between calls. The current implementation uses
  * checkpoints after each successful status-line field.
  * The return value tells you whether the parsing is completed or not.
  *
  * \retval -1  an error occurred.
  * \retval  1  successful parse. statusCode_ and maybe reasonPhrase_ are filled and buffer consumed including first delimiter.
  * \retval  0  more data is needed to complete the parse
  */
 int
 Http::One::ResponseParser::parseResponseFirstLine()
 {
-    ::Parser::Tokenizer tok(buf_);
+    Http1::Tokenizer tok(buf_);
 
     CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
 
     if (Config.onoff.relaxed_header_parser) {
         // RFC 7230 section 3.5
         // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
         // as whitespace between status-line fields
         WspDelim += CharacterSet::HTAB
                     + CharacterSet("VT,FF","\x0B\x0C")
                     + CharacterSet::CR;
     }
 
     if (msgProtocol_.protocol != AnyP::PROTO_NONE) {
         debugs(74, 6, "continue incremental parse for " << msgProtocol_);
         debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}");
         // we already found the magic, but not the full line. keep going.
         return parseResponseStatusAndReason(tok, WspDelim);
 
     } else if (tok.skip(Http1magic)) {
         debugs(74, 6, "found prefix magic " << Http1magic);

=== modified file 'src/http/one/ResponseParser.h'
--- src/http/one/ResponseParser.h	2015-03-05 10:00:37 +0000
+++ src/http/one/ResponseParser.h	2015-04-10 09:05:09 +0000
@@ -26,41 +26,41 @@
  * \item status-line (version SP status SP reason-phrase)
  * \item mime-header (set of RFC2616 syntax header fields)
  */
 class ResponseParser : public Http1::Parser
 {
 public:
     ResponseParser() : Parser(), completedStatus_(false), statusCode_(Http::scNone) {}
     virtual ~ResponseParser() {}
 
     /* Http::One::Parser API */
     virtual void clear() {*this=ResponseParser();}
     virtual Http1::Parser::size_type firstLineSize() const;
     virtual bool parse(const SBuf &aBuf);
 
     /* response specific fields, read-only */
     Http::StatusCode messageStatus() const { return statusCode_;}
     SBuf reasonPhrase() const { return reasonPhrase_;}
 
 private:
     int parseResponseFirstLine();
-    int parseResponseStatusAndReason(::Parser::Tokenizer&, const CharacterSet &);
+    int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &);
 
     /// magic prefix for identifying ICY response messages
     static const SBuf IcyMagic;
 
     /// Whether we found the status code yet.
     /// We cannot rely on status value because server may send "000".
     bool completedStatus_;
 
     /// HTTP/1 status-line status code
     Http::StatusCode statusCode_;
 
     /// HTTP/1 status-line reason phrase
     SBuf reasonPhrase_;
 };
 
 } // namespace One
 } // namespace Http
 
 #endif /* _SQUID_SRC_HTTP_ONE_RESPONSEPARSER_H */
 

=== modified file 'src/http/one/TeChunkedParser.cc'
--- src/http/one/TeChunkedParser.cc	2015-06-01 21:41:37 +0000
+++ src/http/one/TeChunkedParser.cc	2015-06-09 02:08:47 +0000
@@ -1,214 +1,206 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #include "squid.h"
 #include "base/TextException.h"
 #include "Debug.h"
 #include "http/one/TeChunkedParser.h"
+#include "http/one/Tokenizer.h"
 #include "http/ProtocolVersion.h"
 #include "MemBuf.h"
-#include "parser/Tokenizer.h"
 #include "Parsing.h"
 
 Http::One::TeChunkedParser::TeChunkedParser()
 {
     // chunked encoding only exists in HTTP/1.1
     Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1);
 
     clear();
 }
 
 void
 Http::One::TeChunkedParser::clear()
 {
     parsingStage_ = Http1::HTTP_PARSE_NONE;
     buf_.clear();
     theChunkSize = theLeftBodySize = 0;
     theOut = NULL;
     useOriginBody = -1;
 }
 
 bool
 Http::One::TeChunkedParser::parse(const SBuf &aBuf)
 {
     buf_ = aBuf; // sync buffers first so calls to remaining() work properly if nothing done.
 
     if (buf_.isEmpty()) // nothing to do (yet)
         return false;
 
     debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
 
     Must(!buf_.isEmpty() && theOut);
 
     if (parsingStage_ == Http1::HTTP_PARSE_NONE)
         parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
 
-    ::Parser::Tokenizer tok(buf_);
+    Http1::Tokenizer tok(buf_);
 
     // loop for as many chunks as we can
     // use do-while instead of while so that we can incrementally
     // restart in the middle of a chunk/frame
     do {
 
         if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize))
             return false;
 
         if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok))
             return false;
 
         if (parsingStage_ == Http1::HTTP_PARSE_MIME && !grabMimeBlock("Trailers", 64*1024 /* 64KB max */))
             return false;
 
         // loop for as many chunks as we can
     } while (parsingStage_ == Http1::HTTP_PARSE_CHUNK_SZ && parseChunkSize(tok));
 
     return !needsMoreData() && !needsMoreSpace();
 }
 
 bool
 Http::One::TeChunkedParser::needsMoreSpace() const
 {
     assert(theOut);
     return parsingStage_ == Http1::HTTP_PARSE_CHUNK && !theOut->hasPotentialSpace();
 }
 
 /// RFC 7230 section 4.1 chunk-size
 bool
-Http::One::TeChunkedParser::parseChunkSize(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
 {
     Must(theChunkSize <= 0); // Should(), really
 
     int64_t size = -1;
     if (tok.int64(size, 16, false) && !tok.atEnd()) {
         if (size < 0)
             throw TexcHere("negative chunk size");
 
         theChunkSize = theLeftBodySize = size;
         debugs(94,7, "found chunk: " << theChunkSize);
         buf_ = tok.remaining(); // parse checkpoint
         parsingStage_ = Http1::HTTP_PARSE_CHUNK_EXT;
         return true;
 
     } else if (tok.atEnd()) {
         return false; // need more data
     }
 
     // else error
     throw TexcHere("corrupted chunk size");
     return false; // should not be reachable
 }
 
 /**
  * Parses a set of RFC 7230 section 4.1.1 chunk-ext
  * http://tools.ietf.org/html/rfc7230#section-4.1.1
  *
  *   chunk-ext      = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
  *   chunk-ext-name = token
  *   chunk-ext-val  = token / quoted-string
  *
  * ICAP 'use-original-body=N' extension is supported.
  */
 bool
-Http::One::TeChunkedParser::parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown)
+Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown)
 {
-    // TODO implement a proper quoted-string Tokenizer method
-    static const CharacterSet qString = CharacterSet("qString","\"\r\n").add('\0').complement();
-
     SBuf ext;
+    SBuf value;
     while (tok.skip(';') && tok.prefix(ext, CharacterSet::TCHAR)) {
 
         // whole value part is optional. if no '=' expect next chunk-ext
         if (tok.skip('=')) {
 
             if (!skipKnown) {
                 if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) {
                     debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody);
                     buf_ = tok.remaining(); // parse checkpoint
                     continue;
                 }
             }
 
             debugs(94, 5, "skipping unknown chunk extension " << ext);
 
-            // unknown might have a value token ...
-            if (tok.skipAll(CharacterSet::TCHAR) && !tok.atEnd()) {
-                buf_ = tok.remaining(); // parse checkpoint
-                continue;
-            }
-
-            // ... or a quoted-string
-            if (tok.skipOne(CharacterSet::DQUOTE) && tok.skipAll(qString) && tok.skipOne(CharacterSet::DQUOTE)) {
+            // unknown might have a value token or quoted-string
+            if (tok.quotedStringOrToken(value) && !tok.atEnd()) {
                 buf_ = tok.remaining(); // parse checkpoint
                 continue;
             }
 
             // otherwise need more data OR corrupt syntax
             break;
         }
 
         if (!tok.atEnd())
             buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name)
     }
 
     if (tok.atEnd())
         return false;
 
     if (skipLineTerminator(tok)) {
         buf_ = tok.remaining(); // checkpoint
         // non-0 chunk means data, 0-size means optional Trailer follows
         parsingStage_ = theChunkSize ? Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME;
         return true;
     }
 
     throw TexcHere("corrupted chunk extension value");
     return false;
 }
 
 bool
-Http::One::TeChunkedParser::parseChunkBody(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
 {
     Must(theLeftBodySize > 0); // Should, really
 
     buf_ = tok.remaining(); // sync buffers before buf_ use
 
     // TODO fix type mismatches and casting for these
     const size_t availSize = min(theLeftBodySize, (uint64_t)buf_.length());
     const size_t safeSize = min(availSize, (size_t)theOut->potentialSpaceSize());
 
     theOut->append(buf_.rawContent(), safeSize);
     buf_.consume(safeSize);
     theLeftBodySize -= safeSize;
 
     tok.reset(buf_); // sync buffers after consume()
 
     if (theLeftBodySize == 0)
         return parseChunkEnd(tok);
     else
         Must(needsMoreData() || needsMoreSpace());
 
     return true;
 }
 
 bool
-Http::One::TeChunkedParser::parseChunkEnd(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok)
 {
     Must(theLeftBodySize == 0); // Should(), really
 
     if (skipLineTerminator(tok)) {
         buf_ = tok.remaining(); // parse checkpoint
         theChunkSize = 0; // done with the current chunk
         parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
         return true;
 
     } else if (!tok.atEnd()) {
         throw TexcHere("found data between chunk end and CRLF");
     }
 
     return false;
 }
 

=== modified file 'src/http/one/TeChunkedParser.h'
--- src/http/one/TeChunkedParser.h	2015-06-01 21:41:37 +0000
+++ src/http/one/TeChunkedParser.h	2015-06-09 02:06:49 +0000
@@ -28,38 +28,38 @@
  * Ignores chunk extensions except for ICAP's ieof.
  * Trailers are available via mimeHeader() if wanted.
  */
 class TeChunkedParser : public Http1::Parser
 {
 public:
     TeChunkedParser();
     virtual ~TeChunkedParser() {theOut=NULL;/* we don't own this object */}
 
     /// set the buffer to be used to store decoded chunk data
     void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;}
 
     bool needsMoreSpace() const;
 
     /* Http1::Parser API */
     virtual void clear();
     virtual bool parse(const SBuf &);
     virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks
 
 private:
-    bool parseChunkSize(::Parser::Tokenizer &tok);
-    bool parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown);
-    bool parseChunkBody(::Parser::Tokenizer &tok);
-    bool parseChunkEnd(::Parser::Tokenizer &tok);
+    bool parseChunkSize(Http1::Tokenizer &tok);
+    bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown);
+    bool parseChunkBody(Http1::Tokenizer &tok);
+    bool parseChunkEnd(Http1::Tokenizer &tok);
 
     MemBuf *theOut;
     uint64_t theChunkSize;
     uint64_t theLeftBodySize;
 
 public:
     int64_t useOriginBody;
 };
 
 } // namespace One
 } // namespace Http
 
 #endif /* SQUID_SRC_HTTP_ONE_TeChunkedParser_H */
 

=== added file 'src/http/one/Tokenizer.cc'
--- src/http/one/Tokenizer.cc	1970-01-01 00:00:00 +0000
+++ src/http/one/Tokenizer.cc	2015-06-09 14:45:44 +0000
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#include "squid.h"
+#include "Debug.h"
+#include "http/one/Tokenizer.h"
+
+bool
+Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0)
+{
+    checkpoint();
+
+    if (!skip('"'))
+        return false;
+
+    return qdText(returnedToken, http1p0);
+}
+
+bool
+Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0)
+{
+    checkpoint();
+
+    if (!skip('"'))
+        return prefix(returnedToken, CharacterSet::TCHAR);
+
+    return qdText(returnedToken, http1p0);
+}
+
+bool
+Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
+{
+    // the initial DQUOTE has been skipped by the caller
+
+    /*
+     * RFC 1945 - defines qdtext:
+     *   inclusive of LWS (which includes CR and LF)
+     *   exclusive of 0x80-0xFF
+     *   includes 0x5C ('\') as just a regular character
+     */
+    static const CharacterSet qdtext1p0 = CharacterSet("qdtext (HTTP/1.0)", 0x23, 0x7E) +
+                                          CharacterSet("", "!") +
+                                          CharacterSet::CR + CharacterSet::LF + CharacterSet::HTAB + CharacterSet::SP;
+    /*
+     * RFC 7230 - defines qdtext:
+     *   exclusive of CR and LF
+     *   inclusive of 0x80-0xFF
+     *   includes 0x5C ('\') but only when part of quoted-pair
+     */
+    static const CharacterSet qdtext1p1 = CharacterSet("qdtext (HTTP/1.1)", 0x23, 0x5B) +
+                                          CharacterSet("", "!") +
+                                          CharacterSet("", 0x5D, 0x7E) +
+                                          CharacterSet::HTAB + CharacterSet::SP +
+                                          CharacterSet::OBSTEXT;
+
+    // best we can do is a conditional reference since http1p0 value may change per-client
+    const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1);
+
+    for (;;) {
+        SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars);
+        returnedToken.append(consume(prefixLen));
+
+        // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
+        if (!http1p0 && skip('\\')) {
+            /* RFC 7230 section 3.2.6
+             *
+             * The backslash octet ("\") can be used as a single-octet quoting
+             * mechanism within quoted-string and comment constructs.  Recipients
+             * that process the value of a quoted-string MUST handle a quoted-pair
+             * as if it were replaced by the octet following the backslash.
+             *
+             *   quoted-pair    = "\" ( HTAB / SP / VCHAR / obs-text )
+             */
+            static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
+            SBuf escaped;
+            if (!prefix(escaped, qPairChars, 1)) {
+                returnedToken.clear();
+                restoreLastCheckpoint();
+                return false;
+            }
+            returnedToken.append(escaped);
+            continue;
+
+        } else if (skip('"')) {
+            break; // done
+
+        } else if (atEnd()) {
+            // need more data
+            returnedToken.clear();
+            restoreLastCheckpoint();
+            return false;
+        }
+
+        // else, we have an error
+        debugs(24, 8, "invalid bytes for set " << tokenChars.name);
+        returnedToken.clear();
+        restoreLastCheckpoint();
+        return false;
+    }
+
+    // found the whole string
+    return true;
+}
+

=== added file 'src/http/one/Tokenizer.h'
--- src/http/one/Tokenizer.h	1970-01-01 00:00:00 +0000
+++ src/http/one/Tokenizer.h	2015-04-10 10:58:57 +0000
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H
+#define SQUID_SRC_HTTP_ONE_TOKENIZER_H
+
+#include "parser/Tokenizer.h"
+
+namespace Http {
+namespace One {
+
+/**
+ * Lexical processor extended to tokenize HTTP/1.x syntax.
+ *
+ * \see ::Parser::Tokenizer for more detail
+ */
+class Tokenizer : public ::Parser::Tokenizer
+{
+public:
+    Tokenizer(SBuf &s) : ::Parser::Tokenizer(s) {}
+
+    /**
+     * Attempt to parse a quoted-string lexical construct.
+     *
+     * Governed by:
+     *  - RFC 1945 section 2.1
+     *  "
+     *    A string of text is parsed as a single word if it is quoted using
+     *    double-quote marks.
+     *
+     *        quoted-string  = ( <"> *(qdtext) <"> )
+     *
+     *        qdtext         = <any CHAR except <"> and CTLs,
+     *                         but including LWS>
+     *
+     *    Single-character quoting using the backslash ("\") character is not
+     *    permitted in HTTP/1.0.
+     *  "
+     *
+     *  - RFC 7230 section 3.2.6
+     *  "
+     *    A string of text is parsed as a single value if it is quoted using
+     *    double-quote marks.
+     *
+     *    quoted-string  = DQUOTE *( qdtext / quoted-pair ) DQUOTE
+     *    qdtext         = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
+     *    obs-text       = %x80-FF
+     *  "
+     *
+     * \param http1p0 use HTTP/1.0 rules; HTTP/1.0 does not permit \-escaped characters
+     */
+    bool quotedString(SBuf &value, const bool http1p0 = false);
+
+    /**
+     * Attempt to parse a (token / quoted-string ) lexical construct.
+     */
+    bool quotedStringOrToken(SBuf &value, const bool http1p0 = false);
+
+private:
+    /// parse the internal component of a quoted-string, and the terminal DQUOTE
+    bool qdText(SBuf &value, const bool http1p0);
+
+    void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); }
+    void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); }
+
+    SBuf savedCheckpoint_;
+    SBuf::size_type savedStats_;
+};
+
+} // namespace One
+} // namespace Http
+
+#endif /* SQUID_SRC_HTTP_ONE_TOKENIZER_H */
+

=== modified file 'src/http/one/forward.h'
--- src/http/one/forward.h	2015-06-01 21:41:37 +0000
+++ src/http/one/forward.h	2015-06-09 01:57:21 +0000
@@ -1,34 +1,36 @@
 /*
  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
  *
  * Squid software is distributed under GPLv2+ license and includes
  * contributions from numerous individuals and organizations.
  * Please see the COPYING and CONTRIBUTORS files for details.
  */
 
 #ifndef SQUID_SRC_HTTP_ONE_FORWARD_H
 #define SQUID_SRC_HTTP_ONE_FORWARD_H
 
 #include "base/RefCount.h"
 
 namespace Http {
 namespace One {
 
+class Tokenizer;
+
 class Parser;
 typedef RefCount<Http::One::Parser> ParserPointer;
 
 class TeChunkedParser;
 
 class RequestParser;
 typedef RefCount<Http::One::RequestParser> RequestParserPointer;
 
 class ResponseParser;
 typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
 
 } // namespace One
 } // namespace Http
 
 namespace Http1 = Http::One;
 
 #endif /* SQUID_SRC_HTTP_ONE_FORWARD_H */
 

=== modified file 'src/parser/Tokenizer.h'
--- src/parser/Tokenizer.h	2015-02-20 03:25:12 +0000
+++ src/parser/Tokenizer.h	2015-04-10 09:09:30 +0000
@@ -27,41 +27,41 @@
  * Methods returning false have no side-effects.
  */
 class Tokenizer
 {
 public:
     explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf), parsed_(0) {}
 
     /// yet unparsed data
     SBuf buf() const { return buf_; }
 
     /// number of parsed bytes, including skipped ones
     SBuf::size_type parsedSize() const { return parsed_; }
 
     /// whether the end of the buffer has been reached
     bool atEnd() const { return buf_.isEmpty(); }
 
     /// the remaining unprocessed section of buffer
     const SBuf& remaining() const { return buf_; }
 
     /// reinitialize processing for a new buffer
-    void reset(const SBuf &newBuf) { buf_ = newBuf; parsed_ = 0; }
+    void reset(const SBuf &newBuf) { undoParse(newBuf, 0); }
 
     /** Basic strtok(3):
      *  Skips all leading delimiters (if any),
      *  extracts all characters up to the next delimiter (a token), and
      *  skips all trailing delimiters (at least one must be present).
      *
      *  Want to extract delimiters? Use prefix() instead.
      *
      *  Note that Tokenizer cannot tell whether the trailing delimiters will
      *  continue when/if more input data becomes available later.
      *
      * \return true if found a non-empty token followed by a delimiter
      */
     bool token(SBuf &returnedToken, const CharacterSet &delimiters);
 
     /** Extracts all sequential permitted characters up to an optional length limit.
      *
      *  Note that Tokenizer cannot tell whether the prefix will
      *  continue when/if more input data becomes available later.
      *
@@ -118,29 +118,32 @@
     /** Extracts an unsigned int64_t at the beginning of the buffer.
      *
      * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
      * at the beginning of the parse buffer, in the base specified by the user
      * or guesstimated; consumes the parsed characters.
      *
      * \param result Output value. Not touched if parsing is unsuccessful.
      * \param base   Specify base to do the parsing in, with the same restrictions
      *               as strtoll. Defaults to 0 (meaning guess)
      * \param allowSign Whether to accept a '+' or '-' sign prefix.
      * \param limit  Maximum count of characters to convert.
      *
      * \return whether the parsing was successful
      */
     bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
 
 protected:
     SBuf consume(const SBuf::size_type n);
     SBuf::size_type success(const SBuf::size_type n);
 
+    /// reset the buffer and parsed stats to a saved checkpoint
+    void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
+
 private:
     SBuf buf_; ///< yet unparsed input
     SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped
 };
 
 } /* namespace Parser */
 
 #endif /* SQUID_PARSER_TOKENIZER_H_ */
 


