[squid-dev] [PATCH] Parser-NG: Http1::Tokenizer
Amos Jeffries
squid3 at treenet.co.nz
Tue Jun 9 14:50:38 UTC 2015
This adds a class Http1::Tokenizer, which inherits from
::Parser::Tokenizer and presents additional HTTP-specific token parsing
methods.
At present it adds support for the quoted-string, 1#( token / quoted-string ),
and qdtext constructs from RFC 7230 and RFC 1945.
It can also cope with charset and quoted-pair escaping differences in
qdtext between RFC 1945 and RFC 7230. The un-escaped form of token is
returned.
Amos
-------------- next part --------------
=== modified file 'src/http/one/Makefile.am'
--- src/http/one/Makefile.am 2015-06-01 21:41:37 +0000
+++ src/http/one/Makefile.am 2015-06-09 01:57:52 +0000
@@ -2,21 +2,23 @@
##
## Squid software is distributed under GPLv2+ license and includes
## contributions from numerous individuals and organizations.
## Please see the COPYING and CONTRIBUTORS files for details.
##
include $(top_srcdir)/src/Common.am
include $(top_srcdir)/src/TestHeaders.am
noinst_LTLIBRARIES = libhttp1.la
libhttp1_la_SOURCES = \
forward.h \
Parser.cc \
Parser.h \
RequestParser.cc \
RequestParser.h \
ResponseParser.cc \
ResponseParser.h \
TeChunkedParser.cc \
- TeChunkedParser.h
+ TeChunkedParser.h \
+ Tokenizer.cc \
+ Tokenizer.h
=== modified file 'src/http/one/Parser.cc'
--- src/http/one/Parser.cc 2015-03-05 10:19:47 +0000
+++ src/http/one/Parser.cc 2015-04-10 09:05:02 +0000
@@ -1,49 +1,49 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#include "squid.h"
#include "Debug.h"
#include "http/one/Parser.h"
+#include "http/one/Tokenizer.h"
#include "mime_header.h"
-#include "parser/Tokenizer.h"
#include "SquidConfig.h"
/// RFC 7230 section 2.6 - 7 magic octets
const SBuf Http::One::Parser::Http1magic("HTTP/1.");
void
Http::One::Parser::clear()
{
parsingStage_ = HTTP_PARSE_NONE;
buf_ = NULL;
msgProtocol_ = AnyP::ProtocolVersion();
mimeHeaderBlock_.clear();
}
bool
-Http::One::Parser::skipLineTerminator(::Parser::Tokenizer &tok) const
+Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
{
static const SBuf crlf("\r\n");
if (tok.skip(crlf))
return true;
if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
return true;
return false;
}
bool
Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
{
// MIME headers block exist in (only) HTTP/1.x and ICY
const bool expectMime = (msgProtocol_.protocol == AnyP::PROTO_HTTP && msgProtocol_.major == 1) ||
msgProtocol_.protocol == AnyP::PROTO_ICY;
if (expectMime) {
/* NOTE: HTTP/0.9 messages do not have a mime header block.
@@ -85,59 +85,59 @@
}
// arbitrary maximum-length for headers which can be found by Http1Parser::getHeaderField()
#define GET_HDR_SZ 1024
// BUG: returns only the first header line with given name,
// ignores multi-line headers and obs-fold headers
char *
Http::One::Parser::getHeaderField(const char *name)
{
if (!headerBlockSize() || !name)
return NULL;
LOCAL_ARRAY(char, header, GET_HDR_SZ);
const int namelen = strlen(name);
debugs(25, 5, "looking for " << name);
// while we can find more LF in the SBuf
static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF);
- ::Parser::Tokenizer tok(mimeHeaderBlock_);
+ Http1::Tokenizer tok(mimeHeaderBlock_);
SBuf p;
static const SBuf crlf("\r\n");
while (tok.prefix(p, iso8859Line)) {
if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
break; // error. reached invalid octet or end of buffer instead of an LF ??
// header lines must start with the name (case insensitive)
if (p.substr(0, namelen).caseCmp(name, namelen))
continue;
// then a COLON
if (p[namelen] != ':')
continue;
// drop any trailing *CR sequence
p.trim(crlf, false, true);
debugs(25, 5, "checking " << p);
p.consume(namelen + 1);
// TODO: optimize SBuf::trim to take CharacterSet directly
- ::Parser::Tokenizer t(p);
+ Http1::Tokenizer t(p);
t.skipAll(CharacterSet::WSP);
p = t.remaining();
// prevent buffer overrun on char header[];
p.chop(0, sizeof(header)-1);
// return the header field-value
xstrncpy(header, p.rawContent(), p.length()+1);
debugs(25, 5, "returning " << header);
return header;
}
return NULL;
}
=== modified file 'src/http/one/Parser.h'
--- src/http/one/Parser.h 2015-03-29 14:11:36 +0000
+++ src/http/one/Parser.h 2015-06-09 01:57:21 +0000
@@ -1,40 +1,36 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#ifndef _SQUID_SRC_HTTP_ONE_PARSER_H
#define _SQUID_SRC_HTTP_ONE_PARSER_H
#include "anyp/ProtocolVersion.h"
#include "http/one/forward.h"
#include "http/StatusCode.h"
#include "SBuf.h"
-namespace Parser {
-class Tokenizer;
-}
-
namespace Http {
namespace One {
// Parser states
enum ParseState {
HTTP_PARSE_NONE, ///< initialized, but nothing usefully parsed yet
HTTP_PARSE_FIRST, ///< HTTP/1 message first-line
HTTP_PARSE_CHUNK_SZ, ///< HTTP/1.1 chunked encoding chunk-size
HTTP_PARSE_CHUNK_EXT, ///< HTTP/1.1 chunked encoding chunk-ext
HTTP_PARSE_CHUNK, ///< HTTP/1.1 chunked encoding chunk-data
HTTP_PARSE_MIME, ///< HTTP/1 mime-header block
HTTP_PARSE_DONE ///< parsed a message header, or reached a terminal syntax error
};
/** HTTP/1.x protocol parser
*
* Works on a raw character I/O buffer and tokenizes the content into
 * the major CRLF delimited segments of an HTTP/1 protocol message:
*
* \item first-line (request-line / simple-request / status-line)
@@ -91,41 +87,41 @@
* \return A pointer to a field-value of the first matching field-name, or NULL.
*/
char *getHeaderField(const char *name);
/// the remaining unprocessed section of buffer
const SBuf &remaining() const {return buf_;}
/**
* HTTP status code resulting from the parse process.
* to be used on the invalid message handling.
*
* Http::scNone indicates incomplete parse,
* Http::scOkay indicates no error,
* other codes represent a parse error.
*/
Http::StatusCode parseStatusCode;
protected:
/// detect and skip the CRLF or (if tolerant) LF line terminator
/// consume from the tokenizer and return true only if found
- bool skipLineTerminator(::Parser::Tokenizer &tok) const;
+ bool skipLineTerminator(Http1::Tokenizer &tok) const;
/**
* Scan to find the mime headers block for current message.
*
* \retval true If mime block (or a blocks non-existence) has been
* identified accurately within limit characters.
* mimeHeaderBlock_ has been updated and buf_ consumed.
*
 * \retval false An error occurred, or no mime terminator found within limit.
*/
bool grabMimeBlock(const char *which, const size_t limit);
/// RFC 7230 section 2.6 - 7 magic octets
static const SBuf Http1magic;
/// bytes remaining to be parsed
SBuf buf_;
/// what stage the parser is currently up to
ParseState parsingStage_;
=== modified file 'src/http/one/RequestParser.cc'
--- src/http/one/RequestParser.cc 2015-02-20 03:25:12 +0000
+++ src/http/one/RequestParser.cc 2015-04-10 09:05:05 +0000
@@ -1,33 +1,33 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#include "squid.h"
#include "Debug.h"
#include "http/one/RequestParser.h"
+#include "http/one/Tokenizer.h"
#include "http/ProtocolVersion.h"
-#include "parser/Tokenizer.h"
#include "profiler/Profiler.h"
#include "SquidConfig.h"
Http::One::RequestParser::RequestParser() :
Parser(),
firstLineGarbage_(0)
{}
Http1::Parser::size_type
Http::One::RequestParser::firstLineSize() const
{
// RFC 7230 section 2.6
/* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
return method_.image().length() + uri_.length() + 12;
}
/**
* Attempt to parse the first line of a new request message.
*
* Governed by RFC 7230 section 3.5
@@ -55,41 +55,41 @@
}
}
}
/**
* Attempt to parse the method field out of an HTTP message request-line.
*
* Governed by:
* RFC 1945 section 5.1
* RFC 7230 section 2.6, 3.1 and 3.5
*
* Parsing state is stored between calls. The current implementation uses
* checkpoints after each successful request-line field.
* The return value tells you whether the parsing is completed or not.
*
* \retval -1 an error occurred. parseStatusCode indicates HTTP status result.
* \retval 1 successful parse. method_ is filled and buffer consumed including first delimiter.
* \retval 0 more data is needed to complete the parse
*/
int
-Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
{
// scan for up to 16 valid method characters.
static const size_t maxMethodLength = 16; // TODO: make this configurable?
// method field is a sequence of TCHAR.
SBuf methodFound;
if (tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength) && tok.skipOne(WspDelim)) {
method_ = HttpRequestMethod(methodFound);
buf_ = tok.remaining(); // incremental parse checkpoint
return 1;
} else if (tok.atEnd()) {
debugs(74, 5, "Parser needs more data to find method");
return 0;
} // else error(s)
// non-delimiter found after accepted method bytes means ...
if (methodFound.length() == maxMethodLength) {
@@ -115,41 +115,41 @@
* "
* A URI is composed from a limited set of characters consisting of
* digits, letters, and a few graphic symbols.
* "
*/
// RFC 3986 section 2.1 - percent encoding "%" HEXDIG
UriChars.add('%');
UriChars += CharacterSet::HEXDIG;
// RFC 3986 section 2.2 - reserved characters
UriChars += CharacterSet("gen-delims", ":/?#[]@");
UriChars += CharacterSet("sub-delims", "!$&'()*+,;=");
// RFC 3986 section 2.3 - unreserved characters
UriChars += CharacterSet::ALPHA;
UriChars += CharacterSet::DIGIT;
UriChars += CharacterSet("unreserved", "-._~");
return UriChars;
}
int
-Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok)
+Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
{
// URI field is a sequence of ... what? segments all have different valid charset
// go with non-whitespace non-binary characters for now
static CharacterSet UriChars = uriValidCharacters();
/* Arbitrary 64KB URI upper length limit.
*
* Not quite as arbitrary as it seems though. Old SquidString objects
* cannot store strings larger than 64KB, so we must limit until they
* have all been replaced with SBuf.
*
* Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
* at least 8000 octets for the whole line, including method and version.
*/
const size_t maxUriLength = min(static_cast<size_t>(Config.maxRequestHeaderSize) - firstLineSize(),
static_cast<size_t>((64*1024)-1));
SBuf uriFound;
// RFC 7230 HTTP/1.x URI are followed by at least one whitespace delimiter
@@ -170,41 +170,41 @@
} else if (tok.atEnd()) {
debugs(74, 5, "Parser needs more data to find URI");
return 0;
}
// else errors...
if (uriFound.length() == maxUriLength) {
// RFC 7230 section 3.1.1 mandatory (MUST) 414 response
parseStatusCode = Http::scUriTooLong;
debugs(33, 5, "invalid request-line. URI longer than " << maxUriLength << " bytes");
} else {
// RFC 7230 section 3.1.1 required (SHOULD) 400 response
parseStatusCode = Http::scBadRequest;
debugs(33, 5, "invalid request-line. missing URI delimiter");
}
return -1;
}
int
-Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok)
+Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
{
// partial match of HTTP/1 magic prefix
if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) {
debugs(74, 5, "Parser needs more data to find version");
return 0;
}
if (!tok.skip(Http1magic)) {
debugs(74, 5, "invalid request-line. not HTTP/1 protocol");
parseStatusCode = Http::scHttpVersionNotSupported;
return -1;
}
if (tok.atEnd()) {
debugs(74, 5, "Parser needs more data to find version");
return 0;
}
// get the version minor DIGIT
SBuf digit;
@@ -229,41 +229,41 @@
}
/**
* Attempt to parse the first line of a new request message.
*
* Governed by:
* RFC 1945 section 5.1
* RFC 7230 section 2.6, 3.1 and 3.5
*
* Parsing state is stored between calls. The current implementation uses
* checkpoints after each successful request-line field.
* The return value tells you whether the parsing is completed or not.
*
* \retval -1 an error occurred. parseStatusCode indicates HTTP status result.
* \retval 1 successful parse. member fields contain the request-line items
* \retval 0 more data is needed to complete the parse
*/
int
Http::One::RequestParser::parseRequestFirstLine()
{
- ::Parser::Tokenizer tok(buf_);
+ Http1::Tokenizer tok(buf_);
debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
debugs(74, DBG_DATA, buf_);
// NP: would be static, except it needs to change with reconfigure
CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
if (Config.onoff.relaxed_header_parser) {
// RFC 7230 section 3.5
// tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
// as whitespace between request-line fields
WspDelim += CharacterSet::HTAB
+ CharacterSet("VT,FF","\x0B\x0C")
+ CharacterSet::CR;
}
// only search for method if we have not yet found one
if (method_ == Http::METHOD_NONE) {
const int res = parseMethodField(tok, WspDelim);
if (res < 1)
@@ -280,41 +280,41 @@
}
}
if (tok.atEnd()) {
debugs(74, 5, "Parser needs more data");
return 0;
}
// from here on, we have two possible parse paths: whitespace tolerant, and strict
if (Config.onoff.relaxed_header_parser) {
// whitespace tolerant
// NOTES:
// * this would be static, except WspDelim changes with reconfigure
// * HTTP-version charset is included by uriValidCharacters()
// * terminal CR is included by WspDelim here in relaxed parsing
CharacterSet LfDelim = uriValidCharacters() + WspDelim;
// seek the LF character, then tokenize the line in reverse
SBuf line;
if (tok.prefix(line, LfDelim) && tok.skip('\n')) {
- ::Parser::Tokenizer rTok(line);
+ Http1::Tokenizer rTok(line);
SBuf nil;
(void)rTok.suffix(nil,CharacterSet::CR); // optional CR in terminator
SBuf digit;
if (rTok.suffix(digit,CharacterSet::DIGIT) && rTok.skipSuffix(Http1magic) && rTok.suffix(nil,WspDelim)) {
uri_ = rTok.remaining();
msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
if (uri_.isEmpty()) {
debugs(33, 5, "invalid request-line. missing URL");
parseStatusCode = Http::scBadRequest;
return -1;
}
parseStatusCode = Http::scOkay;
buf_ = tok.remaining(); // incremental parse checkpoint
return 1;
} else if (method_ == Http::METHOD_GET) {
// RFC 1945 - for GET the line terminator may follow URL instead of a delimiter
debugs(33, 5, "HTTP/0.9 syntax request-line detected");
msgProtocol_ = Http::ProtocolVersion(0,9);
=== modified file 'src/http/one/RequestParser.h'
--- src/http/one/RequestParser.h 2015-02-20 03:25:12 +0000
+++ src/http/one/RequestParser.h 2015-04-10 09:05:06 +0000
@@ -30,40 +30,40 @@
class RequestParser : public Http1::Parser
{
public:
RequestParser();
virtual ~RequestParser() {}
/* Http::One::Parser API */
virtual void clear() {*this = RequestParser();}
virtual Http1::Parser::size_type firstLineSize() const;
virtual bool parse(const SBuf &aBuf);
/// the HTTP method if this is a request message
const HttpRequestMethod & method() const {return method_;}
/// the request-line URI if this is a request message, or an empty string.
const SBuf &requestUri() const {return uri_;}
private:
void skipGarbageLines();
int parseRequestFirstLine();
- int parseMethodField(::Parser::Tokenizer &, const CharacterSet &);
- int parseUriField(::Parser::Tokenizer &);
- int parseHttpVersionField(::Parser::Tokenizer &);
+ int parseMethodField(Http1::Tokenizer &, const CharacterSet &);
+ int parseUriField(Http1::Tokenizer &);
+ int parseHttpVersionField(Http1::Tokenizer &);
/// what request method has been found on the first line
HttpRequestMethod method_;
/// raw copy of the original client request-line URI field
SBuf uri_;
/// amount of garbage bytes tolerantly skipped inside the request-line
/// may be -1 if sender only omitted CR on terminator
int64_t firstLineGarbage_;
};
} // namespace One
} // namespace Http
#endif /* _SQUID_SRC_HTTP_ONE_REQUESTPARSER_H */
=== modified file 'src/http/one/ResponseParser.cc'
--- src/http/one/ResponseParser.cc 2015-03-01 08:37:07 +0000
+++ src/http/one/ResponseParser.cc 2015-04-10 09:05:08 +0000
@@ -1,70 +1,70 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#include "squid.h"
#include "Debug.h"
#include "http/one/ResponseParser.h"
+#include "http/one/Tokenizer.h"
#include "http/ProtocolVersion.h"
-#include "parser/Tokenizer.h"
#include "profiler/Profiler.h"
#include "SquidConfig.h"
const SBuf Http::One::ResponseParser::IcyMagic("ICY ");
Http1::Parser::size_type
Http::One::ResponseParser::firstLineSize() const
{
Http1::Parser::size_type result = 0;
switch (msgProtocol_.protocol)
{
case AnyP::PROTO_HTTP:
result += Http1magic.length();
break;
case AnyP::PROTO_ICY:
result += IcyMagic.length();
break;
default: // no other protocols supported
return result;
}
// NP: the parser does not accept >2 DIGIT for version numbers
if (msgProtocol_.minor > 9)
result += 2;
else
result += 1;
result += 5; /* 5 octets in: SP status SP */
result += reasonPhrase_.length();
result += 2; /* CRLF terminator */
return result;
}
// NP: we found the protocol version and consumed it already.
// just need the status code and reason phrase
int
-Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
{
if (!completedStatus_) {
debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "...");
/* RFC 7230 section 3.1.2 - status code is 3 DIGIT octets.
* There is no limit on what those octets may be.
* 000 through 999 are all valid.
*/
int64_t statusValue;
if (tok.int64(statusValue, 10, false, 3) && tok.skipOne(WspDelim)) {
debugs(74, 6, "found int64 status-code=" << statusValue);
statusCode_ = static_cast<Http::StatusCode>(statusValue);
buf_ = tok.remaining(); // resume checkpoint
completedStatus_ = true;
} else if (tok.atEnd()) {
debugs(74, 6, "Parser needs more data");
return 0; // need more to be sure we have it all
@@ -104,41 +104,41 @@
}
/**
* Attempt to parse the method field out of an HTTP message status-line.
*
* Governed by:
* RFC 1945 section 6.1
* RFC 7230 section 2.6, 3.1 and 3.5
*
* Parsing state is stored between calls. The current implementation uses
* checkpoints after each successful status-line field.
* The return value tells you whether the parsing is completed or not.
*
* \retval -1 an error occurred.
* \retval 1 successful parse. statusCode_ and maybe reasonPhrase_ are filled and buffer consumed including first delimiter.
* \retval 0 more data is needed to complete the parse
*/
int
Http::One::ResponseParser::parseResponseFirstLine()
{
- ::Parser::Tokenizer tok(buf_);
+ Http1::Tokenizer tok(buf_);
CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
if (Config.onoff.relaxed_header_parser) {
// RFC 7230 section 3.5
// tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
// as whitespace between status-line fields
WspDelim += CharacterSet::HTAB
+ CharacterSet("VT,FF","\x0B\x0C")
+ CharacterSet::CR;
}
if (msgProtocol_.protocol != AnyP::PROTO_NONE) {
debugs(74, 6, "continue incremental parse for " << msgProtocol_);
debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}");
// we already found the magic, but not the full line. keep going.
return parseResponseStatusAndReason(tok, WspDelim);
} else if (tok.skip(Http1magic)) {
debugs(74, 6, "found prefix magic " << Http1magic);
=== modified file 'src/http/one/ResponseParser.h'
--- src/http/one/ResponseParser.h 2015-03-05 10:00:37 +0000
+++ src/http/one/ResponseParser.h 2015-04-10 09:05:09 +0000
@@ -26,41 +26,41 @@
 * \item status-line (version SP status SP reason-phrase)
* \item mime-header (set of RFC2616 syntax header fields)
*/
class ResponseParser : public Http1::Parser
{
public:
ResponseParser() : Parser(), completedStatus_(false), statusCode_(Http::scNone) {}
virtual ~ResponseParser() {}
/* Http::One::Parser API */
virtual void clear() {*this=ResponseParser();}
virtual Http1::Parser::size_type firstLineSize() const;
virtual bool parse(const SBuf &aBuf);
/* response specific fields, read-only */
Http::StatusCode messageStatus() const { return statusCode_;}
SBuf reasonPhrase() const { return reasonPhrase_;}
private:
int parseResponseFirstLine();
- int parseResponseStatusAndReason(::Parser::Tokenizer&, const CharacterSet &);
+ int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &);
/// magic prefix for identifying ICY response messages
static const SBuf IcyMagic;
/// Whether we found the status code yet.
/// We cannot rely on status value because server may send "000".
bool completedStatus_;
/// HTTP/1 status-line status code
Http::StatusCode statusCode_;
/// HTTP/1 status-line reason phrase
SBuf reasonPhrase_;
};
} // namespace One
} // namespace Http
#endif /* _SQUID_SRC_HTTP_ONE_RESPONSEPARSER_H */
=== modified file 'src/http/one/TeChunkedParser.cc'
--- src/http/one/TeChunkedParser.cc 2015-06-01 21:41:37 +0000
+++ src/http/one/TeChunkedParser.cc 2015-06-09 02:08:47 +0000
@@ -1,214 +1,206 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#include "squid.h"
#include "base/TextException.h"
#include "Debug.h"
#include "http/one/TeChunkedParser.h"
+#include "http/one/Tokenizer.h"
#include "http/ProtocolVersion.h"
#include "MemBuf.h"
-#include "parser/Tokenizer.h"
#include "Parsing.h"
Http::One::TeChunkedParser::TeChunkedParser()
{
// chunked encoding only exists in HTTP/1.1
Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1);
clear();
}
void
Http::One::TeChunkedParser::clear()
{
parsingStage_ = Http1::HTTP_PARSE_NONE;
buf_.clear();
theChunkSize = theLeftBodySize = 0;
theOut = NULL;
useOriginBody = -1;
}
bool
Http::One::TeChunkedParser::parse(const SBuf &aBuf)
{
buf_ = aBuf; // sync buffers first so calls to remaining() work properly if nothing done.
if (buf_.isEmpty()) // nothing to do (yet)
return false;
debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
Must(!buf_.isEmpty() && theOut);
if (parsingStage_ == Http1::HTTP_PARSE_NONE)
parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
- ::Parser::Tokenizer tok(buf_);
+ Http1::Tokenizer tok(buf_);
// loop for as many chunks as we can
// use do-while instead of while so that we can incrementally
// restart in the middle of a chunk/frame
do {
if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize))
return false;
if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok))
return false;
if (parsingStage_ == Http1::HTTP_PARSE_MIME && !grabMimeBlock("Trailers", 64*1024 /* 64KB max */))
return false;
// loop for as many chunks as we can
} while (parsingStage_ == Http1::HTTP_PARSE_CHUNK_SZ && parseChunkSize(tok));
return !needsMoreData() && !needsMoreSpace();
}
bool
Http::One::TeChunkedParser::needsMoreSpace() const
{
assert(theOut);
return parsingStage_ == Http1::HTTP_PARSE_CHUNK && !theOut->hasPotentialSpace();
}
/// RFC 7230 section 4.1 chunk-size
bool
-Http::One::TeChunkedParser::parseChunkSize(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
{
Must(theChunkSize <= 0); // Should(), really
int64_t size = -1;
if (tok.int64(size, 16, false) && !tok.atEnd()) {
if (size < 0)
throw TexcHere("negative chunk size");
theChunkSize = theLeftBodySize = size;
debugs(94,7, "found chunk: " << theChunkSize);
buf_ = tok.remaining(); // parse checkpoint
parsingStage_ = Http1::HTTP_PARSE_CHUNK_EXT;
return true;
} else if (tok.atEnd()) {
return false; // need more data
}
// else error
throw TexcHere("corrupted chunk size");
return false; // should not be reachable
}
/**
* Parses a set of RFC 7230 section 4.1.1 chunk-ext
* http://tools.ietf.org/html/rfc7230#section-4.1.1
*
* chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
* chunk-ext-name = token
* chunk-ext-val = token / quoted-string
*
* ICAP 'use-original-body=N' extension is supported.
*/
bool
-Http::One::TeChunkedParser::parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown)
+Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown)
{
- // TODO implement a proper quoted-string Tokenizer method
- static const CharacterSet qString = CharacterSet("qString","\"\r\n").add('\0').complement();
-
SBuf ext;
+ SBuf value;
while (tok.skip(';') && tok.prefix(ext, CharacterSet::TCHAR)) {
// whole value part is optional. if no '=' expect next chunk-ext
if (tok.skip('=')) {
if (!skipKnown) {
if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) {
debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody);
buf_ = tok.remaining(); // parse checkpoint
continue;
}
}
debugs(94, 5, "skipping unknown chunk extension " << ext);
- // unknown might have a value token ...
- if (tok.skipAll(CharacterSet::TCHAR) && !tok.atEnd()) {
- buf_ = tok.remaining(); // parse checkpoint
- continue;
- }
-
- // ... or a quoted-string
- if (tok.skipOne(CharacterSet::DQUOTE) && tok.skipAll(qString) && tok.skipOne(CharacterSet::DQUOTE)) {
+ // unknown might have a value token or quoted-string
+ if (tok.quotedStringOrToken(value) && !tok.atEnd()) {
buf_ = tok.remaining(); // parse checkpoint
continue;
}
// otherwise need more data OR corrupt syntax
break;
}
if (!tok.atEnd())
buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name)
}
if (tok.atEnd())
return false;
if (skipLineTerminator(tok)) {
buf_ = tok.remaining(); // checkpoint
// non-0 chunk means data, 0-size means optional Trailer follows
parsingStage_ = theChunkSize ? Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME;
return true;
}
throw TexcHere("corrupted chunk extension value");
return false;
}
bool
-Http::One::TeChunkedParser::parseChunkBody(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
{
Must(theLeftBodySize > 0); // Should, really
buf_ = tok.remaining(); // sync buffers before buf_ use
// TODO fix type mismatches and casting for these
const size_t availSize = min(theLeftBodySize, (uint64_t)buf_.length());
const size_t safeSize = min(availSize, (size_t)theOut->potentialSpaceSize());
theOut->append(buf_.rawContent(), safeSize);
buf_.consume(safeSize);
theLeftBodySize -= safeSize;
tok.reset(buf_); // sync buffers after consume()
if (theLeftBodySize == 0)
return parseChunkEnd(tok);
else
Must(needsMoreData() || needsMoreSpace());
return true;
}
bool
-Http::One::TeChunkedParser::parseChunkEnd(::Parser::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok)
{
Must(theLeftBodySize == 0); // Should(), really
if (skipLineTerminator(tok)) {
buf_ = tok.remaining(); // parse checkpoint
theChunkSize = 0; // done with the current chunk
parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
return true;
} else if (!tok.atEnd()) {
throw TexcHere("found data between chunk end and CRLF");
}
return false;
}
=== modified file 'src/http/one/TeChunkedParser.h'
--- src/http/one/TeChunkedParser.h 2015-06-01 21:41:37 +0000
+++ src/http/one/TeChunkedParser.h 2015-06-09 02:06:49 +0000
@@ -28,38 +28,38 @@
* Ignores chunk extensions except for ICAP's ieof.
* Trailers are available via mimeHeader() if wanted.
*/
class TeChunkedParser : public Http1::Parser
{
public:
TeChunkedParser();
virtual ~TeChunkedParser() {theOut=NULL;/* we dont own this object */}
/// set the buffer to be used to store decoded chunk data
void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;}
bool needsMoreSpace() const;
/* Http1::Parser API */
virtual void clear();
virtual bool parse(const SBuf &);
virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks
private:
- bool parseChunkSize(::Parser::Tokenizer &tok);
- bool parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown);
- bool parseChunkBody(::Parser::Tokenizer &tok);
- bool parseChunkEnd(::Parser::Tokenizer &tok);
+ bool parseChunkSize(Http1::Tokenizer &tok);
+ bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown);
+ bool parseChunkBody(Http1::Tokenizer &tok);
+ bool parseChunkEnd(Http1::Tokenizer &tok);
MemBuf *theOut;
uint64_t theChunkSize;
uint64_t theLeftBodySize;
public:
int64_t useOriginBody;
};
} // namespace One
} // namespace Http
#endif /* SQUID_SRC_HTTP_ONE_TeChunkedParser_H */
=== added file 'src/http/one/Tokenizer.cc'
--- src/http/one/Tokenizer.cc 1970-01-01 00:00:00 +0000
+++ src/http/one/Tokenizer.cc 2015-06-09 14:45:44 +0000
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#include "squid.h"
+#include "Debug.h"
+#include "http/one/Tokenizer.h"
+
+bool
+Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0)
+{
+ checkpoint();
+
+ if (!skip('"'))
+ return false;
+
+ return qdText(returnedToken, http1p0);
+}
+
+bool
+Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0)
+{
+ checkpoint();
+
+ if (!skip('"'))
+ return prefix(returnedToken, CharacterSet::TCHAR);
+
+ return qdText(returnedToken, http1p0);
+}
+
+bool
+Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
+{
+ // the initial DQUOTE has been skipped by the caller
+
+ /*
+ * RFC 1945 - defines qdtext:
+ * inclusive of LWS (which includes CR and LF)
+ * exclusive of 0x80-0xFF
+ * includes 0x5C ('\') as just a regular character
+ */
+ static const CharacterSet qdtext1p0 = CharacterSet("qdtext (HTTP/1.0)", 0x23, 0x7E) +
+ CharacterSet("", "!") +
+ CharacterSet::CR + CharacterSet::LF + CharacterSet::HTAB + CharacterSet::SP;
+ /*
+ * RFC 7230 - defines qdtext:
+ * exclusive of CR and LF
+ * inclusive of 0x80-0xFF
+ * includes 0x5C ('\') but only when part of quoted-pair
+ */
+ static const CharacterSet qdtext1p1 = CharacterSet("qdtext (HTTP/1.1)", 0x23, 0x5B) +
+ CharacterSet("", "!") +
+ CharacterSet("", 0x5D, 0x7E) +
+ CharacterSet::HTAB + CharacterSet::SP +
+ CharacterSet::OBSTEXT;
+
+ // best we can do is a conditional reference since http1p0 value may change per-client
+ const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1);
+
+ for (;;) {
+ SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars);
+ returnedToken.append(consume(prefixLen));
+
+ // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
+ if (!http1p0 && skip('\\')) {
+ /* RFC 7230 section 3.2.6
+ *
+ * The backslash octet ("\") can be used as a single-octet quoting
+ * mechanism within quoted-string and comment constructs. Recipients
+ * that process the value of a quoted-string MUST handle a quoted-pair
+ * as if it were replaced by the octet following the backslash.
+ *
+ * quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text )
+ */
+ static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
+ SBuf escaped;
+ if (!prefix(escaped, qPairChars, 1)) {
+ returnedToken.clear();
+ restoreLastCheckpoint();
+ return false;
+ }
+ returnedToken.append(escaped);
+ continue;
+
+ } else if (skip('"')) {
+ break; // done
+
+ } else if (atEnd()) {
+ // need more data
+ returnedToken.clear();
+ restoreLastCheckpoint();
+ return false;
+ }
+
+ // else, we have an error
+ debugs(24, 8, "invalid bytes for set " << tokenChars.name);
+ returnedToken.clear();
+ restoreLastCheckpoint();
+ return false;
+ }
+
+ // found the whole string
+ return true;
+}
+
=== added file 'src/http/one/Tokenizer.h'
--- src/http/one/Tokenizer.h 1970-01-01 00:00:00 +0000
+++ src/http/one/Tokenizer.h 2015-04-10 10:58:57 +0000
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H
+#define SQUID_SRC_HTTP_ONE_TOKENIZER_H
+
+#include "parser/Tokenizer.h"
+
+namespace Http {
+namespace One {
+
+/**
+ * Lexical processor extended to tokenize HTTP/1.x syntax.
+ *
+ * \see ::Parser::Tokenizer for more detail
+ */
+class Tokenizer : public ::Parser::Tokenizer
+{
+public:
+ Tokenizer(const SBuf &s) : ::Parser::Tokenizer(s) {}
+
+ /**
+ * Attempt to parse a quoted-string lexical construct.
+ *
+ * Governed by:
+ * - RFC 1945 section 2.1
+ * "
+ * A string of text is parsed as a single word if it is quoted using
+ * double-quote marks.
+ *
+ * quoted-string = ( <"> *(qdtext) <"> )
+ *
+ * qdtext = <any CHAR except <"> and CTLs,
+ * but including LWS>
+ *
+ * Single-character quoting using the backslash ("\") character is not
+ * permitted in HTTP/1.0.
+ * "
+ *
+ * - RFC 7230 section 3.2.6
+ * "
+ * A string of text is parsed as a single value if it is quoted using
+ * double-quote marks.
+ *
+ * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
+ * qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text
+ * obs-text = %x80-FF
+ * "
+ *
+ * \param http1p0 when true, obey HTTP/1.0 rules: \-escaped characters are not permitted
+ */
+ bool quotedString(SBuf &value, const bool http1p0 = false);
+
+ /**
+ * Attempt to parse a (token / quoted-string ) lexical construct.
+ */
+ bool quotedStringOrToken(SBuf &value, const bool http1p0 = false);
+
+private:
+ /// parse the internal component of a quoted-string, and terminal DQUOTE
+ bool qdText(SBuf &value, const bool http1p0);
+
+ void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); } ///< remember the parse position for possible rollback
+ void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); } ///< rewind parsing to the last checkpoint()
+
+ SBuf savedCheckpoint_; ///< unparsed data remembered by the last checkpoint()
+ SBuf::size_type savedStats_ = 0; ///< parsedSize() remembered by the last checkpoint()
+};
+
+} // namespace One
+} // namespace Http
+
+#endif /* SQUID_SRC_HTTP_ONE_TOKENIZER_H */
+
=== modified file 'src/http/one/forward.h'
--- src/http/one/forward.h 2015-06-01 21:41:37 +0000
+++ src/http/one/forward.h 2015-06-09 01:57:21 +0000
@@ -1,34 +1,36 @@
/*
* Copyright (C) 1996-2015 The Squid Software Foundation and contributors
*
* Squid software is distributed under GPLv2+ license and includes
* contributions from numerous individuals and organizations.
* Please see the COPYING and CONTRIBUTORS files for details.
*/
#ifndef SQUID_SRC_HTTP_ONE_FORWARD_H
#define SQUID_SRC_HTTP_ONE_FORWARD_H
#include "base/RefCount.h"
namespace Http {
namespace One {
+class Tokenizer;
+
class Parser;
typedef RefCount<Http::One::Parser> ParserPointer;
class TeChunkedParser;
class RequestParser;
typedef RefCount<Http::One::RequestParser> RequestParserPointer;
class ResponseParser;
typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
} // namespace One
} // namespace Http
namespace Http1 = Http::One;
#endif /* SQUID_SRC_HTTP_ONE_FORWARD_H */
=== modified file 'src/parser/Tokenizer.h'
--- src/parser/Tokenizer.h 2015-02-20 03:25:12 +0000
+++ src/parser/Tokenizer.h 2015-04-10 09:09:30 +0000
@@ -27,41 +27,41 @@
* Methods returning false have no side-effects.
*/
class Tokenizer
{
public:
explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf), parsed_(0) {}
/// yet unparsed data
SBuf buf() const { return buf_; }
/// number of parsed bytes, including skipped ones
SBuf::size_type parsedSize() const { return parsed_; }
/// whether the end of the buffer has been reached
bool atEnd() const { return buf_.isEmpty(); }
/// the remaining unprocessed section of buffer
const SBuf& remaining() const { return buf_; }
/// reinitialize processing for a new buffer
- void reset(const SBuf &newBuf) { buf_ = newBuf; parsed_ = 0; }
+ void reset(const SBuf &newBuf) { undoParse(newBuf, 0); }
/** Basic strtok(3):
* Skips all leading delimiters (if any),
* extracts all characters up to the next delimiter (a token), and
* skips all trailing delimiters (at least one must be present).
*
* Want to extract delimiters? Use prefix() instead.
*
* Note that Tokenizer cannot tell whether the trailing delimiters will
* continue when/if more input data becomes available later.
*
* \return true if found a non-empty token followed by a delimiter
*/
bool token(SBuf &returnedToken, const CharacterSet &delimiters);
/** Extracts all sequential permitted characters up to an optional length limit.
*
* Note that Tokenizer cannot tell whether the prefix will
* continue when/if more input data becomes available later.
*
@@ -118,29 +118,32 @@
/** Extracts an unsigned int64_t at the beginning of the buffer.
*
* strtoll(3)-alike function: tries to parse unsigned 64-bit integer
* at the beginning of the parse buffer, in the base specified by the user
* or guesstimated; consumes the parsed characters.
*
* \param result Output value. Not touched if parsing is unsuccessful.
* \param base Specify base to do the parsing in, with the same restrictions
* as strtoll. Defaults to 0 (meaning guess)
* \param allowSign Whether to accept a '+' or '-' sign prefix.
* \param limit Maximum count of characters to convert.
*
* \return whether the parsing was successful
*/
bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
protected:
SBuf consume(const SBuf::size_type n);
SBuf::size_type success(const SBuf::size_type n);
+ /// reset the buffer and parsed stats to a saved checkpoint
+ void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
+
private:
SBuf buf_; ///< yet unparsed input
SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped
};
} /* namespace Parser */
#endif /* SQUID_PARSER_TOKENIZER_H_ */
More information about the squid-dev
mailing list