[squid-dev] [PATCH] mime unfolding

Wed May 11 11:32:06 UTC 2016

[taking this out of squid-bugs now that we are no longer focusing on the
security changes.]

To summarize for those not in the loop already. Squid does not yet do
obs-fold removal as required by RFC 7230.

There are three options that are compliant:
1) reject requests containing obs-fold with a 4xx error.
2) replace the fold CR*LF characters with a series of SP
3) replace the whole CR*LF WSP sequence with a single SP

Attached patch implements #3.

I've done some basic benchmarking of both options #2 and #3. The #2
unfold using an optimized reverse-iteration over the mime block is
consistently a little (1-4 RPS) faster than #3 using a Tokenizer and
SBuf::append().

But the savings is not enough to be bothered with AFAICS. Mime block
size also has a strong influence on the impact of each approach. The gap
between approaches rapidly becomes even smaller as the MemBlob
reallocation count reduces.

Amos

-------------- next part --------------
=== modified file 'src/http/one/Parser.cc'

--- src/http/one/Parser.cc	2016-04-12 18:12:15 +0000
+++ src/http/one/Parser.cc	2016-05-11 11:18:40 +0000
@@ -16,6 +16,12 @@
 /// RFC 7230 section 2.6 - 7 magic octets
 const SBuf Http::One::Parser::Http1magic("HTTP/1.");
 
+const SBuf &Http::One::CrLf()
+{
+    static const SBuf crlf("\r\n");
+    return crlf;
+}
+
 void
 Http::One::Parser::clear()
 {
@@ -25,11 +31,34 @@
     mimeHeaderBlock_.clear();
 }
 
+/// characters HTTP permits tolerant parsers to accept as delimiters
+static const CharacterSet &
+RelaxedDelimiterCharacters()
+{
+    // RFC 7230 section 3.5
+    // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
+    // or bare CR as whitespace between request-line fields
+    static const CharacterSet RelaxedDels =
+        (CharacterSet::SP +
+         CharacterSet::HTAB +
+         CharacterSet("VT,FF","\x0B\x0C") +
+         CharacterSet::CR).rename("relaxed-WSP");
+
+    return RelaxedDels;
+}
+
+/// characters used to separate HTTP fields
+const CharacterSet &
+Http::One::Parser::DelimiterCharacters()
+{
+    return Config.onoff.relaxed_header_parser ?
+           RelaxedDelimiterCharacters() : CharacterSet::SP;
+}
+
 bool
 Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
 {
-    static const SBuf crlf("\r\n");
-    if (tok.skip(crlf))
+    if (tok.skip(Http1::CrLf()))
         return true;
 
     if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
@@ -38,6 +67,72 @@
     return false;
 }
 
+/// all characters except the LF line terminator
+static const CharacterSet &
+LineCharacters()
+{
+    static const CharacterSet line = CharacterSet::LF.complement("non-LF");
+    return line;
+}
+
+/**
+ * Replace all obs-fold with an SP.
+ *
+ * NOTE 1: Also has the side effect of replacing all bare-LF with CRLF.
+ *
+ * RFC 7230 section 3.2.4
+ * "A server that receives an obs-fold in a request message that is not
+ *  within a message/http container MUST ... replace
+ *  each received obs-fold with one or more SP octets prior to
+ *  interpreting the field value or forwarding the message downstream."
+ *
+ * "A proxy or gateway that receives an obs-fold in a response message
+ *  that is not within a message/http container MUST ... replace each
+ *  received obs-fold with one or more SP octets prior to interpreting
+ *  the field value or forwarding the message downstream."
+ */
+void
+Http::One::Parser::unfoldMime()
+{
+    // Unfolding requires a CRLF terminator. Restore CRLF if needed.
+    if (mimeHeaderBlock_.length() < 2) {
+        if (mimeHeaderBlock_[0] == '\n')
+            mimeHeaderBlock_ = Http1::CrLf();
+        else
+            mimeHeaderBlock_.append(Http1::CrLf());
+    }
+    // now mimeHeaderBlock_ has 0+ fields followed by the CRLF terminator
+
+    static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");
+    Http1::Tokenizer tok(mimeHeaderBlock_);
+    mimeHeaderBlock_.clear();
+
+    do {
+        SBuf line;
+        if (tok.prefix(line, nonCRLF))
+            mimeHeaderBlock_.append(line);
+
+        // the skipAll() might erase bare-CR not part of a CR*LF
+        SBuf savePoint(tok.remaining());
+        auto crLen = tok.skipAll(CharacterSet::CR);
+
+        if (tok.skipOne(CharacterSet::LF)) {
+            if (tok.skipAll(CharacterSet::WSP))
+                mimeHeaderBlock_.append(' ');
+            else
+                mimeHeaderBlock_.append(Http1::CrLf());
+
+        } else if (crLen) {
+            // TODO: it might be better to replace the CR with one SP, but for now preserve
+            // preserve the bare-CR
+            tok.reset(savePoint);
+            if (tok.prefix(line, CharacterSet::CR))
+                mimeHeaderBlock_.append(line);
+        }
+
+    } while (!tok.atEnd());
+}
+
 bool
 Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
 {
@@ -51,8 +146,8 @@
          *       So the rest of the code will need to deal with '0'-byte headers
          *       (ie, none, so don't try parsing em)
          */
-        // XXX: c_str() reallocates. performance regression.
-        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) {
+        bool containsObsFold = false;
+        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {
 
             // Squid could handle these headers, but admin does not want to
             if (firstLineSize() + mimeHeaderBytes >= limit) {
@@ -64,6 +159,36 @@
             }
 
             mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
+
+            /* RFC 7230 section 3:
+             * "A recipient that receives whitespace between the start-line and
+             * the first header field MUST ... consume each whitespace-preceded
+             * line without further processing of it."
+             *
+             * We need to always use the relaxed delimiters here to prevent
+             * line smuggling through strict parsers.
+             * Note that 'whitespace' in RFC 7230 includes CR. So that means
+             * sequences of CRLF will be pruned, but not sequences of bare-LF.
+             */
+            Http1::Tokenizer tok(mimeHeaderBlock_);
+            while (tok.skipOne(RelaxedDelimiterCharacters())) {
+                (void)tok.skipAll(LineCharacters()); // optional line content
+                // LF terminator is required.
+                // trust headersEnd() to ensure that we have at least one LF
+                (void)tok.skipOne(CharacterSet::LF);
+            }
+            // If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
+            // then we skipped everything, including that terminating LF.
+            // Restore the terminating CRLF if needed.
+            if (tok.atEnd())
+                mimeHeaderBlock_ = Http1::CrLf();
+            else
+                mimeHeaderBlock_ = tok.remaining();
+            // now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
+
+            if (containsObsFold)
+                unfoldMime();
+
             debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
 
         } else { // headersEnd() == 0
@@ -102,12 +227,10 @@
     debugs(25, 5, "looking for " << name);
 
     // while we can find more LF in the SBuf
-    static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF);
     Http1::Tokenizer tok(mimeHeaderBlock_);
     SBuf p;
-    static const SBuf crlf("\r\n");
 
-    while (tok.prefix(p, iso8859Line)) {
+    while (tok.prefix(p, LineCharacters())) {
         if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
             break; // error. reached invalid octet or end of buffer insted of an LF ??
 
@@ -120,7 +243,7 @@
             continue;
 
         // drop any trailing *CR sequence
-        p.trim(crlf, false, true);
+        p.trim(Http1::CrLf(), false, true);
 
         debugs(25, 5, "checking " << p);
         p.consume(namelen + 1);

=== modified file 'src/http/one/Parser.h'
--- src/http/one/Parser.h	2016-04-12 15:07:13 +0000
+++ src/http/one/Parser.h	2016-05-06 13:45:26 +0000
@@ -111,6 +111,10 @@
     /// consume from the tokenizer and return true only if found
     bool skipLineTerminator(Http1::Tokenizer &tok) const;
 
+    /// the characters which are to be considered valid whitespace
+    /// (WSP / BSP / OWS)
+    static const CharacterSet &DelimiterCharacters();
+
     /**
      * Scan to find the mime headers block for current message.
      *
@@ -139,6 +143,9 @@
 
     /// Whether the invalid HTTP as HTTP/0.9 hack expects a mime header block
     bool hackExpectsMime_;
+
+private:
+    void unfoldMime();
 };
 
 } // namespace One

=== modified file 'src/http/one/RequestParser.cc'
--- src/http/one/RequestParser.cc	2016-01-01 00:12:18 +0000
+++ src/http/one/RequestParser.cc	2016-05-06 13:45:26 +0000
@@ -114,30 +114,6 @@
     return UriChars;
 }
 
-/// characters HTTP permits tolerant parsers to accept as delimiters
-static const CharacterSet &
-RelaxedDelimiterCharacters()
-{
-    // RFC 7230 section 3.5
-    // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
-    // or bare CR as whitespace between request-line fields
-    static const CharacterSet RelaxedDels =
-        CharacterSet::SP +
-        CharacterSet::HTAB +
-        CharacterSet("VT,FF","\x0B\x0C") +
-        CharacterSet::CR;
-
-    return RelaxedDels;
-}
-
-/// characters used to separate HTTP fields
-const CharacterSet &
-Http::One::RequestParser::DelimiterCharacters()
-{
-    return Config.onoff.relaxed_header_parser ?
-           RelaxedDelimiterCharacters() : CharacterSet::SP;
-}
-
 /// characters which Squid will accept in the HTTP request-target (URI)
 const CharacterSet &
 Http::One::RequestParser::RequestTargetCharacters()

=== modified file 'src/http/one/RequestParser.h'
--- src/http/one/RequestParser.h	2016-01-01 00:12:18 +0000
+++ src/http/one/RequestParser.h	2016-05-06 13:45:26 +0000
@@ -56,7 +56,6 @@
     bool skipTrailingCrs(Http1::Tokenizer &tok);
 
     bool http0() const {return !msgProtocol_.major;}
-    static const CharacterSet &DelimiterCharacters();
     static const CharacterSet &RequestTargetCharacters();
 
     /// what request method has been found on the first line

=== modified file 'src/http/one/forward.h'
--- src/http/one/forward.h	2016-01-01 00:12:18 +0000
+++ src/http/one/forward.h	2016-05-06 13:45:26 +0000
@@ -10,6 +10,7 @@
 #define SQUID_SRC_HTTP_ONE_FORWARD_H
 
 #include "base/RefCount.h"
+#include "sbuf/forward.h"
 
 namespace Http {
 namespace One {
@@ -27,6 +28,9 @@
 class ResponseParser;
 typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
 
+/// CRLF textual representation
+const SBuf &CrLf();
+
 } // namespace One
 } // namespace Http
 

=== modified file 'src/mime_header.cc'
--- src/mime_header.cc	2016-01-01 00:12:18 +0000
+++ src/mime_header.cc	2016-05-08 10:00:42 +0000
@@ -13,7 +13,7 @@
 #include "profiler/Profiler.h"
 
 size_t
-headersEnd(const char *mime, size_t l)
+headersEnd(const char *mime, size_t l, bool &containsObsFold)
 {
     size_t e = 0;
     int state = 1;
@@ -35,7 +35,10 @@
                 state = 2;
             else if ('\n' == mime[e])
                 state = 3;
-            else
+            else if (' ' == mime[e] || '\t' == mime[e]) {
+                containsObsFold = true;
+                state = 0;
+            } else
                 state = 0;
 
             break;

=== modified file 'src/mime_header.h'
--- src/mime_header.h	2016-01-01 00:12:18 +0000
+++ src/mime_header.h	2016-05-10 12:20:08 +0000
@@ -11,7 +11,35 @@
 #ifndef SQUID_MIME_HEADER_H_
 #define SQUID_MIME_HEADER_H_
 
-size_t headersEnd(const char *, size_t);
+/**
+ * Scan for the end of mime header block.
+ *
+ * Which is one of the following octet patterns:
+ * - CRLF CRLF, or
+ * - CRLF LF, or
+ * - LF CRLF, or
+ * - LF LF
+ *
+ * Also detects whether a obf-fold pattern exists within the mime block
+ * - CR*LF (SP / HTAB)
+ *
+ * \param containsObsFold will be set to true if obs-fold pattern is found. Otherwise not changed.
+ */
+size_t headersEnd(const char *, size_t, bool &containsObsFold);
+
+inline size_t
+headersEnd(const SBuf &buf, bool &containsObsFold)
+{
+    return headersEnd(buf.rawContent(), buf.length(), containsObsFold);
+}
+
+/// \deprecated caller needs to be fixed to handle obs-fold
+inline size_t
+headersEnd(const char *buf, size_t sz)
+{
+    bool ignored;
+    return headersEnd(buf, sz, ignored);
+}
 
 #endif /* SQUID_MIME_HEADER_H_ */
 

=== modified file 'src/tests/stub_mime.cc'
--- src/tests/stub_mime.cc	2016-01-01 00:12:18 +0000
+++ src/tests/stub_mime.cc	2016-05-06 13:45:26 +0000
@@ -11,5 +11,5 @@
 #define STUB_API "mime.cc"
 #include "tests/STUB.h"
 
-size_t headersEnd(const char *mime, size_t l) STUB_RETVAL(0)
+size_t headersEnd(const char *, size_t, bool &) STUB_RETVAL(0)