4 * This module contains decoding routines for converting
5 * quoted-printable data into pure 8-bit data, in MIME
8 * By Henrik Storner <storner@image.dk>
10 * Configuration file support for fetchmail 4.3.8 by
11 * Frank Damgaard <frda@post3.tele.dk>
13 * For license terms, see the file COPYING in this directory.
21 #include "fetchmail.h"
/*
 * Convert a single ASCII hexadecimal digit to its numeric value.
 *
 * c: the character to convert; '0'-'9', 'A'-'F' and 'a'-'f' are accepted.
 *
 * Returns the digit's value (0..15), or 16 if c is not a hexadecimal
 * digit.  The caller in this file treats any result above 15 as
 * "invalid input".
 */
static unsigned char unhex(unsigned char c)
{
    if ((c >= '0') && (c <= '9'))
	return (unsigned char)(c - '0');
    else if ((c >= 'A') && (c <= 'F'))
	return (unsigned char)(c - 'A' + 10);
    else if ((c >= 'a') && (c <= 'f'))
	return (unsigned char)(c - 'a' + 10);
    else
	return 16;	/* invalid hex character */
}
36 static int qp_char(unsigned char c1, unsigned char c2, char *c_out)
41 if ((c1 > 15) || (c2 > 15))
51 * Routines to decode MIME QP-encoded headers, as per RFC 2047.
54 /* States of the decoding state machine */
55 #define S_COPY_PLAIN 0 /* Just copy, but watch for the QP flag */
56 #define S_SKIP_MIMEINIT 1 /* Get the encoding, and skip header */
57 #define S_COPY_MIME 2 /* Decode a sequence of coded characters */
59 static const char MIMEHDR_INIT[] = "=?"; /* Start of coded sequence */
60 static const char MIMEHDR_END[] = "?="; /* End of coded sequence */
62 void UnMimeHeader(char *hdr)
64 /* Decode a buffer containing data encoded according to RFC
65 * 2047. This only handles content-transfer-encoding; conversion
66 * between character sets is not implemented. In other words: We
67 * assume the charsets used can be displayed by your mail program
71 /* Note: Decoding is done "in-situ", i.e. without using an
72 * additional buffer for temp. storage. This is possible, since the
73 * decoded string will always be shorter than the encoded string,
74 * due to the encoding scheme.
77 int state = S_COPY_PLAIN;
78 char *p_in, *p_out, *p;
79 char enc = '\0'; /* initialization pacifies -Wall */
82 /* Speed up in case this is not a MIME-encoded header */
83 p = strstr(hdr, MIMEHDR_INIT);
85 return; /* No MIME header */
87 /* Loop through the buffer.
88 * p_in : Next char to be processed.
89 * p_out: Where to put the next processed char
90 * enc : Encoding used (usually, 'q' = quoted-printable)
92 for (p_out = p_in = hdr; (*p_in); ) {
95 p = strstr(p_in, MIMEHDR_INIT);
98 * No more coded data in buffer,
99 * just move remainder into place.
101 i = strlen(p_in); /* How much left */
102 memmove(p_out, p_in, i);
103 p_in += i; p_out += i;
106 /* MIME header init found at location p */
108 /* There are some uncoded chars at the beginning. */
110 memmove(p_out, p_in, i);
114 state = S_SKIP_MIMEINIT;
118 case S_SKIP_MIMEINIT:
119 /* Mime type definition: "charset?encoding?" */
120 p = strchr(p_in, '?');
122 /* p_in .. (p-1) holds the charset */
124 /* *(p+1) is the transfer encoding, *(p+2) must be a '?' */
126 enc = tolower((unsigned char)*(p+1));
131 state = S_COPY_PLAIN;
134 state = S_COPY_PLAIN; /* Invalid data */
138 p = strstr(p_in, MIMEHDR_END); /* Find end of coded data */
139 if (p == NULL) p = p_in + strlen(p_in);
140 for (; (p_in < p); ) {
141 /* Decode all encoded data */
144 /* Decode one char qp-coded at (p_in+1) and (p_in+2) */
145 if (qp_char(*(p_in+1), *(p_in+2), p_out) == 0)
148 /* Invalid QP data - pass through unchanged. */
153 else if (*p_in == '_') {
155 * RFC 2047: '_' inside encoded word represents 0x20.
156 * NOT a space - always the value 0x20.
168 else if (enc == 'b') {
169 /* Decode base64 encoded data */
173 delimsave = *p; *p = '\r';
174 decoded_count = from64tobits(p_out, p_in, 0);
176 if (decoded_count > 0)
177 p_out += decoded_count;
188 p_in += 2; /* Skip the MIMEHDR_END delimiter */
191 * We've completed decoding one encoded sequence. But another
192 * may follow immediately, in which case whitespace before the
193 * new MIMEHDR_INIT delimiter must be discarded.
194 * See if that is the case
196 p = strstr(p_in, MIMEHDR_INIT);
197 state = S_COPY_PLAIN;
200 * There is more MIME data later on. Is there
201 * whitespace only before the delimiter?
206 for (q=p_in; (wsp_only && (q < p)); q++)
207 wsp_only = isspace((unsigned char)*q);
211 * Whitespace-only before the MIME delimiter. OK,
212 * just advance p_in to past the new MIMEHDR_INIT,
213 * and prepare to process the new MIME charset/encoding
216 p_in = p + sizeof(MIMEHDR_INIT) - 1;
217 state = S_SKIP_MIMEINIT;
230 * Routines for decoding body-parts of a message.
232 * Since the "fetch" part of fetchmail gets a message body
233 * one line at a time, we need to maintain some state variables
234 * across multiple invocations of the UnMimeBodyline() routine.
235 * The driver routine should call MimeBodyType() when all
236 * headers have been received, and then UnMimeBodyline() for
237 * every line in the message body.
240 #define S_BODY_DATA 0
244 * Flag indicating if we are currently processing
245 * the headers or the body of a (multipart) message.
247 static int BodyState = S_BODY_DATA;
250 * Flag indicating if we are in the process of decoding
251 * a quoted-printable body part.
253 static int CurrEncodingIsQP = 0;
254 static int CurrTypeNeedsDecode = 0;
257 * Delimiter for multipart messages. RFC 2046 states that this must
258 * NEVER be longer than 70 characters. Add 3 for the two hyphens
259 * at the beginning, and a terminating null.
261 #define MAX_DELIM_LEN 70
262 static char MultipartDelimiter[MAX_DELIM_LEN+3];
265 /* This string replaces the "Content-Transfer-Encoding: quoted-printable"
266 * string in all headers, including those in body-parts. The replacement
267 * must be no longer than the original string.
269 static const char ENC8BIT[] = "Content-Transfer-Encoding: 8bit";
270 static void SetEncoding8bit(char *XferEncOfs)
274 if (XferEncOfs != NULL) {
275 memcpy(XferEncOfs, ENC8BIT, sizeof(ENC8BIT) - 1);
277 /* If anything left, in this header, replace with whitespace */
278 for (p=XferEncOfs+sizeof(ENC8BIT)-1; ((unsigned char)*p >= ' '); p++)
283 static char *GetBoundary(char *CntType)
288 /* Find the "boundary" delimiter. It must be preceded by a ';'
289 * and optionally some whitespace.
293 p2 = strchr(p1, ';');
295 for (p2++; isspace((unsigned char)*p2); p2++) { }
298 } while ((p1) && (strncasecmp(p1, "boundary", 8) != 0));
301 /* No boundary delimiter */
304 /* Skip "boundary", whitespace and '='; check that we do have a '=' */
305 for (p1+=8, flag=0; (isspace((unsigned char)*p1) || (*p1 == '=')); p1++)
306 flag |= (*p1 == '=');
310 /* Find end of boundary delimiter string */
312 /* The delimiter is inside quotes */
314 p2 = strchr(p1, '\"');
316 return NULL; /* No closing '"' !?! */
319 /* There might be more text after the "boundary" string. */
320 p2 = strchr(p1, ';'); /* Safe - delimiter with ';' must be in quotes */
323 /* Zero-terminate the boundary string */
327 return (p1 && strlen(p1)) ? p1 : NULL;
331 static int CheckContentType(char *CntType)
334 * Static array of Content-Type's for which we will do
335 * quoted-printable decoding, if requested.
336 * It is probably wise to do this only on known text-only types;
337 * be really careful if you change this.
340 static const char *DecodedTypes[] = {
341 "text/", /* Will match ALL content-type's starting with 'text/' */
349 /* If no Content-Type header, it isn't MIME - don't touch it */
350 if (CntType == NULL) return 0;
352 /* Skip whitespace, if any */
353 for (; isspace((unsigned char)*p); p++) ;
357 (strncasecmp(p, DecodedTypes[i], strlen(DecodedTypes[i]))));
360 return (DecodedTypes[i] != NULL);
365 * This routine does three things:
366 * 1) It determines - based on the message headers - whether the
367 * message body is a MIME message that may hold 8 bit data.
368 * - A message that has a "quoted-printable" or "8bit" transfer
369 * encoding is assumed to contain 8-bit data (when decoded).
370 * - A multipart message is assumed to contain 8-bit data
371 * when decoded (there might be quoted-printable body-parts).
372 * - All other messages are assumed NOT to include 8-bit data.
373 * 2) It determines the delimiter-string used in multi-part message
375 * 3) It sets the initial values of the CurrEncodingIsQP,
376 * CurrTypeNeedsDecode, and BodyState variables, from the header
379 * The return value is a bitmask.
381 int MimeBodyType(char *hdrs, int WantDecode)
384 char *XferEnc, *XferEncOfs, *CntType, *MimeVer, *p;
385 int HdrsFound = 0; /* We only look for three headers */
386 int BodyType; /* Return value */
388 /* Setup for a standard (no MIME, no QP, 7-bit US-ASCII) message */
389 MultipartDelimiter[0] = '\0';
390 CurrEncodingIsQP = CurrTypeNeedsDecode = 0;
391 BodyState = S_BODY_DATA;
394 /* Just in case ... */
398 XferEnc = XferEncOfs = CntType = MimeVer = NULL;
401 if (strncasecmp("Content-Transfer-Encoding:", NxtHdr, 26) == 0) {
406 XferEnc = xstrdup(p);
410 else if (strncasecmp("Content-Type:", NxtHdr, 13) == 0) {
412 * This one is difficult. We cannot use the standard
413 * nxtaddr() routine, since the boundary-delimiter is
414 * (probably) enclosed in quotes - and thus appears
415 * as an rfc822 comment, and nxtaddr() "eats" up any
416 * spaces in the delimiter. So, we have to do this
420 /* Skip the "Content-Type:" part and whitespace after it */
421 for (NxtHdr += 13; ((*NxtHdr == ' ') || (*NxtHdr == '\t')); NxtHdr++) { }
424 * Get the full value of the Content-Type header;
425 * it might span multiple lines. So search for
426 * a newline char, but ignore those that
427 * have a TAB or space just after the NL (continued
432 p=strchr((p+1),'\n');
433 } while ( (p != NULL) && ((*(p+1) == '\t') || (*(p+1) == ' ')) );
434 if (p == NULL) p = NxtHdr + strlen(NxtHdr);
437 CntType = (char *)xmalloc(p-NxtHdr+1);
438 strlcpy(CntType, NxtHdr, p-NxtHdr+1);
441 else if (strncasecmp("MIME-Version:", NxtHdr, 13) == 0) {
445 MimeVer = xstrdup(p);
450 NxtHdr = (strchr(NxtHdr, '\n'));
451 if (NxtHdr != NULL) NxtHdr++;
452 } while ((NxtHdr != NULL) && (*NxtHdr) && (HdrsFound != 3));
455 /* Done looking through the headers, now check what they say */
456 if ((MimeVer != NULL) && (strcmp(MimeVer, "1.0") == 0)) {
458 CurrTypeNeedsDecode = CheckContentType(CntType);
460 /* Check Content-Type to see if this is a multipart message */
461 if ( (CntType != NULL) &&
462 ((strncasecmp(CntType, "multipart/mixed", 15) == 0) ||
463 (strncasecmp(CntType, "message/", 8) == 0)) ) {
465 char *p1 = GetBoundary(CntType);
468 /* The actual delimiter is "--" followed by
469 the boundary string */
470 strcpy(MultipartDelimiter, "--");
471 strlcat(MultipartDelimiter, p1, sizeof(MultipartDelimiter));
472 BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
477 * Check Content-Transfer-Encoding, but
478 * ONLY for non-multipart messages (BodyType == 0).
480 if ((XferEnc != NULL) && (BodyType == 0)) {
481 if (strcasecmp(XferEnc, "quoted-printable") == 0) {
482 CurrEncodingIsQP = 1;
483 BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
484 if (WantDecode && CurrTypeNeedsDecode) {
485 SetEncoding8bit(XferEncOfs);
488 else if (strcasecmp(XferEnc, "7bit") == 0) {
489 CurrEncodingIsQP = 0;
490 BodyType = (MSG_IS_7BIT);
492 else if (strcasecmp(XferEnc, "8bit") == 0) {
493 CurrEncodingIsQP = 0;
494 BodyType = (MSG_IS_8BIT);
509 * Decode one line of data containing QP data.
510 * Return flag set if this line ends with a soft line-break.
511 * 'bufp' is modified to point to the end of the output buffer.
513 static int DoOneQPLine(char **bufp, flag delimited, flag issoftline)
516 char *p_in, *p_out, *p;
521 * Special case: line consists of a single =2E and messages are
522 * dot-terminated. Line has to be dot-stuffed after decoding.
524 if (delimited && !issoftline && buf[0]=='=' && !strncmp(*bufp, "=2E\r\n", 5))
526 strcpy(buf, "..\r\n");
532 if (delimited && issoftline && (strncmp(buf, "..", 2) == 0))
535 for (p_out = buf; (*p_in); ) {
536 p = strchr(p_in, '=');
538 /* No more QP data, just move remainder into place */
540 memmove(p_out, p_in, n);
541 p_in += n; p_out += n;
545 /* There are some uncoded chars at the beginning. */
547 memmove(p_out, p_in, n);
552 case '\0': case '\r': case '\n':
553 /* Soft line break, skip '=' */
555 if (*p_in == '\r') p_in++;
556 if (*p_in == '\n') p_in++;
561 /* There is a QP encoded byte */
562 if (qp_char(*(p+1), *(p+2), p_out) == 0) {
566 /* Invalid QP data - pass through unchanged. */
582 /* This is called once per line in the message body. We need to scan
583 * all lines in the message body for the multipart delimiter string,
584 * and handle any body-part headers in such messages (these can toggle
585 * qp-decoding on and off).
587 * Note: Messages that are NOT multipart-messages go through this
588 * routine quickly, since BodyState will always be S_BODY_DATA,
589 * and MultipartDelimiter is an empty string.
591 * Return flag set if this line ends with a soft line-break.
592 * 'bufp' is modified to point to the end of the output buffer.
595 int UnMimeBodyline(char **bufp, flag delimited, flag softline)
602 UnMimeHeader(buf); /* Headers in body-parts can be encoded, too! */
603 if ((*buf == '\0') || (*buf == '\n') || (strcmp(buf, "\r\n") == 0)) {
604 BodyState = S_BODY_DATA;
606 else if (strncasecmp("Content-Transfer-Encoding:", buf, 26) == 0) {
609 XferEnc = nxtaddr(buf);
610 if ((XferEnc != NULL) && (strcasecmp(XferEnc, "quoted-printable") == 0)) {
611 CurrEncodingIsQP = 1;
614 * Hmm ... we cannot be really sure that CurrTypeNeedsDecode
615 * has been set - we may not have seen the Content-Type header
616 * yet. But *usually* the Content-Type header comes first, so
617 * this will work. And there is really no way of doing it
618 * "right" as long as we stick with the line-by-line processing.
620 if (CurrTypeNeedsDecode)
621 SetEncoding8bit(buf);
624 else if (strncasecmp("Content-Type:", buf, 13) == 0) {
625 CurrTypeNeedsDecode = CheckContentType(nxtaddr(buf));
628 *bufp = (buf + strlen(buf));
632 if ((*MultipartDelimiter) &&
633 (strncmp(buf, MultipartDelimiter, strlen(MultipartDelimiter)) == 0)) {
634 BodyState = S_BODY_HDR;
635 CurrEncodingIsQP = CurrTypeNeedsDecode = 0;
638 if (CurrEncodingIsQP && CurrTypeNeedsDecode)
639 ret = DoOneQPLine(bufp, delimited, softline);
641 *bufp = (buf + strlen(buf));
653 const char *program_name = "unmime";
656 #define BUFSIZE_INCREMENT 4096
659 #define DBG_FWRITE(B,L,BS,FD) do { if (fwrite((B), (L), (BS), (FD))) { } } while(0)
661 #define DBG_FWRITE(B,L,BS,FD)
664 int main(int argc, char *argv[])
666 unsigned int BufSize;
667 char *buffer, *buf_p;
668 int nl_count, i, bodytype;
670 /* quench warnings about unused arguments */
676 FILE *fd_orig, *fd_conv;
679 /* we don't need snprintf here, but for consistency, we'll use it */
681 snprintf(fnam, sizeof(fnam), "/tmp/i_unmime.%lx", (long)pid);
682 fd_orig = fopen(fnam, "w");
683 snprintf(fnam, sizeof(fnam), "/tmp/o_unmime.%lx", (long)pid);
684 fd_conv = fopen(fnam, "w");
687 BufSize = BUFSIZE_INCREMENT; /* Initial size of buffer */
688 buf_p = buffer = (char *) xmalloc(BufSize);
692 i = fread(buf_p, 1, 1, stdin);
707 if ((unsigned)(buf_p - buffer) == BufSize) {
708 /* Buffer is full! Get more room. */
709 buffer = (char *)xrealloc(buffer, BufSize+BUFSIZE_INCREMENT);
710 buf_p = buffer + BufSize;
711 BufSize += BUFSIZE_INCREMENT;
713 } while ((i > 0) && (nl_count < 2));
716 DBG_FWRITE(buffer, strlen(buffer), 1, fd_orig);
718 UnMimeHeader(buffer);
719 bodytype = MimeBodyType(buffer, 1);
722 DBG_FWRITE(buffer, i, 1, fd_conv);
723 if (fwrite(buffer, i, 1, stdout) < 1) {
729 buf_p = (buffer - 1);
732 i = fread(buf_p, 1, 1, stdin);
733 } while ((i == 1) && (*buf_p != '\n'));
736 DBG_FWRITE(buf, (buf_p - buffer), 1, fd_orig);
738 if (buf_p > buffer) {
739 if (bodytype & MSG_NEEDS_DECODE) {
741 UnMimeBodyline(&buf_p, 0, 0);
743 DBG_FWRITE(buffer, (buf_p - buffer), 1, fd_conv);
744 if (fwrite(buffer, (buf_p - buffer), 1, stdout) < 1) {
749 } while (buf_p > buffer);
753 if (EOF == fflush(stdout)) perror("fflush");