Pileus Git - ~andy/fetchmail/blob - unmime.c

   1 /*
   2  * MIME mail decoding.
   3  *
   4  * This module contains decoding routines for converting
   5  * quoted-printable data into pure 8-bit data, in MIME
   6  * formatted messages.
   7  *
   8  * By Henrik Storner <storner@image.dk>
   9  *
  10  * Configuration file support for fetchmail 4.3.8 by
  11  * Frank Damgaard <frda@post3.tele.dk>
  12  *
  13  */
  14
  15 #include "config.h"
  16 #include <string.h>
  17 #include <stdlib.h>
  18 #include <ctype.h>
  19 #include "fetchmail.h"
  20
  21 static unsigned char unhex(unsigned char c)
  22 {
  23   if ((c >= '0') && (c <= '9'))
  24     return (c - '0');
  25   else if ((c >= 'A') && (c <= 'F'))
  26     return (c - 'A' + 10);
  27   else if ((c >= 'a') && (c <= 'f'))
  28     return (c - 'a' + 10);
  29   else
  30     return c;
  31 }
  32
  33 static int qp_char(unsigned char c1, unsigned char c2, unsigned char *c_out)
  34 {
  35   c1 = unhex(c1);
  36   c2 = unhex(c2);
  37
  38   if ((c1 > 15) || (c2 > 15))
  39     return 1;
  40   else {
  41     *c_out = 16*c1+c2;
  42     return 0;
  43   }
  44 }
  45
  46
  47
  48 /*
  49  * Routines to decode MIME QP-encoded headers, as per RFC 2047.
  50  */
  51
  52 /* States of the decoding state machine */
  53 #define S_COPY_PLAIN        0   /* Just copy, but watch for the QP flag */
  54 #define S_SKIP_MIMEINIT     1   /* Get the encoding, and skip header */
  55 #define S_COPY_MIME         2   /* Decode a sequence of coded characters */
  56
  57 static const char MIMEHDR_INIT[]  = "=?";       /* Start of coded sequence */
  58 static const char MIMEHDR_END[]   = "?=";       /* End of coded sequence */
  59
  60 void UnMimeHeader(unsigned char *hdr)
  61 {
  62   /* Decode a buffer containing data encoded according to RFC
  63    * 2047. This only handles content-transfer-encoding; conversion
  64    * between character sets is not implemented.  In other words: We
  65    * assume the charsets used can be displayed by your mail program
  66    * without problems.
  67    */
  68
  69   /* Note: Decoding is done "in-situ", i.e. without using an
  70    * additional buffer for temp. storage. This is possible, since the
  71    * decoded string will always be shorter than the encoded string,
  72    * due to the en- coding scheme.
  73    */
  74
  75   int  state = S_COPY_PLAIN;
  76   unsigned char *p_in, *p_out, *p;
  77   unsigned char enc;
  78   int  i;
  79
  80   /* Speed up in case this is not a MIME-encoded header */
  81   p = strstr(hdr, MIMEHDR_INIT);
  82   if (p == NULL)
  83     return;   /* No MIME header */
  84
  85   /* Loop through the buffer.
  86    *  p_in : Next char to be processed.
  87    *  p_out: Where to put the next processed char
  88    *  enc  : Encoding used (usually, 'q' = quoted-printable)
  89    */
  90   for (p_out = p_in = hdr; (*p_in); ) {
  91     switch (state) {
  92     case S_COPY_PLAIN:
  93       p = strstr(p_in, MIMEHDR_INIT);
  94       if (p == NULL) {
  95         /*
  96          * No more coded data in buffer,
  97          * just move remainder into place.
  98          */
  99         i = strlen(p_in);   /* How much left */
 100         memmove(p_out, p_in, i);
 101         p_in += i; p_out += i;
 102       }
 103       else {
 104         /* MIME header init found at location p */
 105         if (p > p_in) {
 106           /* There are some uncoded chars at the beginning. */
 107           i = (p - p_in);
 108           memmove(p_out, p_in, i);
 109           p_out += i;
 110         }
 111         p_in = (p + 2);
 112         state = S_SKIP_MIMEINIT;
 113       }
 114       break;
 115
 116     case S_SKIP_MIMEINIT:
 117       /* Mime type definition: "charset?encoding?" */
 118       p = strchr(p_in, '?');
 119       if (p != NULL) {
 120         /* p_in .. (p-1) holds the charset */
 121
 122         /* *(p+1) is the transfer encoding, *(p+2) must be a '?' */
 123         if (*(p+2) == '?') {
 124           enc = tolower(*(p+1));
 125           p_in = p+3;
 126           state = S_COPY_MIME;
 127         }
 128         else
 129           state = S_COPY_PLAIN;
 130       }
 131       else
 132         state = S_COPY_PLAIN;   /* Invalid data */
 133       break;
 134
 135     case S_COPY_MIME:
 136       p = strstr(p_in, MIMEHDR_END);  /* Find end of coded data */
 137       if (p == NULL) p = p_in + strlen(p_in);
 138       for (; (p_in < p); ) {
 139         /* Decode all encoded data */
 140         if (enc == 'q') {
 141           if (*p_in == '=') {
 142             /* Decode one char qp-coded at (p_in+1) and (p_in+2) */
 143             if (qp_char(*(p_in+1), *(p_in+2), p_out) == 0)
 144               p_in += 3;
 145             else {
 146               /* Invalid QP data - pass through unchanged. */
 147               *p_out = *p_in;
 148               p_in++;
 149             }
 150           }
 151           else if (*p_in == '_') {
 152             /*
 153              * RFC 2047: '_' inside encoded word represents 0x20.
 154              * NOT a space - always the value 0x20.
 155              */
 156             *p_out = 0x20;
 157             p_in++;
 158           }
 159           else {
 160             /* Copy unchanged */
 161             *p_out = *p_in;
 162             p_in++;
 163           }
 164           p_out++;
 165         }
 166         else if (enc == 'b') {
 167           /* Decode base64 encoded data */
 168           char delimsave;
 169           int decoded_count;
 170
 171           delimsave = *p; *p = '\r';
 172           decoded_count = from64tobits(p_out, p_in);
 173           *p = delimsave;
 174           if (decoded_count > 0)
 175             p_out += decoded_count;
 176           p_in = p;
 177         }
 178         else {
 179           /* Copy unchanged */
 180           *p_out = *p_in;
 181           p_in++;
 182           p_out++;
 183         }
 184       }
 185       if (*p_in)
 186         p_in += 2;   /* Skip the MIMEHDR_END delimiter */
 187
 188       /*
 189        * We've completed decoding one encoded sequence. But another
 190        * may follow immediately, in which case whitespace before the
 191        * new MIMEHDR_INIT delimiter must be discarded.
 192        * See if that is the case
 193        */
 194       p = strstr(p_in, MIMEHDR_INIT);
 195       state = S_COPY_PLAIN;
 196       if (p != NULL) {
 197         /*
 198          * There is more MIME data later on. Is there
 199          * whitespace  only before the delimiter?
 200          */
 201         unsigned char *q;
 202         int  wsp_only = 1;
 203
 204         for (q=p_in; (wsp_only && (q < p)); q++)
 205           wsp_only = isspace(*q);
 206
 207         if (wsp_only) {
 208           /*
 209            * Whitespace-only before the MIME delimiter. OK,
 210            * just advance p_in to past the new MIMEHDR_INIT,
 211            * and prepare to process the new MIME charset/encoding
 212            * header.
 213            */
 214           p_in = p + strlen(MIMEHDR_INIT);
 215           state = S_SKIP_MIMEINIT;
 216         }
 217       }
 218       break;
 219     }
 220   }
 221
 222   *p_out = '\0';
 223 }
 224
 225
 226
/*
 * Routines for decoding body-parts of a message.
 *
 * Since the "fetch" part of fetchmail gets a message body
 * one line at a time, we need to maintain some state variables
 * across multiple invocations of the UnMimeBodyline() routine.
 * The driver routine should call MimeBodyType() when all
 * headers have been received, and then UnMimeBodyline() for
 * every line in the message body.
 */
 238 #define S_BODY_DATA 0
 239 #define S_BODY_HDR  1
 240
 241 /*
 242  * Flag indicating if we are currently processing
 243  * the headers or the body of a (multipart) message.
 244  */
 245 static int  BodyState = S_BODY_DATA;
 246
 247 /*
 248  * Flag indicating if we are in the process of decoding
 249  * a quoted-printable body part.
 250  */
 251 static int  CurrEncodingIsQP = 0;
 252
 253 /*
 254  * Delimiter for multipart messages. RFC 2046 states that this must
 255  * NEVER be longer than 70 characters. Add 3 for the two hyphens
 256  * at the beginning, and a terminating null.
 257  */
 258 #define MAX_DELIM_LEN 70
 259 static unsigned char MultipartDelimiter[MAX_DELIM_LEN+3];
 260
 261
 262 /* This string replaces the "Content-Transfer-Encoding: quoted-printable"
 263  * string in all headers, including those in body-parts. The replacement
 264  * must be no longer than the original string.
 265  */
 266 static const char ENC8BIT[] = "Content-Transfer-Encoding: 8bit";
 267 static void SetEncoding8bit(unsigned char *XferEncOfs)
 268 {
 269   unsigned char *p;
 270
 271   if (XferEncOfs != NULL) {
 272      memcpy(XferEncOfs, ENC8BIT, strlen(ENC8BIT));
 273
 274      /* If anything left, in this header, replace with whitespace */
 275      for (p=XferEncOfs+strlen(ENC8BIT); (*p >= ' '); p++) *p=' ';
 276   }
 277 }
 278
 279 static char *GetBoundary(char *CntType)
 280 {
 281   char *p1, *p2;
 282   int flag;
 283
 284   /* Find the "boundary" delimiter. It must be preceded with a ';'
 285    * and optionally some whitespace.
 286    */
 287   p1 = CntType;
 288   do {
 289     p2 = strchr(p1, ';');
 290     if (p2)
 291       for (p2++; isspace(*p2); p2++);
 292
 293     p1 = p2;
 294   } while ((p1) && (strncasecmp(p1, "boundary", 8) != 0));
 295
 296   if (p1 == NULL)
 297     /* No boundary delimiter */
 298     return NULL;
 299
 300   /* Skip "boundary", whitespace and '='; check that we do have a '=' */
 301   for (p1+=8, flag=0; (isspace(*p1) || (*p1 == '=')); p1++)
 302     flag |= (*p1 == '=');
 303   if (!flag)
 304     return NULL;
 305
 306   /* Find end of boundary delimiter string */
 307   if (*p1 == '\"') {
 308     /* The delimiter is inside quotes */
 309     p1++;
 310     p2 = strchr(p1, '\"');
 311     if (p2 == NULL)
 312       return NULL;  /* No closing '"' !?! */
 313   }
 314   else {
 315     /* There might be more text after the "boundary" string. */
 316     p2 = strchr(p1, ';');  /* Safe - delimiter with ';' must be in quotes */
 317   }
 318
 319   /* Zero-terminate the boundary string */
 320   if (p2 != NULL)
 321     *p2 = '\0';
 322
 323   return (p1 && strlen(p1)) ? p1 : NULL;
 324 }
 325
 326
 327 /*
 328  * This routine does three things:
 329  * 1) It determines - based on the message headers - whether the
 330  *    message body is a MIME message that may hold 8 bit data.
 331  *    - A message that has a "quoted-printable" or "8bit" transfer
 332  *      encoding is assumed to contain 8-bit data (when decoded).
 333  *    - A multipart message is assumed to contain 8-bit data
 334  *      when decoded (there might be quoted-printable body-parts).
 335  *    - All other messages are assumed NOT to include 8-bit data.
 336  * 2) It determines the delimiter-string used in multi-part message
 337  *    bodies.
 338  * 3) It sets the initial values of the CurrEncodingIsQP and BodyState
 339  *    variables, from the header contents.
 340  *
 341  * The return value is a bitmask.
 342  */
 343 int MimeBodyType(unsigned char *hdrs, int WantDecode)
 344 {
 345   unsigned char *NxtHdr = hdrs;
 346   unsigned char *XferEnc, *XferEncOfs, *CntType, *MimeVer, *p;
 347   int  HdrsFound = 0;     /* We only look for three headers */
 348   int  BodyType;          /* Return value */
 349
 350   /* Setup for a standard (no MIME, no QP, 7-bit US-ASCII) message */
 351   MultipartDelimiter[0] = '\0';
 352   CurrEncodingIsQP = 0;
 353   BodyState = S_BODY_DATA;
 354   BodyType = 0;
 355
 356   /* Just in case ... */
 357   if (hdrs == NULL)
 358     return BodyType;
 359
 360   XferEnc = XferEncOfs = CntType = MimeVer = NULL;
 361
 362   do {
 363     if (strncasecmp("Content-Transfer-Encoding:", NxtHdr, 26) == 0) {
 364       XferEncOfs = NxtHdr;
 365       p = nxtaddr(NxtHdr);
 366       if (p != NULL) {
 367         xalloca(XferEnc, char *, strlen(p) + 1);
 368         strcpy(XferEnc, p);
 369         HdrsFound++;
 370       }
 371     }
 372     else if (strncasecmp("Content-Type:", NxtHdr, 13) == 0) {
 373       /*
 374        * This one is difficult. We cannot use the standard
 375        * nxtaddr() routine, since the boundary-delimiter is
 376        * (probably) enclosed in quotes - and thus appears
 377        * as an rfc822 comment, and nxtaddr() "eats" up any
 378        * spaces in the delimiter. So, we have to do this
 379        * by hand.
 380        */
 381
 382       /* Skip the "Content-Type:" part and whitespace after it */
 383       for (NxtHdr += 13; ((*NxtHdr == ' ') || (*NxtHdr == '\t')); NxtHdr++);
 384
 385       /*
 386        * Get the full value of the Content-Type header;
 387        * it might span multiple lines. So search for
 388        * a newline char, but ignore those that have a
 389        * have a TAB or space just after the NL (continued
 390        * lines).
 391        */
 392       p = NxtHdr-1;
 393       do {
 394         p=strchr((p+1),'\n');
 395       } while ( (p != NULL) && ((*(p+1) == '\t') || (*(p+1) == ' ')) );
 396       if (p == NULL) p = NxtHdr + strlen(NxtHdr);
 397
 398       xalloca(CntType, char *, p-NxtHdr+2);
 399       strncpy(CntType, NxtHdr, (p-NxtHdr));
 400       *(CntType+(p-NxtHdr)) = '\0';
 401       HdrsFound++;
 402     }
 403     else if (strncasecmp("MIME-Version:", NxtHdr, 13) == 0) {
 404       p = nxtaddr(NxtHdr);
 405       if (p != NULL) {
 406         xalloca(MimeVer, char *, strlen(p) + 1);
 407         strcpy(MimeVer, p);
 408         HdrsFound++;
 409       }
 410     }
 411
 412     NxtHdr = (strchr(NxtHdr, '\n'));
 413     if (NxtHdr != NULL) NxtHdr++;
 414   } while ((NxtHdr != NULL) && (*NxtHdr) && (HdrsFound != 3));
 415
 416
 417   /* Done looking through the headers, now check what they say */
 418   if ((MimeVer != NULL) && (strcmp(MimeVer, "1.0") == 0)) {
 419
 420     /* Check Content-Type to see if this is a multipart message */
 421     if ( (CntType != NULL) &&
 422          ((strncasecmp(CntType, "multipart/", 10) == 0) ||
 423           (strncasecmp(CntType, "message/", 8) == 0)) ) {
 424
 425       char *p1 = GetBoundary(CntType);
 426
 427       if (p1 != NULL) {
 428         /* The actual delimiter is "--" followed by
 429            the boundary string */
 430         strcpy(MultipartDelimiter, "--");
 431         strncat(MultipartDelimiter, p1, MAX_DELIM_LEN);
 432         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 433       }
 434     }
 435
 436     /*
 437      * Check Content-Transfer-Encoding, but
 438      * ONLY for non-multipart messages (BodyType == 0).
 439      */
 440     if ((XferEnc != NULL) && (BodyType == 0)) {
 441       if (strcasecmp(XferEnc, "quoted-printable") == 0) {
 442         CurrEncodingIsQP = 1;
 443         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 444         if (WantDecode) {
 445            SetEncoding8bit(XferEncOfs);
 446         }
 447       }
 448       else if (strcasecmp(XferEnc, "7bit") == 0) {
 449         CurrEncodingIsQP = 0;
 450         BodyType = (MSG_IS_7BIT);
 451       }
 452       else if (strcasecmp(XferEnc, "8bit") == 0) {
 453         CurrEncodingIsQP = 0;
 454         BodyType = (MSG_IS_8BIT);
 455       }
 456     }
 457
 458   }
 459
 460   return BodyType;
 461 }
 462
 463
 464 /*
 465  * Decode one line of data containing QP data.
 466  * Return flag set if this line ends with a soft line-break.
 467  * 'bufp' is modified to point to the end of the output buffer.
 468  */
 469 static int DoOneQPLine(unsigned char **bufp, int collapsedoubledot)
 470 {
 471   unsigned char *buf = *bufp;
 472   unsigned char *p_in, *p_out, *p;
 473   int n;
 474   int ret = 0;
 475
 476   p_in = buf;
 477   if (collapsedoubledot && (strncmp(buf, "..", 2) == 0))
 478     p_in++;
 479
 480   for (p_out = buf; (*p_in); ) {
 481     p = strchr(p_in, '=');
 482     if (p == NULL) {
 483       /* No more QP data, just move remainder into place */
 484       n = strlen(p_in);
 485       memmove(p_out, p_in, n);
 486       p_in += n; p_out += n;
 487     }
 488     else {
 489       if (p > p_in) {
 490         /* There are some uncoded chars at the beginning. */
 491         n = (p - p_in);
 492         memmove(p_out, p_in, n);
 493         p_out += n;
 494       }
 495
 496       switch (*(p+1)) {
 497       case '\0': case '\r': case '\n':
 498         /* Soft line break, skip '=' */
 499         p_in = p+1;
 500         if (*p_in == '\r') p_in++;
 501         if (*p_in == '\n') p_in++;
 502         ret = 1;
 503         break;
 504
 505       default:
 506         /* There is a QP encoded byte */
 507         if (qp_char(*(p+1), *(p+2), p_out) == 0) {
 508           p_in = p+3;
 509         }
 510         else {
 511           /* Invalid QP data - pass through unchanged. */
 512           *p_out = '=';
 513           p_in = p+1;
 514         }
 515         p_out++;
 516         break;
 517       }
 518     }
 519   }
 520
 521   *p_out = '\0';
 522   *bufp = p_out;
 523   return ret;
 524 }
 525
 526
 527 /* This is called once per line in the message body.  We need to scan
 528  * all lines in the message body for the multipart delimiter string,
 529  * and handle any body-part headers in such messages (these can toggle
 530  * qp-decoding on and off).
 531  *
 532  * Note: Messages that are NOT multipart-messages go through this
 533  * routine quickly, since BodyState will always be S_BODY_DATA,
 534  * and MultipartDelimiter is NULL.
 535  *
 536  * Return flag set if this line ends with a soft line-break.
 537  * 'bufp' is modified to point to the end of the output buffer.
 538  */
 539
 540 int UnMimeBodyline(unsigned char **bufp, int collapsedoubledot)
 541 {
 542   unsigned char *buf = *bufp;
 543   int ret = 0;
 544
 545   switch (BodyState) {
 546   case S_BODY_HDR:
 547     UnMimeHeader(buf);   /* Headers in body-parts can be encoded, too! */
 548     if (strncasecmp("Content-Transfer-Encoding:", buf, 26) == 0) {
 549       char *XferEnc;
 550
 551       XferEnc = nxtaddr(buf);
 552       if ((XferEnc != NULL) && (strcasecmp(XferEnc, "quoted-printable") == 0)) {
 553         CurrEncodingIsQP = 1;
 554         SetEncoding8bit(buf);
 555       }
 556     }
 557     else if ((*buf == '\0') || (*buf == '\n') || (strcmp(buf, "\r\n") == 0))
 558       BodyState = S_BODY_DATA;
 559
 560     *bufp = (buf + strlen(buf));
 561     break;
 562
 563   case S_BODY_DATA:
 564     if ((*MultipartDelimiter) &&
 565         (strncmp(buf, MultipartDelimiter, strlen(MultipartDelimiter)) == 0)) {
 566       BodyState = S_BODY_HDR;
 567       CurrEncodingIsQP = 0;
 568     }
 569
 570     if (CurrEncodingIsQP)
 571       ret = DoOneQPLine(bufp, collapsedoubledot);
 572     else
 573      *bufp = (buf + strlen(buf));
 574     break;
 575   }
 576
 577   return ret;
 578 }
 579
 580
 581 #ifdef STANDALONE
 582 #include <stdio.h>
 583 #include <unistd.h>
 584
 585 char *program_name = "unmime";
 586
 587 #define BUFSIZE_INCREMENT 4096
 588
 589 #ifdef DEBUG
 590 #define DBG_FWRITE(B,L,BS,FD) fwrite(B, L, BS, FD)
 591 #else
 592 #define DBG_FWRITE(B,L,BS,FD)
 593 #endif
 594
 595 int main(int argc, char *argv[])
 596 {
 597   unsigned int BufSize;
 598   unsigned char *buffer, *buf_p;
 599   int nl_count, i, bodytype;
 600
 601 #ifdef DEBUG
 602   pid_t pid;
 603   FILE *fd_orig, *fd_conv;
 604   char fnam[100];
 605
 606   pid = getpid();
 607   sprintf(fnam, "/tmp/i_unmime.%x", pid);
 608   fd_orig = fopen(fnam, "w");
 609   sprintf(fnam, "/tmp/o_unmime.%x", pid);
 610   fd_conv = fopen(fnam, "w");
 611 #endif
 612
 613   BufSize = BUFSIZE_INCREMENT;    /* Initial size of buffer */
 614   buf_p = buffer = (unsigned char *) xmalloc(BufSize);
 615   nl_count = 0;
 616
 617   do {
 618     i = fread(buf_p, 1, 1, stdin);
 619     switch (*buf_p) {
 620      case '\n':
 621        nl_count++;
 622        break;
 623
 624      case '\r':
 625        break;
 626
 627      default:
 628        nl_count = 0;
 629        break;
 630     }
 631
 632     buf_p++;
 633     if ((buf_p - buffer) == BufSize) {
 634        /* Buffer is full! Get more room. */
 635        buffer = xrealloc(buffer, BufSize+BUFSIZE_INCREMENT);
 636        buf_p = buffer + BufSize;
 637        BufSize += BUFSIZE_INCREMENT;
 638     }
 639   } while ((i > 0) && (nl_count < 2));
 640
 641   *buf_p = '\0';
 642   DBG_FWRITE(buffer, strlen(buffer), 1, fd_orig);
 643
 644   UnMimeHeader(buffer);
 645   bodytype = MimeBodyType(buffer, 1);
 646
 647   i = strlen(buffer);
 648   fwrite(buffer, i, 1, stdout);
 649   DBG_FWRITE(buffer, i, 1, fd_conv);
 650
 651   do {
 652      buf_p = (buffer - 1);
 653      do {
 654         buf_p++;
 655         i = fread(buf_p, 1, 1, stdin);
 656      } while ((i == 1) && (*buf_p != '\n'));
 657      if (i == 1) buf_p++;
 658      *buf_p = '\0';
 659      DBG_FWRITE(buf, (buf_p - buffer), 1, fd_orig);
 660
 661      if (buf_p > buffer) {
 662         if (bodytype & MSG_NEEDS_DECODE) {
 663            buf_p = buffer;
 664            UnMimeBodyline(&buf_p, 0);
 665         }
 666         fwrite(buffer, (buf_p - buffer), 1, stdout);
 667         DBG_FWRITE(buffer, (buf_p - buffer), 1, fd_conv);
 668      }
 669   } while (buf_p > buffer);
 670
 671   free(buffer);
 672   fflush(stdout);
 673
 674 #ifdef DEBUG
 675   fclose(fd_orig);
 676   fclose(fd_conv);
 677 #endif
 678
 679   return 0;
 680 }
 681 #endif
 682