Pileus Git - ~andy/fetchmail/blob - unmime.c

   1 /*
   2  * MIME mail decoding.
   3  *
   4  * This module contains decoding routines for converting
   5  * quoted-printable data into pure 8-bit data, in MIME
   6  * formatted messages.
   7  *
   8  * By Henrik Storner <storner@image.dk>
   9  *
  10  * Configuration file support for fetchmail 4.3.8 by
  11  * Frank Damgaard <frda@post3.tele.dk>
  12  *
  13  */
  14
  15 #include <string.h>
  16 #include <stdlib.h>
  17 #include <ctype.h>
  18 #include "fetchmail.h"
  19
  20 static inline unsigned char unhex(unsigned char c)
  21 {
  22   if ((c >= '0') && (c <= '9'))
  23     return (c - '0');
  24   else if ((c >= 'A') && (c <= 'F'))
  25     return (c - 'A' + 10);
  26   else if ((c >= 'a') && (c <= 'f'))
  27     return (c - 'a' + 10);
  28   else
  29     return c;
  30 }
  31
  32 static int qp_char(unsigned char c1, unsigned char c2, unsigned char *c_out)
  33 {
  34   c1 = unhex(c1);
  35   c2 = unhex(c2);
  36
  37   if ((c1 > 15) || (c2 > 15))
  38     return 1;
  39   else {
  40     *c_out = 16*c1+c2;
  41     return 0;
  42   }
  43 }
  44
  45
  46
  47 /*
  48  * Routines to decode MIME QP-encoded headers, as per RFC 2047.
  49  */
  50
  51 /* States of the decoding state machine */
  52 #define S_COPY_PLAIN        0   /* Just copy, but watch for the QP flag */
  53 #define S_SKIP_MIMEINIT     1   /* Get the encoding, and skip header */
  54 #define S_COPY_MIME         2   /* Decode a sequence of coded characters */
  55
  56 static const char MIMEHDR_INIT[]  = "=?";       /* Start of coded sequence */
  57 static const char MIMEHDR_END[]   = "?=";       /* End of coded sequence */
  58
  59 void UnMimeHeader(unsigned char *hdr)
  60 {
  61   /* Decode a buffer containing data encoded according to RFC
  62    * 2047. This only handles content-transfer-encoding; conversion
  63    * between character sets is not implemented.  In other words: We
  64    * assume the charsets used can be displayed by your mail program
  65    * without problems.
  66    */
  67
  68   /* Note: Decoding is done "in-situ", i.e. without using an
  69    * additional buffer for temp. storage. This is possible, since the
  70    * decoded string will always be shorter than the encoded string,
  71    * due to the en- coding scheme.
  72    */
  73
  74   int  state = S_COPY_PLAIN;
  75   unsigned char *p_in, *p_out, *p;
  76   unsigned char enc;
  77   int  i;
  78
  79   /* Speed up in case this is not a MIME-encoded header */
  80   p = strstr(hdr, MIMEHDR_INIT);
  81   if (p == NULL)
  82     return;   /* No MIME header */
  83
  84   /* Loop through the buffer.
  85    *  p_in : Next char to be processed.
  86    *  p_out: Where to put the next processed char
  87    *  enc  : Encoding used (usually, 'q' = quoted-printable)
  88    */
  89   for (p_out = p_in = hdr; (*p_in); ) {
  90     switch (state) {
  91     case S_COPY_PLAIN:
  92       p = strstr(p_in, MIMEHDR_INIT);
  93       if (p == NULL) {
  94         /*
  95          * No more coded data in buffer,
  96          * just move remainder into place.
  97          */
  98         i = strlen(p_in);   /* How much left */
  99         memmove(p_out, p_in, i);
 100         p_in += i; p_out += i;
 101       }
 102       else {
 103         /* MIME header init found at location p */
 104         if (p > p_in) {
 105           /* There are some uncoded chars at the beginning. */
 106           i = (p - p_in);
 107           memmove(p_out, p_in, i);
 108           p_out += i;
 109         }
 110         p_in = (p + 2);
 111         state = S_SKIP_MIMEINIT;
 112       }
 113       break;
 114
 115     case S_SKIP_MIMEINIT:
 116       /* Mime type definition: "charset?encoding?" */
 117       p = strchr(p_in, '?');
 118       if (p != NULL) {
 119         /* p_in .. (p-1) holds the charset */
 120
 121         /* *(p+1) is the transfer encoding, *(p+2) must be a '?' */
 122         if (*(p+2) == '?') {
 123           enc = tolower(*(p+1));
 124           p_in = p+3;
 125           state = S_COPY_MIME;
 126         }
 127         else
 128           state = S_COPY_PLAIN;
 129       }
 130       else
 131         state = S_COPY_PLAIN;   /* Invalid data */
 132       break;
 133
 134     case S_COPY_MIME:
 135       p = strstr(p_in, MIMEHDR_END);  /* Find end of coded data */
 136       if (p == NULL) p = p_in + strlen(p_in);
 137       for (; (p_in < p); ) {
 138         /* Decode all encoded data */
 139         if (enc == 'q') {
 140           if (*p_in == '=') {
 141             /* Decode one char qp-coded at (p_in+1) and (p_in+2) */
 142             if (qp_char(*(p_in+1), *(p_in+2), p_out) == 0)
 143               p_in += 3;
 144             else {
 145               /* Invalid QP data - pass through unchanged. */
 146               *p_out = *p_in;
 147               p_in++;
 148             }
 149           }
 150           else if (*p_in == '_') {
 151             /*
 152              * RFC 2047: '_' inside encoded word represents 0x20.
 153              * NOT a space - always the value 0x20.
 154              */
 155             *p_out = 0x20;
 156             p_in++;
 157           }
 158           else {
 159             /* Copy unchanged */
 160             *p_out = *p_in;
 161             p_in++;
 162           }
 163           p_out++;
 164         }
 165         else if (enc == 'b') {
 166           /* Decode base64 encoded data */
 167           char delimsave;
 168           int decoded_count;
 169
 170           delimsave = *p; *p = '\r';
 171           decoded_count = from64tobits(p_out, p_in);
 172           *p = delimsave;
 173           if (decoded_count > 0)
 174             p_out += decoded_count;
 175           p_in = p;
 176         }
 177         else {
 178           /* Copy unchanged */
 179           *p_out = *p_in;
 180           p_in++;
 181           p_out++;
 182         }
 183       }
 184       if (*p_in)
 185         p_in += 2;   /* Skip the MIMEHDR_END delimiter */
 186
 187       /*
 188        * We've completed decoding one encoded sequence. But another
 189        * may follow immediately, in which case whitespace before the
 190        * new MIMEHDR_INIT delimiter must be discarded.
 191        * See if that is the case
 192        */
 193       p = strstr(p_in, MIMEHDR_INIT);
 194       state = S_COPY_PLAIN;
 195       if (p != NULL) {
 196         /*
 197          * There is more MIME data later on. Is there
 198          * whitespace  only before the delimiter?
 199          */
 200         unsigned char *q;
 201         int  wsp_only = 1;
 202
 203         for (q=p_in; (wsp_only && (q < p)); q++)
 204           wsp_only = isspace(*q);
 205
 206         if (wsp_only) {
 207           /*
 208            * Whitespace-only before the MIME delimiter. OK,
 209            * just advance p_in to past the new MIMEHDR_INIT,
 210            * and prepare to process the new MIME charset/encoding
 211            * header.
 212            */
 213           p_in = p + strlen(MIMEHDR_INIT);
 214           state = S_SKIP_MIMEINIT;
 215         }
 216       }
 217       break;
 218     }
 219   }
 220
 221   *p_out = '\0';
 222 }
 223
 224
 225
 226 /*
 227  * Routines for decoding body-parts of a message.
 228  *
 229  * Since the "fetch" part of fetchmail gets a message body
 230  * one line at a time, we need to maintain some state variables
  231  * across multiple invocations of the UnMimeBodyline() routine.
 232  * The driver routine should call MimeBodyType() when all
 233  * headers have been received, and then UnMimeBodyline() for
 234  * every line in the message body.
 235  *
 236  */
 237 #define S_BODY_DATA 0
 238 #define S_BODY_HDR  1
 239
 240 /*
 241  * Flag indicating if we are currently processing
 242  * the headers or the body of a (multipart) message.
 243  */
 244 static int  BodyState = S_BODY_DATA;
 245
 246 /*
 247  * Flag indicating if we are in the process of decoding
 248  * a quoted-printable body part.
 249  */
 250 static int  CurrEncodingIsQP = 0;
 251
 252 /*
 253  * Delimiter for multipart messages. RFC 2046 states that this must
 254  * NEVER be longer than 70 characters. Add 3 for the two hyphens
 255  * at the beginning, and a terminating null.
 256  */
 257 #define MAX_DELIM_LEN 70
 258 static unsigned char MultipartDelimiter[MAX_DELIM_LEN+3];
 259
 260
 261 /* This string replaces the "Content-Transfer-Encoding: quoted-printable"
 262  * string in all headers, including those in body-parts. It must be
 263  * no longer than the original string.
 264  */
 265 static const char ENC8BIT[] = "Content-Transfer-Encoding: 8bit";
 266 static void SetEncoding8bit(unsigned char *XferEncOfs)
 267 {
 268   unsigned char *p;
 269
 270   if (XferEncOfs != NULL) {
 271      memcpy(XferEncOfs, ENC8BIT, strlen(ENC8BIT));
 272
 273      /* If anything left, in this header, replace with whitespace */
 274      for (p=XferEncOfs+strlen(ENC8BIT); (*p >= ' '); p++) *p=' ';
 275   }
 276 }
 277
 278
 279 /*
 280  * This routine does three things:
 281  * 1) It determines - based on the message headers - whether the
 282  *    message body is a MIME message that may hold 8 bit data.
 283  *    - A message that has a "quoted-printable" or "8bit" transfer
 284  *      encoding is assumed to contain 8-bit data (when decoded).
 285  *    - A multipart message is assumed to contain 8-bit data
 286  *      when decoded (there might be quoted-printable body-parts).
 287  *    - All other messages are assumed NOT to include 8-bit data.
 288  * 2) It determines the delimiter-string used in multi-part message
 289  *    bodies.
 290  * 3) It sets the initial values of the CurrEncodingIsQP and BodyState
 291  *    variables, from the header contents.
 292  *
 293  * The return value is a bitmask.
 294  */
 295 int MimeBodyType(unsigned char *hdrs)
 296 {
 297   unsigned char *NxtHdr = hdrs;
 298   unsigned char *XferEnc, *XferEncOfs, *CntType, *MimeVer, *p;
 299   int  HdrsFound = 0;     /* We only look for three headers */
 300   int  BodyType;          /* Return value */
 301
 302   /* Setup for a standard (no MIME, no QP, 7-bit US-ASCII) message */
 303   MultipartDelimiter[0] = '\0';
 304   CurrEncodingIsQP = 0;
 305   BodyState = S_BODY_DATA;
 306   BodyType = 0;
 307
 308   /* Just in case ... */
 309   if (hdrs == NULL)
 310     return BodyType;
 311
 312   XferEnc = XferEncOfs = CntType = MimeVer = NULL;
 313
 314   do {
 315     if (strncasecmp("Content-Transfer-Encoding:", NxtHdr, 26) == 0) {
 316       XferEncOfs = NxtHdr;
 317       p = nxtaddr(NxtHdr);
 318       if (p != NULL) {
 319         XferEnc = (char *)xmalloc(strlen(p) + 1);
 320         strcpy(XferEnc, p);
 321         HdrsFound++;
 322       }
 323     }
 324     else if (strncasecmp("Content-Type:", NxtHdr, 13) == 0) {
 325       /*
 326        * This one is difficult. We cannot use the standard
 327        * nxtaddr() routine, since the boundary-delimiter is
 328        * (probably) enclosed in quotes - and thus appears
 329        * as an rfc822 comment, and nxtaddr() "eats" up any
 330        * spaces in the delimiter. So, we have to do this
 331        * by hand.
 332        */
 333
 334       /* Skip the "Content-Type:" part and whitespace after it */
 335       for (NxtHdr += 13; ((*NxtHdr == ' ') || (*NxtHdr == '\t')); NxtHdr++);
 336
 337       /*
 338        * Get the full value of the Content-Type header;
 339        * it might span multiple lines. So search for
 340        * a newline char, but ignore those that have a
 341        * have a TAB or space just after the NL (continued
 342        * lines).
 343        */
 344       p = NxtHdr-1;
 345       do {
 346         p=strchr((p+1),'\n');
 347       } while ( (p != NULL) && ((*(p+1) == '\t') || (*(p+1) == ' ')) );
 348       if (p == NULL) p = NxtHdr + strlen(NxtHdr);
 349
 350       CntType = (char *)xmalloc(p-NxtHdr+2);
 351       strncpy(CntType, NxtHdr, (p-NxtHdr));
 352       *(CntType+(p-NxtHdr)) = '\0';
 353       HdrsFound++;
 354     }
 355     else if (strncasecmp("MIME-Version:", NxtHdr, 13) == 0) {
 356       p = nxtaddr(NxtHdr);
 357       if (p != NULL) {
 358         MimeVer = (char *)xmalloc(strlen(p) + 1);
 359         strcpy(MimeVer, p);
 360         HdrsFound++;
 361       }
 362     }
 363
 364     NxtHdr = (strchr(NxtHdr, '\n'));
 365     if (NxtHdr != NULL) NxtHdr++;
 366   } while ((NxtHdr != NULL) && (*NxtHdr) && (HdrsFound != 3));
 367
 368
 369   /* Done looking through the headers, now check what they say */
 370   if ((MimeVer != NULL) && (strcmp(MimeVer, "1.0") == 0)) {
 371
 372     /* Check Content-Type to see if this is a multipart message */
 373     if (CntType != NULL) {
 374       if ((strncasecmp(CntType, "multipart/", 10) == 0) ||
 375           (strncasecmp(CntType, "message/", 8) == 0)) {
 376
 377         char *p1, *p2;
 378
 379         /* Search for "boundary=" */
 380         p1 = strchr(CntType, '=');
 381         if (p1 != NULL) {
 382           /* Skip the '=' and any whitespace after it */
 383           for (p1++; (isspace(*p1)); p1++);
 384
 385           /* The delimiter might be inside quotes */
 386           if (*p1 == '\"') {
 387             p1++;
 388             p2 = strchr(p1, '\"');
 389             if (p2 != NULL)
 390               *p2 = '\0';
 391           }
 392
 393           if (strlen(p1) > 0) {
 394             /* The actual delimiter is "--" followed by
 395                the boundary string */
 396             strcpy(MultipartDelimiter, "--");
 397             strncat(MultipartDelimiter, p1, MAX_DELIM_LEN);
 398             BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 399           }
 400         }
 401       }
 402     }
 403
 404     /*
 405      * Check Content-Transfer-Encoding, but
 406      * ONLY for non-multipart messages (BodyType == 0).
 407      */
 408     if ((XferEnc != NULL) && (BodyType == 0)) {
 409       if (strcasecmp(XferEnc, "quoted-printable") == 0) {
 410         CurrEncodingIsQP = 1;
 411         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 412         SetEncoding8bit(XferEncOfs);
 413       }
 414       else if (strcasecmp(XferEnc, "7bit") == 0) {
 415         CurrEncodingIsQP = 0;
 416         BodyType = (MSG_IS_7BIT);
 417       }
 418       else if (strcasecmp(XferEnc, "8bit") == 0) {
 419         CurrEncodingIsQP = 0;
 420         BodyType = (MSG_IS_8BIT);
 421       }
 422     }
 423
 424   }
 425
 426   if (MimeVer) free(MimeVer);
 427   if (XferEnc) free(XferEnc);
 428   if (CntType) free(CntType);
 429
 430   return BodyType;
 431 }
 432
 433
 434 /*
 435  * Decode one line of data containing QP data.
 436  * Return flag set if this line ends with a soft line-break.
 437  * 'bufp' is modified to point to the end of the output buffer.
 438  */
 439 static int DoOneQPLine(unsigned char **bufp, int collapsedoubledot)
 440 {
 441   unsigned char *buf = *bufp;
 442   unsigned char *p_in, *p_out, *p;
 443   int n;
 444   int ret = 0;
 445
 446   p_in = buf;
 447   if (collapsedoubledot && (strncmp(buf, "..", 2) == 0))
 448     p_in++;
 449
 450   for (p_out = buf; (*p_in); ) {
 451     p = strchr(p_in, '=');
 452     if (p == NULL) {
 453       /* No more QP data, just move remainder into place */
 454       n = strlen(p_in);
 455       memmove(p_out, p_in, n);
 456       p_in += n; p_out += n;
 457     }
 458     else {
 459       if (p > p_in) {
 460         /* There are some uncoded chars at the beginning. */
 461         n = (p - p_in);
 462         memmove(p_out, p_in, n);
 463         p_out += n;
 464       }
 465
 466       switch (*(p+1)) {
 467       case '\0': case '\r': case '\n':
 468         /* Soft line break, skip '=' */
 469         p_in = p+1;
 470         if (*p_in == '\r') p_in++;
 471         if (*p_in == '\n') p_in++;
 472         ret = 1;
 473         break;
 474
 475       default:
 476         /* There is a QP encoded byte */
 477         if (qp_char(*(p+1), *(p+2), p_out) == 0) {
 478           p_in = p+3;
 479         }
 480         else {
 481           /* Invalid QP data - pass through unchanged. */
 482           *p_out = '=';
 483           p_in = p+1;
 484         }
 485         p_out++;
 486         break;
 487       }
 488     }
 489   }
 490
 491   *p_out = '\0';
 492   *bufp = p_out;
 493   return ret;
 494 }
 495
 496
 497 /* This is called once per line in the message body.  We need to scan
 498  * all lines in the message body for the multipart delimiter string,
 499  * and handle any body-part headers in such messages (these can toggle
 500  * qp-decoding on and off).
 501  *
 502  * Note: Messages that are NOT multipart-messages go through this
 503  * routine quickly, since BodyState will always be S_BODY_DATA,
 504  * and MultipartDelimiter is NULL.
 505  *
 506  * Return flag set if this line ends with a soft line-break.
 507  * 'bufp' is modified to point to the end of the output buffer.
 508  */
 509
 510 int UnMimeBodyline(unsigned char **bufp, int collapsedoubledot)
 511 {
 512   unsigned char *buf = *bufp;
 513   int ret = 0;
 514
 515   switch (BodyState) {
 516   case S_BODY_HDR:
 517     UnMimeHeader(buf);   /* Headers in body-parts can be encoded, too! */
 518     if (strncasecmp("Content-Transfer-Encoding:", buf, 26) == 0) {
 519       char *XferEnc;
 520
 521       XferEnc = nxtaddr(buf);
 522       if ((XferEnc != NULL) && (strcasecmp(XferEnc, "quoted-printable") == 0)) {
 523         CurrEncodingIsQP = 1;
 524         SetEncoding8bit(buf);
 525       }
 526     }
 527     else if ((*buf == '\0') || (*buf == '\n') || (strcmp(buf, "\r\n") == 0))
 528       BodyState = S_BODY_DATA;
 529
 530     *bufp = (buf + strlen(buf));
 531     break;
 532
 533   case S_BODY_DATA:
 534     if ((*MultipartDelimiter) &&
 535         (strncmp(buf, MultipartDelimiter, strlen(MultipartDelimiter)) == 0)) {
 536       BodyState = S_BODY_HDR;
 537       CurrEncodingIsQP = 0;
 538     }
 539
 540     if (CurrEncodingIsQP)
 541       ret = DoOneQPLine(bufp, collapsedoubledot);
 542     else
 543      *bufp = (buf + strlen(buf));
 544     break;
 545   }
 546
 547   return ret;
 548 }
 549
 550
 551 #ifdef STANDALONE
 552 #include <stdio.h>
 553 #include <unistd.h>
 554
 555 char *program_name = "unmime";
 556
 557 #define BUFSIZE_INCREMENT 4096
 558
 559 #ifdef DEBUG
 560 #define DBG_FWRITE(B,L,BS,FD) fwrite(B, L, BS, FD)
 561 #else
 562 #define DBG_FWRITE(B,L,BS,FD)
 563 #endif
 564
 565 int main(int argc, char *argv[])
 566 {
 567   unsigned int BufSize;
 568   unsigned char *buffer, *buf_p;
 569   int nl_count, i, bodytype;
 570
 571 #ifdef DEBUG
 572   pid_t pid;
 573   FILE *fd_orig, *fd_conv;
 574   char fnam[100];
 575
 576   pid = getpid();
 577   sprintf(fnam, "/tmp/i_unmime.%x", pid);
 578   fd_orig = fopen(fnam, "w");
 579   sprintf(fnam, "/tmp/o_unmime.%x", pid);
 580   fd_conv = fopen(fnam, "w");
 581 #endif
 582
 583   BufSize = BUFSIZE_INCREMENT;    /* Initial size of buffer */
 584   buf_p = buffer = (unsigned char *) xmalloc(BufSize);
 585   nl_count = 0;
 586
 587   do {
 588     i = fread(buf_p, 1, 1, stdin);
 589     switch (*buf_p) {
 590      case '\n':
 591        nl_count++;
 592        break;
 593
 594      case '\r':
 595        break;
 596
 597      default:
 598        nl_count = 0;
 599        break;
 600     }
 601
 602     buf_p++;
 603     if ((buf_p - buffer) == BufSize) {
 604        /* Buffer is full! Get more room. */
 605        buffer = xrealloc(buffer, BufSize+BUFSIZE_INCREMENT);
 606        buf_p = buffer + BufSize;
 607        BufSize += BUFSIZE_INCREMENT;
 608     }
 609   } while ((i > 0) && (nl_count < 2));
 610
 611   *buf_p = '\0';
 612   DBG_FWRITE(buffer, strlen(buffer), 1, fd_orig);
 613
 614   UnMimeHeader(buffer);
 615   bodytype = MimeBodyType(buffer);
 616
 617   i = strlen(buffer);
 618   fwrite(buffer, i, 1, stdout);
 619   DBG_FWRITE(buffer, i, 1, fd_conv);
 620
 621   do {
 622      buf_p = (buffer - 1);
 623      do {
 624         buf_p++;
 625         i = fread(buf_p, 1, 1, stdin);
 626      } while ((i == 1) && (*buf_p != '\n'));
 627      if (i == 1) buf_p++;
 628      *buf_p = '\0';
 629      DBG_FWRITE(buf, (buf_p - buffer), 1, fd_orig);
 630
 631      if (buf_p > buffer) {
 632         if (bodytype & MSG_NEEDS_DECODE) {
 633            buf_p = buffer;
 634            UnMimeBodyline(&buf_p, 0);
 635         }
 636         fwrite(buffer, (buf_p - buffer), 1, stdout);
 637         DBG_FWRITE(buffer, (buf_p - buffer), 1, fd_conv);
 638      }
 639   } while (buf_p > buffer);
 640
 641   free(buffer);
 642   fflush(stdout);
 643
 644 #ifdef DEBUG
 645   fclose(fd_orig);
 646   fclose(fd_conv);
 647 #endif
 648
 649   return 0;
 650 }
 651 #endif
 652