Pileus Git - ~andy/fetchmail/blob - unmime.c

   1 /*
   2  * MIME mail decoding.
   3  *
   4  * This module contains decoding routines for converting
   5  * quoted-printable data into pure 8-bit data, in MIME
   6  * formatted messages.
   7  *
   8  * By Henrik Storner <storner@image.dk>
   9  *
  10  * Configuration file support for fetchmail 4.3.8 by
  11  * Frank Damgaard <frda@post3.tele.dk>
  12  *
  13  */
  14
  15 #include <string.h>
  16 #include <stdlib.h>
  17 #if defined(HAVE_ALLOCA_H)
  18 #include <alloca.h>
  19 #else
  20 #ifdef _AIX
  21  #pragma alloca
  22 #endif
  23 #endif
  24 #include <ctype.h>
  25 #include "fetchmail.h"
  26
  27 static unsigned char unhex(unsigned char c)
  28 {
  29   if ((c >= '0') && (c <= '9'))
  30     return (c - '0');
  31   else if ((c >= 'A') && (c <= 'F'))
  32     return (c - 'A' + 10);
  33   else if ((c >= 'a') && (c <= 'f'))
  34     return (c - 'a' + 10);
  35   else
  36     return c;
  37 }
  38
  39 static int qp_char(unsigned char c1, unsigned char c2, unsigned char *c_out)
  40 {
  41   c1 = unhex(c1);
  42   c2 = unhex(c2);
  43
  44   if ((c1 > 15) || (c2 > 15))
  45     return 1;
  46   else {
  47     *c_out = 16*c1+c2;
  48     return 0;
  49   }
  50 }
  51
  52
  53
  54 /*
  55  * Routines to decode MIME QP-encoded headers, as per RFC 2047.
  56  */
  57
/*
 * States of the header decoding state machine in UnMimeHeader().
 */
#define S_COPY_PLAIN        0   /* Just copy, but watch for the "=?" introducer */
#define S_SKIP_MIMEINIT     1   /* Parse the "charset?encoding?" part after "=?" */
#define S_COPY_MIME         2   /* Decode a sequence of coded characters */

/* Two-character markers that bracket an encoded sequence in a header. */
static const char MIMEHDR_INIT[]  = "=?";       /* Start of coded sequence */
static const char MIMEHDR_END[]   = "?=";       /* End of coded sequence */
  65
  66 void UnMimeHeader(unsigned char *hdr)
  67 {
  68   /* Decode a buffer containing data encoded according to RFC
  69    * 2047. This only handles content-transfer-encoding; conversion
  70    * between character sets is not implemented.  In other words: We
  71    * assume the charsets used can be displayed by your mail program
  72    * without problems.
  73    */
  74
  75   /* Note: Decoding is done "in-situ", i.e. without using an
  76    * additional buffer for temp. storage. This is possible, since the
  77    * decoded string will always be shorter than the encoded string,
  78    * due to the en- coding scheme.
  79    */
  80
  81   int  state = S_COPY_PLAIN;
  82   unsigned char *p_in, *p_out, *p;
  83   unsigned char enc;
  84   int  i;
  85
  86   /* Speed up in case this is not a MIME-encoded header */
  87   p = strstr(hdr, MIMEHDR_INIT);
  88   if (p == NULL)
  89     return;   /* No MIME header */
  90
  91   /* Loop through the buffer.
  92    *  p_in : Next char to be processed.
  93    *  p_out: Where to put the next processed char
  94    *  enc  : Encoding used (usually, 'q' = quoted-printable)
  95    */
  96   for (p_out = p_in = hdr; (*p_in); ) {
  97     switch (state) {
  98     case S_COPY_PLAIN:
  99       p = strstr(p_in, MIMEHDR_INIT);
 100       if (p == NULL) {
 101         /*
 102          * No more coded data in buffer,
 103          * just move remainder into place.
 104          */
 105         i = strlen(p_in);   /* How much left */
 106         memmove(p_out, p_in, i);
 107         p_in += i; p_out += i;
 108       }
 109       else {
 110         /* MIME header init found at location p */
 111         if (p > p_in) {
 112           /* There are some uncoded chars at the beginning. */
 113           i = (p - p_in);
 114           memmove(p_out, p_in, i);
 115           p_out += i;
 116         }
 117         p_in = (p + 2);
 118         state = S_SKIP_MIMEINIT;
 119       }
 120       break;
 121
 122     case S_SKIP_MIMEINIT:
 123       /* Mime type definition: "charset?encoding?" */
 124       p = strchr(p_in, '?');
 125       if (p != NULL) {
 126         /* p_in .. (p-1) holds the charset */
 127
 128         /* *(p+1) is the transfer encoding, *(p+2) must be a '?' */
 129         if (*(p+2) == '?') {
 130           enc = tolower(*(p+1));
 131           p_in = p+3;
 132           state = S_COPY_MIME;
 133         }
 134         else
 135           state = S_COPY_PLAIN;
 136       }
 137       else
 138         state = S_COPY_PLAIN;   /* Invalid data */
 139       break;
 140
 141     case S_COPY_MIME:
 142       p = strstr(p_in, MIMEHDR_END);  /* Find end of coded data */
 143       if (p == NULL) p = p_in + strlen(p_in);
 144       for (; (p_in < p); ) {
 145         /* Decode all encoded data */
 146         if (enc == 'q') {
 147           if (*p_in == '=') {
 148             /* Decode one char qp-coded at (p_in+1) and (p_in+2) */
 149             if (qp_char(*(p_in+1), *(p_in+2), p_out) == 0)
 150               p_in += 3;
 151             else {
 152               /* Invalid QP data - pass through unchanged. */
 153               *p_out = *p_in;
 154               p_in++;
 155             }
 156           }
 157           else if (*p_in == '_') {
 158             /*
 159              * RFC 2047: '_' inside encoded word represents 0x20.
 160              * NOT a space - always the value 0x20.
 161              */
 162             *p_out = 0x20;
 163             p_in++;
 164           }
 165           else {
 166             /* Copy unchanged */
 167             *p_out = *p_in;
 168             p_in++;
 169           }
 170           p_out++;
 171         }
 172         else if (enc == 'b') {
 173           /* Decode base64 encoded data */
 174           char delimsave;
 175           int decoded_count;
 176
 177           delimsave = *p; *p = '\r';
 178           decoded_count = from64tobits(p_out, p_in);
 179           *p = delimsave;
 180           if (decoded_count > 0)
 181             p_out += decoded_count;
 182           p_in = p;
 183         }
 184         else {
 185           /* Copy unchanged */
 186           *p_out = *p_in;
 187           p_in++;
 188           p_out++;
 189         }
 190       }
 191       if (*p_in)
 192         p_in += 2;   /* Skip the MIMEHDR_END delimiter */
 193
 194       /*
 195        * We've completed decoding one encoded sequence. But another
 196        * may follow immediately, in which case whitespace before the
 197        * new MIMEHDR_INIT delimiter must be discarded.
 198        * See if that is the case
 199        */
 200       p = strstr(p_in, MIMEHDR_INIT);
 201       state = S_COPY_PLAIN;
 202       if (p != NULL) {
 203         /*
 204          * There is more MIME data later on. Is there
 205          * whitespace  only before the delimiter?
 206          */
 207         unsigned char *q;
 208         int  wsp_only = 1;
 209
 210         for (q=p_in; (wsp_only && (q < p)); q++)
 211           wsp_only = isspace(*q);
 212
 213         if (wsp_only) {
 214           /*
 215            * Whitespace-only before the MIME delimiter. OK,
 216            * just advance p_in to past the new MIMEHDR_INIT,
 217            * and prepare to process the new MIME charset/encoding
 218            * header.
 219            */
 220           p_in = p + strlen(MIMEHDR_INIT);
 221           state = S_SKIP_MIMEINIT;
 222         }
 223       }
 224       break;
 225     }
 226   }
 227
 228   *p_out = '\0';
 229 }
 230
 231
 232
 233 /*
 234  * Routines for decoding body-parts of a message.
 235  *
 236  * Since the "fetch" part of fetchmail gets a message body
 237  * one line at a time, we need to maintain some state variables
 * across multiple invocations of the UnMimeBodyline() routine.
 239  * The driver routine should call MimeBodyType() when all
 240  * headers have been received, and then UnMimeBodyline() for
 241  * every line in the message body.
 242  *
 243  */
/* Values taken by BodyState. */
#define S_BODY_DATA 0
#define S_BODY_HDR  1

/*
 * Flag indicating if we are currently processing
 * the headers or the body of a (multipart) message.
 * Reset by MimeBodyType() and updated by UnMimeBodyline().
 */
static int  BodyState = S_BODY_DATA;

/*
 * Flag indicating if we are in the process of decoding
 * a quoted-printable body part.
 * Reset by MimeBodyType() and updated by UnMimeBodyline().
 */
static int  CurrEncodingIsQP = 0;

/*
 * Delimiter for multipart messages. RFC 2046 states that this must
 * NEVER be longer than 70 characters. Add 3 for the two hyphens
 * at the beginning, and a terminating null.
 * An empty string means that no delimiter is in effect.
 */
#define MAX_DELIM_LEN 70
static unsigned char MultipartDelimiter[MAX_DELIM_LEN+3];
 266
 267
 268 /* This string replaces the "Content-Transfer-Encoding: quoted-printable"
 269  * string in all headers, including those in body-parts. The replacement
 270  * must be no longer than the original string.
 271  */
 272 static const char ENC8BIT[] = "Content-Transfer-Encoding: 8bit";
 273 static void SetEncoding8bit(unsigned char *XferEncOfs)
 274 {
 275   unsigned char *p;
 276
 277   if (XferEncOfs != NULL) {
 278      memcpy(XferEncOfs, ENC8BIT, strlen(ENC8BIT));
 279
 280      /* If anything left, in this header, replace with whitespace */
 281      for (p=XferEncOfs+strlen(ENC8BIT); (*p >= ' '); p++) *p=' ';
 282   }
 283 }
 284
 285 static char *GetBoundary(char *CntType)
 286 {
 287   char *p1, *p2;
 288   int flag;
 289
 290   /* Find the "boundary" delimiter. It must be preceded with a ';'
 291    * and optionally some whitespace.
 292    */
 293   p1 = CntType;
 294   do {
 295     p2 = strchr(p1, ';');
 296     if (p2)
 297       for (p2++; isspace(*p2); p2++);
 298
 299     p1 = p2;
 300   } while ((p1) && (strncasecmp(p1, "boundary", 8) != 0));
 301
 302   if (p1 == NULL)
 303     /* No boundary delimiter */
 304     return NULL;
 305
 306   /* Skip "boundary", whitespace and '='; check that we do have a '=' */
 307   for (p1+=8, flag=0; (isspace(*p1) || (*p1 == '=')); p1++)
 308     flag |= (*p1 == '=');
 309   if (!flag)
 310     return NULL;
 311
 312   /* Find end of boundary delimiter string */
 313   if (*p1 == '\"') {
 314     /* The delimiter is inside quotes */
 315     p1++;
 316     p2 = strchr(p1, '\"');
 317     if (p2 == NULL)
 318       return NULL;  /* No closing '"' !?! */
 319   }
 320   else {
 321     /* There might be more text after the "boundary" string. */
 322     p2 = strchr(p1, ';');  /* Safe - delimiter with ';' must be in quotes */
 323   }
 324
 325   /* Zero-terminate the boundary string */
 326   if (p2 != NULL)
 327     *p2 = '\0';
 328
 329   return (p1 && strlen(p1)) ? p1 : NULL;
 330 }
 331
 332
 333 /*
 334  * This routine does three things:
 335  * 1) It determines - based on the message headers - whether the
 336  *    message body is a MIME message that may hold 8 bit data.
 337  *    - A message that has a "quoted-printable" or "8bit" transfer
 338  *      encoding is assumed to contain 8-bit data (when decoded).
 339  *    - A multipart message is assumed to contain 8-bit data
 340  *      when decoded (there might be quoted-printable body-parts).
 341  *    - All other messages are assumed NOT to include 8-bit data.
 342  * 2) It determines the delimiter-string used in multi-part message
 343  *    bodies.
 344  * 3) It sets the initial values of the CurrEncodingIsQP and BodyState
 345  *    variables, from the header contents.
 346  *
 347  * The return value is a bitmask.
 348  */
 349 int MimeBodyType(unsigned char *hdrs, int WantDecode)
 350 {
 351   unsigned char *NxtHdr = hdrs;
 352   unsigned char *XferEnc, *XferEncOfs, *CntType, *MimeVer, *p;
 353   int  HdrsFound = 0;     /* We only look for three headers */
 354   int  BodyType;          /* Return value */
 355
 356   /* Setup for a standard (no MIME, no QP, 7-bit US-ASCII) message */
 357   MultipartDelimiter[0] = '\0';
 358   CurrEncodingIsQP = 0;
 359   BodyState = S_BODY_DATA;
 360   BodyType = 0;
 361
 362   /* Just in case ... */
 363   if (hdrs == NULL)
 364     return BodyType;
 365
 366   XferEnc = XferEncOfs = CntType = MimeVer = NULL;
 367
 368   do {
 369     if (strncasecmp("Content-Transfer-Encoding:", NxtHdr, 26) == 0) {
 370       XferEncOfs = NxtHdr;
 371       p = nxtaddr(NxtHdr);
 372       if (p != NULL) {
 373         XferEnc = (char *)alloca(strlen(p) + 1);
 374         strcpy(XferEnc, p);
 375         HdrsFound++;
 376       }
 377     }
 378     else if (strncasecmp("Content-Type:", NxtHdr, 13) == 0) {
 379       /*
 380        * This one is difficult. We cannot use the standard
 381        * nxtaddr() routine, since the boundary-delimiter is
 382        * (probably) enclosed in quotes - and thus appears
 383        * as an rfc822 comment, and nxtaddr() "eats" up any
 384        * spaces in the delimiter. So, we have to do this
 385        * by hand.
 386        */
 387
 388       /* Skip the "Content-Type:" part and whitespace after it */
 389       for (NxtHdr += 13; ((*NxtHdr == ' ') || (*NxtHdr == '\t')); NxtHdr++);
 390
 391       /*
 392        * Get the full value of the Content-Type header;
 393        * it might span multiple lines. So search for
 394        * a newline char, but ignore those that have a
 395        * have a TAB or space just after the NL (continued
 396        * lines).
 397        */
 398       p = NxtHdr-1;
 399       do {
 400         p=strchr((p+1),'\n');
 401       } while ( (p != NULL) && ((*(p+1) == '\t') || (*(p+1) == ' ')) );
 402       if (p == NULL) p = NxtHdr + strlen(NxtHdr);
 403
 404       CntType = (char *)alloca(p-NxtHdr+2);
 405       strncpy(CntType, NxtHdr, (p-NxtHdr));
 406       *(CntType+(p-NxtHdr)) = '\0';
 407       HdrsFound++;
 408     }
 409     else if (strncasecmp("MIME-Version:", NxtHdr, 13) == 0) {
 410       p = nxtaddr(NxtHdr);
 411       if (p != NULL) {
 412         MimeVer = (char *)alloca(strlen(p) + 1);
 413         strcpy(MimeVer, p);
 414         HdrsFound++;
 415       }
 416     }
 417
 418     NxtHdr = (strchr(NxtHdr, '\n'));
 419     if (NxtHdr != NULL) NxtHdr++;
 420   } while ((NxtHdr != NULL) && (*NxtHdr) && (HdrsFound != 3));
 421
 422
 423   /* Done looking through the headers, now check what they say */
 424   if ((MimeVer != NULL) && (strcmp(MimeVer, "1.0") == 0)) {
 425
 426     /* Check Content-Type to see if this is a multipart message */
 427     if ( (CntType != NULL) &&
 428          ((strncasecmp(CntType, "multipart/", 10) == 0) ||
 429           (strncasecmp(CntType, "message/", 8) == 0)) ) {
 430
 431       char *p1 = GetBoundary(CntType);
 432
 433       if (p1 != NULL) {
 434         /* The actual delimiter is "--" followed by
 435            the boundary string */
 436         strcpy(MultipartDelimiter, "--");
 437         strncat(MultipartDelimiter, p1, MAX_DELIM_LEN);
 438         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 439       }
 440     }
 441
 442     /*
 443      * Check Content-Transfer-Encoding, but
 444      * ONLY for non-multipart messages (BodyType == 0).
 445      */
 446     if ((XferEnc != NULL) && (BodyType == 0)) {
 447       if (strcasecmp(XferEnc, "quoted-printable") == 0) {
 448         CurrEncodingIsQP = 1;
 449         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 450         if (WantDecode) {
 451            SetEncoding8bit(XferEncOfs);
 452         }
 453       }
 454       else if (strcasecmp(XferEnc, "7bit") == 0) {
 455         CurrEncodingIsQP = 0;
 456         BodyType = (MSG_IS_7BIT);
 457       }
 458       else if (strcasecmp(XferEnc, "8bit") == 0) {
 459         CurrEncodingIsQP = 0;
 460         BodyType = (MSG_IS_8BIT);
 461       }
 462     }
 463
 464   }
 465
 466   return BodyType;
 467 }
 468
 469
 470 /*
 471  * Decode one line of data containing QP data.
 472  * Return flag set if this line ends with a soft line-break.
 473  * 'bufp' is modified to point to the end of the output buffer.
 474  */
 475 static int DoOneQPLine(unsigned char **bufp, int collapsedoubledot)
 476 {
 477   unsigned char *buf = *bufp;
 478   unsigned char *p_in, *p_out, *p;
 479   int n;
 480   int ret = 0;
 481
 482   p_in = buf;
 483   if (collapsedoubledot && (strncmp(buf, "..", 2) == 0))
 484     p_in++;
 485
 486   for (p_out = buf; (*p_in); ) {
 487     p = strchr(p_in, '=');
 488     if (p == NULL) {
 489       /* No more QP data, just move remainder into place */
 490       n = strlen(p_in);
 491       memmove(p_out, p_in, n);
 492       p_in += n; p_out += n;
 493     }
 494     else {
 495       if (p > p_in) {
 496         /* There are some uncoded chars at the beginning. */
 497         n = (p - p_in);
 498         memmove(p_out, p_in, n);
 499         p_out += n;
 500       }
 501
 502       switch (*(p+1)) {
 503       case '\0': case '\r': case '\n':
 504         /* Soft line break, skip '=' */
 505         p_in = p+1;
 506         if (*p_in == '\r') p_in++;
 507         if (*p_in == '\n') p_in++;
 508         ret = 1;
 509         break;
 510
 511       default:
 512         /* There is a QP encoded byte */
 513         if (qp_char(*(p+1), *(p+2), p_out) == 0) {
 514           p_in = p+3;
 515         }
 516         else {
 517           /* Invalid QP data - pass through unchanged. */
 518           *p_out = '=';
 519           p_in = p+1;
 520         }
 521         p_out++;
 522         break;
 523       }
 524     }
 525   }
 526
 527   *p_out = '\0';
 528   *bufp = p_out;
 529   return ret;
 530 }
 531
 532
 533 /* This is called once per line in the message body.  We need to scan
 534  * all lines in the message body for the multipart delimiter string,
 535  * and handle any body-part headers in such messages (these can toggle
 536  * qp-decoding on and off).
 537  *
 538  * Note: Messages that are NOT multipart-messages go through this
 539  * routine quickly, since BodyState will always be S_BODY_DATA,
 540  * and MultipartDelimiter is NULL.
 541  *
 542  * Return flag set if this line ends with a soft line-break.
 543  * 'bufp' is modified to point to the end of the output buffer.
 544  */
 545
 546 int UnMimeBodyline(unsigned char **bufp, int collapsedoubledot)
 547 {
 548   unsigned char *buf = *bufp;
 549   int ret = 0;
 550
 551   switch (BodyState) {
 552   case S_BODY_HDR:
 553     UnMimeHeader(buf);   /* Headers in body-parts can be encoded, too! */
 554     if (strncasecmp("Content-Transfer-Encoding:", buf, 26) == 0) {
 555       char *XferEnc;
 556
 557       XferEnc = nxtaddr(buf);
 558       if ((XferEnc != NULL) && (strcasecmp(XferEnc, "quoted-printable") == 0)) {
 559         CurrEncodingIsQP = 1;
 560         SetEncoding8bit(buf);
 561       }
 562     }
 563     else if ((*buf == '\0') || (*buf == '\n') || (strcmp(buf, "\r\n") == 0))
 564       BodyState = S_BODY_DATA;
 565
 566     *bufp = (buf + strlen(buf));
 567     break;
 568
 569   case S_BODY_DATA:
 570     if ((*MultipartDelimiter) &&
 571         (strncmp(buf, MultipartDelimiter, strlen(MultipartDelimiter)) == 0)) {
 572       BodyState = S_BODY_HDR;
 573       CurrEncodingIsQP = 0;
 574     }
 575
 576     if (CurrEncodingIsQP)
 577       ret = DoOneQPLine(bufp, collapsedoubledot);
 578     else
 579      *bufp = (buf + strlen(buf));
 580     break;
 581   }
 582
 583   return ret;
 584 }
 585
 586
 587 #ifdef STANDALONE
 588 #include <stdio.h>
 589 #include <unistd.h>
 590
 591 char *program_name = "unmime";
 592
 593 #define BUFSIZE_INCREMENT 4096
 594
 595 #ifdef DEBUG
 596 #define DBG_FWRITE(B,L,BS,FD) fwrite(B, L, BS, FD)
 597 #else
 598 #define DBG_FWRITE(B,L,BS,FD)
 599 #endif
 600
 601 int main(int argc, char *argv[])
 602 {
 603   unsigned int BufSize;
 604   unsigned char *buffer, *buf_p;
 605   int nl_count, i, bodytype;
 606
 607 #ifdef DEBUG
 608   pid_t pid;
 609   FILE *fd_orig, *fd_conv;
 610   char fnam[100];
 611
 612   pid = getpid();
 613   sprintf(fnam, "/tmp/i_unmime.%x", pid);
 614   fd_orig = fopen(fnam, "w");
 615   sprintf(fnam, "/tmp/o_unmime.%x", pid);
 616   fd_conv = fopen(fnam, "w");
 617 #endif
 618
 619   BufSize = BUFSIZE_INCREMENT;    /* Initial size of buffer */
 620   buf_p = buffer = (unsigned char *) xmalloc(BufSize);
 621   nl_count = 0;
 622
 623   do {
 624     i = fread(buf_p, 1, 1, stdin);
 625     switch (*buf_p) {
 626      case '\n':
 627        nl_count++;
 628        break;
 629
 630      case '\r':
 631        break;
 632
 633      default:
 634        nl_count = 0;
 635        break;
 636     }
 637
 638     buf_p++;
 639     if ((buf_p - buffer) == BufSize) {
 640        /* Buffer is full! Get more room. */
 641        buffer = xrealloc(buffer, BufSize+BUFSIZE_INCREMENT);
 642        buf_p = buffer + BufSize;
 643        BufSize += BUFSIZE_INCREMENT;
 644     }
 645   } while ((i > 0) && (nl_count < 2));
 646
 647   *buf_p = '\0';
 648   DBG_FWRITE(buffer, strlen(buffer), 1, fd_orig);
 649
 650   UnMimeHeader(buffer);
 651   bodytype = MimeBodyType(buffer, 1);
 652
 653   i = strlen(buffer);
 654   fwrite(buffer, i, 1, stdout);
 655   DBG_FWRITE(buffer, i, 1, fd_conv);
 656
 657   do {
 658      buf_p = (buffer - 1);
 659      do {
 660         buf_p++;
 661         i = fread(buf_p, 1, 1, stdin);
 662      } while ((i == 1) && (*buf_p != '\n'));
 663      if (i == 1) buf_p++;
 664      *buf_p = '\0';
 665      DBG_FWRITE(buf, (buf_p - buffer), 1, fd_orig);
 666
 667      if (buf_p > buffer) {
 668         if (bodytype & MSG_NEEDS_DECODE) {
 669            buf_p = buffer;
 670            UnMimeBodyline(&buf_p, 0);
 671         }
 672         fwrite(buffer, (buf_p - buffer), 1, stdout);
 673         DBG_FWRITE(buffer, (buf_p - buffer), 1, fd_conv);
 674      }
 675   } while (buf_p > buffer);
 676
 677   free(buffer);
 678   fflush(stdout);
 679
 680 #ifdef DEBUG
 681   fclose(fd_orig);
 682   fclose(fd_conv);
 683 #endif
 684
 685   return 0;
 686 }
 687 #endif
 688