Pileus Git - ~andy/fetchmail/blob - unmime.c

   1 /*
   2  * MIME mail decoding.
   3  *
   4  * This module contains decoding routines for converting
   5  * quoted-printable data into pure 8-bit data, in MIME
   6  * formatted messages.
   7  *
   8  * By Henrik Storner <storner@image.dk>
   9  *
  10  * Configuration file support for fetchmail 4.3.8 by
  11  * Frank Damgaard <frda@post3.tele.dk>
  12  *
  13  */
  14
  15 #include <string.h>
  16 #include <stdlib.h>
  17 #include <ctype.h>
  18 #include "fetchmail.h"
  19
  20 static unsigned char unhex(unsigned char c)
  21 {
  22   if ((c >= '0') && (c <= '9'))
  23     return (c - '0');
  24   else if ((c >= 'A') && (c <= 'F'))
  25     return (c - 'A' + 10);
  26   else if ((c >= 'a') && (c <= 'f'))
  27     return (c - 'a' + 10);
  28   else
  29     return c;
  30 }
  31
  32 static int qp_char(unsigned char c1, unsigned char c2, unsigned char *c_out)
  33 {
  34   c1 = unhex(c1);
  35   c2 = unhex(c2);
  36
  37   if ((c1 > 15) || (c2 > 15))
  38     return 1;
  39   else {
  40     *c_out = 16*c1+c2;
  41     return 0;
  42   }
  43 }
  44
  45
  46
  47 /*
  48  * Routines to decode MIME QP-encoded headers, as per RFC 2047.
  49  */
  50
  51 /* States of the decoding state machine */
  52 #define S_COPY_PLAIN        0   /* Just copy, but watch for the QP flag */
  53 #define S_SKIP_MIMEINIT     1   /* Get the encoding, and skip header */
  54 #define S_COPY_MIME         2   /* Decode a sequence of coded characters */
  55
  56 static const char MIMEHDR_INIT[]  = "=?";       /* Start of coded sequence */
  57 static const char MIMEHDR_END[]   = "?=";       /* End of coded sequence */
  58
  59 void UnMimeHeader(unsigned char *hdr)
  60 {
  61   /* Decode a buffer containing data encoded according to RFC
  62    * 2047. This only handles content-transfer-encoding; conversion
  63    * between character sets is not implemented.  In other words: We
  64    * assume the charsets used can be displayed by your mail program
  65    * without problems.
  66    */
  67
  68   /* Note: Decoding is done "in-situ", i.e. without using an
  69    * additional buffer for temp. storage. This is possible, since the
  70    * decoded string will always be shorter than the encoded string,
  71    * due to the en- coding scheme.
  72    */
  73
  74   int  state = S_COPY_PLAIN;
  75   unsigned char *p_in, *p_out, *p;
  76   unsigned char enc;
  77   int  i;
  78
  79   /* Speed up in case this is not a MIME-encoded header */
  80   p = strstr(hdr, MIMEHDR_INIT);
  81   if (p == NULL)
  82     return;   /* No MIME header */
  83
  84   /* Loop through the buffer.
  85    *  p_in : Next char to be processed.
  86    *  p_out: Where to put the next processed char
  87    *  enc  : Encoding used (usually, 'q' = quoted-printable)
  88    */
  89   for (p_out = p_in = hdr; (*p_in); ) {
  90     switch (state) {
  91     case S_COPY_PLAIN:
  92       p = strstr(p_in, MIMEHDR_INIT);
  93       if (p == NULL) {
  94         /*
  95          * No more coded data in buffer,
  96          * just move remainder into place.
  97          */
  98         i = strlen(p_in);   /* How much left */
  99         memmove(p_out, p_in, i);
 100         p_in += i; p_out += i;
 101       }
 102       else {
 103         /* MIME header init found at location p */
 104         if (p > p_in) {
 105           /* There are some uncoded chars at the beginning. */
 106           i = (p - p_in);
 107           memmove(p_out, p_in, i);
 108           p_out += i;
 109         }
 110         p_in = (p + 2);
 111         state = S_SKIP_MIMEINIT;
 112       }
 113       break;
 114
 115     case S_SKIP_MIMEINIT:
 116       /* Mime type definition: "charset?encoding?" */
 117       p = strchr(p_in, '?');
 118       if (p != NULL) {
 119         /* p_in .. (p-1) holds the charset */
 120
 121         /* *(p+1) is the transfer encoding, *(p+2) must be a '?' */
 122         if (*(p+2) == '?') {
 123           enc = tolower(*(p+1));
 124           p_in = p+3;
 125           state = S_COPY_MIME;
 126         }
 127         else
 128           state = S_COPY_PLAIN;
 129       }
 130       else
 131         state = S_COPY_PLAIN;   /* Invalid data */
 132       break;
 133
 134     case S_COPY_MIME:
 135       p = strstr(p_in, MIMEHDR_END);  /* Find end of coded data */
 136       if (p == NULL) p = p_in + strlen(p_in);
 137       for (; (p_in < p); ) {
 138         /* Decode all encoded data */
 139         if (enc == 'q') {
 140           if (*p_in == '=') {
 141             /* Decode one char qp-coded at (p_in+1) and (p_in+2) */
 142             if (qp_char(*(p_in+1), *(p_in+2), p_out) == 0)
 143               p_in += 3;
 144             else {
 145               /* Invalid QP data - pass through unchanged. */
 146               *p_out = *p_in;
 147               p_in++;
 148             }
 149           }
 150           else if (*p_in == '_') {
 151             /*
 152              * RFC 2047: '_' inside encoded word represents 0x20.
 153              * NOT a space - always the value 0x20.
 154              */
 155             *p_out = 0x20;
 156             p_in++;
 157           }
 158           else {
 159             /* Copy unchanged */
 160             *p_out = *p_in;
 161             p_in++;
 162           }
 163           p_out++;
 164         }
 165         else if (enc == 'b') {
 166           /* Decode base64 encoded data */
 167           char delimsave;
 168           int decoded_count;
 169
 170           delimsave = *p; *p = '\r';
 171           decoded_count = from64tobits(p_out, p_in);
 172           *p = delimsave;
 173           if (decoded_count > 0)
 174             p_out += decoded_count;
 175           p_in = p;
 176         }
 177         else {
 178           /* Copy unchanged */
 179           *p_out = *p_in;
 180           p_in++;
 181           p_out++;
 182         }
 183       }
 184       if (*p_in)
 185         p_in += 2;   /* Skip the MIMEHDR_END delimiter */
 186
 187       /*
 188        * We've completed decoding one encoded sequence. But another
 189        * may follow immediately, in which case whitespace before the
 190        * new MIMEHDR_INIT delimiter must be discarded.
 191        * See if that is the case
 192        */
 193       p = strstr(p_in, MIMEHDR_INIT);
 194       state = S_COPY_PLAIN;
 195       if (p != NULL) {
 196         /*
 197          * There is more MIME data later on. Is there
 198          * whitespace  only before the delimiter?
 199          */
 200         unsigned char *q;
 201         int  wsp_only = 1;
 202
 203         for (q=p_in; (wsp_only && (q < p)); q++)
 204           wsp_only = isspace(*q);
 205
 206         if (wsp_only) {
 207           /*
 208            * Whitespace-only before the MIME delimiter. OK,
 209            * just advance p_in to past the new MIMEHDR_INIT,
 210            * and prepare to process the new MIME charset/encoding
 211            * header.
 212            */
 213           p_in = p + strlen(MIMEHDR_INIT);
 214           state = S_SKIP_MIMEINIT;
 215         }
 216       }
 217       break;
 218     }
 219   }
 220
 221   *p_out = '\0';
 222 }
 223
 224
 225
 226 /*
 227  * Routines for decoding body-parts of a message.
 228  *
 229  * Since the "fetch" part of fetchmail gets a message body
 230  * one line at a time, we need to maintain some state variables
 231  * across multiple invokations of the UnMimeBodyline() routine.
 232  * The driver routine should call MimeBodyType() when all
 233  * headers have been received, and then UnMimeBodyline() for
 234  * every line in the message body.
 235  *
 236  */
 237 #define S_BODY_DATA 0
 238 #define S_BODY_HDR  1
 239
 240 /*
 241  * Flag indicating if we are currently processing
 242  * the headers or the body of a (multipart) message.
 243  */
 244 static int  BodyState = S_BODY_DATA;
 245
 246 /*
 247  * Flag indicating if we are in the process of decoding
 248  * a quoted-printable body part.
 249  */
 250 static int  CurrEncodingIsQP = 0;
 251
 252 /*
 253  * Delimiter for multipart messages. RFC 2046 states that this must
 254  * NEVER be longer than 70 characters. Add 3 for the two hyphens
 255  * at the beginning, and a terminating null.
 256  */
 257 #define MAX_DELIM_LEN 70
 258 static unsigned char MultipartDelimiter[MAX_DELIM_LEN+3];
 259
 260
 261 /* This string replaces the "Content-Transfer-Encoding: quoted-printable"
 262  * string in all headers, including those in body-parts. The replacement
 263  * must be no longer than the original string.
 264  */
 265 static const char ENC8BIT[] = "Content-Transfer-Encoding: 8bit";
 266 static void SetEncoding8bit(unsigned char *XferEncOfs)
 267 {
 268   unsigned char *p;
 269
 270   if (XferEncOfs != NULL) {
 271      memcpy(XferEncOfs, ENC8BIT, strlen(ENC8BIT));
 272
 273      /* If anything left, in this header, replace with whitespace */
 274      for (p=XferEncOfs+strlen(ENC8BIT); (*p >= ' '); p++) *p=' ';
 275   }
 276 }
 277
 278 static char *GetBoundary(char *CntType)
 279 {
 280   char *p1, *p2;
 281   int flag;
 282
 283   /* Find the "boundary" delimiter. It must be preceded with a ';'
 284    * and optionally some whitespace.
 285    */
 286   p1 = CntType;
 287   do {
 288     p2 = strchr(p1, ';');
 289     if (p2)
 290       for (p2++; isspace(*p2); p2++);
 291
 292     p1 = p2;
 293   } while ((p1) && (strncasecmp(p1, "boundary", 8) != 0));
 294
 295   if (p1 == NULL)
 296     /* No boundary delimiter */
 297     return NULL;
 298
 299   /* Skip "boundary", whitespace and '='; check that we do have a '=' */
 300   for (p1+=8, flag=0; (isspace(*p1) || (*p1 == '=')); p1++)
 301     flag |= (*p1 == '=');
 302   if (!flag)
 303     return NULL;
 304
 305   /* Find end of boundary delimiter string */
 306   if (*p1 == '\"') {
 307     /* The delimiter is inside quotes */
 308     p1++;
 309     p2 = strchr(p1, '\"');
 310     if (p2 == NULL)
 311       return NULL;  /* No closing '"' !?! */
 312   }
 313   else {
 314     /* There might be more text after the "boundary" string. */
 315     p2 = strchr(p1, ';');  /* Safe - delimiter with ';' must be in quotes */
 316   }
 317
 318   /* Zero-terminate the boundary string */
 319   if (p2 != NULL)
 320     *p2 = '\0';
 321
 322   return (p1 && strlen(p1)) ? p1 : NULL;
 323 }
 324
 325
 326 /*
 327  * This routine does three things:
 328  * 1) It determines - based on the message headers - whether the
 329  *    message body is a MIME message that may hold 8 bit data.
 330  *    - A message that has a "quoted-printable" or "8bit" transfer
 331  *      encoding is assumed to contain 8-bit data (when decoded).
 332  *    - A multipart message is assumed to contain 8-bit data
 333  *      when decoded (there might be quoted-printable body-parts).
 334  *    - All other messages are assumed NOT to include 8-bit data.
 335  * 2) It determines the delimiter-string used in multi-part message
 336  *    bodies.
 337  * 3) It sets the initial values of the CurrEncodingIsQP and BodyState
 338  *    variables, from the header contents.
 339  *
 340  * The return value is a bitmask.
 341  */
 342 int MimeBodyType(unsigned char *hdrs)
 343 {
 344   unsigned char *NxtHdr = hdrs;
 345   unsigned char *XferEnc, *XferEncOfs, *CntType, *MimeVer, *p;
 346   int  HdrsFound = 0;     /* We only look for three headers */
 347   int  BodyType;          /* Return value */
 348
 349   /* Setup for a standard (no MIME, no QP, 7-bit US-ASCII) message */
 350   MultipartDelimiter[0] = '\0';
 351   CurrEncodingIsQP = 0;
 352   BodyState = S_BODY_DATA;
 353   BodyType = 0;
 354
 355   /* Just in case ... */
 356   if (hdrs == NULL)
 357     return BodyType;
 358
 359   XferEnc = XferEncOfs = CntType = MimeVer = NULL;
 360
 361   do {
 362     if (strncasecmp("Content-Transfer-Encoding:", NxtHdr, 26) == 0) {
 363       XferEncOfs = NxtHdr;
 364       p = nxtaddr(NxtHdr);
 365       if (p != NULL) {
 366         XferEnc = (char *)xmalloc(strlen(p) + 1);
 367         strcpy(XferEnc, p);
 368         HdrsFound++;
 369       }
 370     }
 371     else if (strncasecmp("Content-Type:", NxtHdr, 13) == 0) {
 372       /*
 373        * This one is difficult. We cannot use the standard
 374        * nxtaddr() routine, since the boundary-delimiter is
 375        * (probably) enclosed in quotes - and thus appears
 376        * as an rfc822 comment, and nxtaddr() "eats" up any
 377        * spaces in the delimiter. So, we have to do this
 378        * by hand.
 379        */
 380
 381       /* Skip the "Content-Type:" part and whitespace after it */
 382       for (NxtHdr += 13; ((*NxtHdr == ' ') || (*NxtHdr == '\t')); NxtHdr++);
 383
 384       /*
 385        * Get the full value of the Content-Type header;
 386        * it might span multiple lines. So search for
 387        * a newline char, but ignore those that have a
 388        * have a TAB or space just after the NL (continued
 389        * lines).
 390        */
 391       p = NxtHdr-1;
 392       do {
 393         p=strchr((p+1),'\n');
 394       } while ( (p != NULL) && ((*(p+1) == '\t') || (*(p+1) == ' ')) );
 395       if (p == NULL) p = NxtHdr + strlen(NxtHdr);
 396
 397       CntType = (char *)xmalloc(p-NxtHdr+2);
 398       strncpy(CntType, NxtHdr, (p-NxtHdr));
 399       *(CntType+(p-NxtHdr)) = '\0';
 400       HdrsFound++;
 401     }
 402     else if (strncasecmp("MIME-Version:", NxtHdr, 13) == 0) {
 403       p = nxtaddr(NxtHdr);
 404       if (p != NULL) {
 405         MimeVer = (char *)xmalloc(strlen(p) + 1);
 406         strcpy(MimeVer, p);
 407         HdrsFound++;
 408       }
 409     }
 410
 411     NxtHdr = (strchr(NxtHdr, '\n'));
 412     if (NxtHdr != NULL) NxtHdr++;
 413   } while ((NxtHdr != NULL) && (*NxtHdr) && (HdrsFound != 3));
 414
 415
 416   /* Done looking through the headers, now check what they say */
 417   if ((MimeVer != NULL) && (strcmp(MimeVer, "1.0") == 0)) {
 418
 419     /* Check Content-Type to see if this is a multipart message */
 420     if ( (CntType != NULL) &&
 421          ((strncasecmp(CntType, "multipart/", 10) == 0) ||
 422           (strncasecmp(CntType, "message/", 8) == 0)) ) {
 423
 424       char *p1 = GetBoundary(CntType);
 425
 426       if (p1 != NULL) {
 427         /* The actual delimiter is "--" followed by
 428            the boundary string */
 429         strcpy(MultipartDelimiter, "--");
 430         strncat(MultipartDelimiter, p1, MAX_DELIM_LEN);
 431         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 432       }
 433     }
 434
 435     /*
 436      * Check Content-Transfer-Encoding, but
 437      * ONLY for non-multipart messages (BodyType == 0).
 438      */
 439     if ((XferEnc != NULL) && (BodyType == 0)) {
 440       if (strcasecmp(XferEnc, "quoted-printable") == 0) {
 441         CurrEncodingIsQP = 1;
 442         BodyType = (MSG_IS_8BIT | MSG_NEEDS_DECODE);
 443         SetEncoding8bit(XferEncOfs);
 444       }
 445       else if (strcasecmp(XferEnc, "7bit") == 0) {
 446         CurrEncodingIsQP = 0;
 447         BodyType = (MSG_IS_7BIT);
 448       }
 449       else if (strcasecmp(XferEnc, "8bit") == 0) {
 450         CurrEncodingIsQP = 0;
 451         BodyType = (MSG_IS_8BIT);
 452       }
 453     }
 454
 455   }
 456
 457   if (MimeVer) free(MimeVer);
 458   if (XferEnc) free(XferEnc);
 459   if (CntType) free(CntType);
 460
 461   return BodyType;
 462 }
 463
 464
 465 /*
 466  * Decode one line of data containing QP data.
 467  * Return flag set if this line ends with a soft line-break.
 468  * 'bufp' is modified to point to the end of the output buffer.
 469  */
 470 static int DoOneQPLine(unsigned char **bufp, int collapsedoubledot)
 471 {
 472   unsigned char *buf = *bufp;
 473   unsigned char *p_in, *p_out, *p;
 474   int n;
 475   int ret = 0;
 476
 477   p_in = buf;
 478   if (collapsedoubledot && (strncmp(buf, "..", 2) == 0))
 479     p_in++;
 480
 481   for (p_out = buf; (*p_in); ) {
 482     p = strchr(p_in, '=');
 483     if (p == NULL) {
 484       /* No more QP data, just move remainder into place */
 485       n = strlen(p_in);
 486       memmove(p_out, p_in, n);
 487       p_in += n; p_out += n;
 488     }
 489     else {
 490       if (p > p_in) {
 491         /* There are some uncoded chars at the beginning. */
 492         n = (p - p_in);
 493         memmove(p_out, p_in, n);
 494         p_out += n;
 495       }
 496
 497       switch (*(p+1)) {
 498       case '\0': case '\r': case '\n':
 499         /* Soft line break, skip '=' */
 500         p_in = p+1;
 501         if (*p_in == '\r') p_in++;
 502         if (*p_in == '\n') p_in++;
 503         ret = 1;
 504         break;
 505
 506       default:
 507         /* There is a QP encoded byte */
 508         if (qp_char(*(p+1), *(p+2), p_out) == 0) {
 509           p_in = p+3;
 510         }
 511         else {
 512           /* Invalid QP data - pass through unchanged. */
 513           *p_out = '=';
 514           p_in = p+1;
 515         }
 516         p_out++;
 517         break;
 518       }
 519     }
 520   }
 521
 522   *p_out = '\0';
 523   *bufp = p_out;
 524   return ret;
 525 }
 526
 527
 528 /* This is called once per line in the message body.  We need to scan
 529  * all lines in the message body for the multipart delimiter string,
 530  * and handle any body-part headers in such messages (these can toggle
 531  * qp-decoding on and off).
 532  *
 533  * Note: Messages that are NOT multipart-messages go through this
 534  * routine quickly, since BodyState will always be S_BODY_DATA,
 535  * and MultipartDelimiter is NULL.
 536  *
 537  * Return flag set if this line ends with a soft line-break.
 538  * 'bufp' is modified to point to the end of the output buffer.
 539  */
 540
 541 int UnMimeBodyline(unsigned char **bufp, int collapsedoubledot)
 542 {
 543   unsigned char *buf = *bufp;
 544   int ret = 0;
 545
 546   switch (BodyState) {
 547   case S_BODY_HDR:
 548     UnMimeHeader(buf);   /* Headers in body-parts can be encoded, too! */
 549     if (strncasecmp("Content-Transfer-Encoding:", buf, 26) == 0) {
 550       char *XferEnc;
 551
 552       XferEnc = nxtaddr(buf);
 553       if ((XferEnc != NULL) && (strcasecmp(XferEnc, "quoted-printable") == 0)) {
 554         CurrEncodingIsQP = 1;
 555         SetEncoding8bit(buf);
 556       }
 557     }
 558     else if ((*buf == '\0') || (*buf == '\n') || (strcmp(buf, "\r\n") == 0))
 559       BodyState = S_BODY_DATA;
 560
 561     *bufp = (buf + strlen(buf));
 562     break;
 563
 564   case S_BODY_DATA:
 565     if ((*MultipartDelimiter) &&
 566         (strncmp(buf, MultipartDelimiter, strlen(MultipartDelimiter)) == 0)) {
 567       BodyState = S_BODY_HDR;
 568       CurrEncodingIsQP = 0;
 569     }
 570
 571     if (CurrEncodingIsQP)
 572       ret = DoOneQPLine(bufp, collapsedoubledot);
 573     else
 574      *bufp = (buf + strlen(buf));
 575     break;
 576   }
 577
 578   return ret;
 579 }
 580
 581
 582 #ifdef STANDALONE
 583 #include <stdio.h>
 584 #include <unistd.h>
 585
 586 char *program_name = "unmime";
 587
 588 #define BUFSIZE_INCREMENT 4096
 589
 590 #ifdef DEBUG
 591 #define DBG_FWRITE(B,L,BS,FD) fwrite(B, L, BS, FD)
 592 #else
 593 #define DBG_FWRITE(B,L,BS,FD)
 594 #endif
 595
 596 int main(int argc, char *argv[])
 597 {
 598   unsigned int BufSize;
 599   unsigned char *buffer, *buf_p;
 600   int nl_count, i, bodytype;
 601
 602 #ifdef DEBUG
 603   pid_t pid;
 604   FILE *fd_orig, *fd_conv;
 605   char fnam[100];
 606
 607   pid = getpid();
 608   sprintf(fnam, "/tmp/i_unmime.%x", pid);
 609   fd_orig = fopen(fnam, "w");
 610   sprintf(fnam, "/tmp/o_unmime.%x", pid);
 611   fd_conv = fopen(fnam, "w");
 612 #endif
 613
 614   BufSize = BUFSIZE_INCREMENT;    /* Initial size of buffer */
 615   buf_p = buffer = (unsigned char *) xmalloc(BufSize);
 616   nl_count = 0;
 617
 618   do {
 619     i = fread(buf_p, 1, 1, stdin);
 620     switch (*buf_p) {
 621      case '\n':
 622        nl_count++;
 623        break;
 624
 625      case '\r':
 626        break;
 627
 628      default:
 629        nl_count = 0;
 630        break;
 631     }
 632
 633     buf_p++;
 634     if ((buf_p - buffer) == BufSize) {
 635        /* Buffer is full! Get more room. */
 636        buffer = xrealloc(buffer, BufSize+BUFSIZE_INCREMENT);
 637        buf_p = buffer + BufSize;
 638        BufSize += BUFSIZE_INCREMENT;
 639     }
 640   } while ((i > 0) && (nl_count < 2));
 641
 642   *buf_p = '\0';
 643   DBG_FWRITE(buffer, strlen(buffer), 1, fd_orig);
 644
 645   UnMimeHeader(buffer);
 646   bodytype = MimeBodyType(buffer);
 647
 648   i = strlen(buffer);
 649   fwrite(buffer, i, 1, stdout);
 650   DBG_FWRITE(buffer, i, 1, fd_conv);
 651
 652   do {
 653      buf_p = (buffer - 1);
 654      do {
 655         buf_p++;
 656         i = fread(buf_p, 1, 1, stdin);
 657      } while ((i == 1) && (*buf_p != '\n'));
 658      if (i == 1) buf_p++;
 659      *buf_p = '\0';
 660      DBG_FWRITE(buf, (buf_p - buffer), 1, fd_orig);
 661
 662      if (buf_p > buffer) {
 663         if (bodytype & MSG_NEEDS_DECODE) {
 664            buf_p = buffer;
 665            UnMimeBodyline(&buf_p, 0);
 666         }
 667         fwrite(buffer, (buf_p - buffer), 1, stdout);
 668         DBG_FWRITE(buffer, (buf_p - buffer), 1, fd_conv);
 669      }
 670   } while (buf_p > buffer);
 671
 672   free(buffer);
 673   fflush(stdout);
 674
 675 #ifdef DEBUG
 676   fclose(fd_orig);
 677   fclose(fd_conv);
 678 #endif
 679
 680   return 0;
 681 }
 682 #endif
 683