/*
 * Copyright (c) 1997, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0, which is available at
 * http://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

package org.jvnet.mimepull;

import java.io.InputStream;
import java.io.IOException;
import java.util.*;
import java.util.logging.Logger;
import java.nio.ByteBuffer;
import java.util.logging.Level;

Pull parser for MIME messages. Applications can use the pull API to parse MIME messages lazily.
For example:

MIMEParser parser = ... Iterator it = parser.iterator(); while(it.hasNext()) { MIMEEvent event = it.next(); ... }

Author:Jitendra Kotamraju
/**
 * Pull parser for MIME messages. Applications drive the parsing lazily by
 * pulling events from the iterator, for example:
 *
 * <pre>{@code
 *   MIMEParser parser = ...
 *   Iterator<MIMEEvent> it = parser.iterator();
 *   while (it.hasNext()) {
 *       MIMEEvent event = it.next();
 *       ...
 *   }
 * }</pre>
 *
 * @author Jitendra Kotamraju
 */
class MIMEParser implements Iterable<MIMEEvent> {

    private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());

    /** Charset name used to decode header lines. */
    private static final String HEADER_ENCODING = "ISO8859-1";

    // Actually, the grammar doesn't support whitespace characters
    // after boundary. But the mail implementation checks for it.
    // We will only check for these many whitespace characters after boundary
    private static final int NO_LWSP = 1000;

    /** States of the parser, in the order a well-formed message visits them. */
    private enum STATE {
        START_MESSAGE,
        SKIP_PREAMBLE,
        START_PART,
        HEADERS,
        BODY,
        END_PART,
        END_MESSAGE
    }

    /** Current parser state; advanced by the event iterator. */
    private STATE state = STATE.START_MESSAGE;

    /** Stream the MIME message is read from. */
    private final InputStream in;

    /** Bytes of the boundary delimiter, i.e. "--" followed by the boundary parameter. */
    private final byte[] bndbytes;

    /** Length of {@link #bndbytes}. */
    private final int bl;

    private final MIMEConfig config;

    /** Boyer-Moore bad character shift table, indexed by the low 7 bits of a byte. */
    private final int[] bcs = new int[128];

    /** Boyer-Moore good suffix shift table, one entry per boundary byte. */
    private final int[] gss;
Have we parsed the data from our InputStream yet?
/** * Have we parsed the data from our InputStream yet? */
private boolean parsed; /* * Read and process body partsList until we see the * terminating boundary line (or EOF). */ private boolean done = false; private boolean eof; private final int capacity; private byte[] buf; private int len; private boolean bol; // beginning of the line /* * Parses the MIME content. At the EOF, it also closes input stream */ MIMEParser(InputStream in, String boundary, MIMEConfig config) { this.in = in; this.bndbytes = getBytes("--"+boundary); bl = bndbytes.length; this.config = config; gss = new int[bl]; compileBoundaryPattern(); // \r\n + boundary + "--\r\n" + lots of LWSP capacity = config.chunkSize+2+bl+4+NO_LWSP; createBuf(capacity); }
Returns iterator for the parsing events. Use the iterator to advance the parsing.
Returns:iterator for parsing events
/** * Returns iterator for the parsing events. Use the iterator to advance * the parsing. * * @return iterator for parsing events */
@Override public Iterator<MIMEEvent> iterator() { return new MIMEEventIterator(); } class MIMEEventIterator implements Iterator<MIMEEvent> { @Override public boolean hasNext() { return !parsed; } @Override public MIMEEvent next() { if (parsed) { throw new NoSuchElementException(); } switch(state) { case START_MESSAGE : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);} state = STATE.SKIP_PREAMBLE; return MIMEEvent.START_MESSAGE; case SKIP_PREAMBLE : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);} skipPreamble(); // fall through case START_PART : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);} state = STATE.HEADERS; return MIMEEvent.START_PART; case HEADERS : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);} InternetHeaders ih = readHeaders(); state = STATE.BODY; bol = true; return new MIMEEvent.Headers(ih); case BODY : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);} ByteBuffer buf = readBody(); bol = false; return new MIMEEvent.Content(buf); case END_PART : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);} if (done) { state = STATE.END_MESSAGE; } else { state = STATE.START_PART; } return MIMEEvent.END_PART; case END_MESSAGE : if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);} parsed = true; return MIMEEvent.END_MESSAGE; default : throw new MIMEParsingException("Unknown Parser state = "+state); } } @Override public void remove() { throw new UnsupportedOperationException(); } }
Collects the headers for the current part by parsing the message stream.
Returns:headers for the current part
/**
 * Collects the headers for the current part by parsing the message stream.
 * The buffer is topped up first (unless the stream is already exhausted),
 * then the header lines are read through a fresh {@code LineInputStream}.
 *
 * @return headers for the current part
 */
private InternetHeaders readHeaders() {
    if (!eof) {
        fillBuf();
    }
    final LineInputStream headerLines = new LineInputStream();
    return new InternetHeaders(headerLines);
}
Reads and saves a chunk of the current attachment part's content. At the end of this method, buf should have the remaining data at index 0.
Returns:a chunk of the part's content
/** * Reads and saves the part of the current attachment part's content. * At the end of this method, buf should have the remaining data * at index 0. * * @return a chunk of the part's content * */
private ByteBuffer readBody() { if (!eof) { fillBuf(); } int start = match(buf, 0, len); // matches boundary if (start == -1) { // No boundary is found assert eof || len >= config.chunkSize; int chunkSize = eof ? len : config.chunkSize; if (eof) { done = true; throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary."); } return adjustBuf(chunkSize, len-chunkSize); } // Found boundary. // Is it at the start of a line ? int chunkLen = start; if (bol && start == 0) { // nothing to do } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) { --chunkLen; if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') { --chunkLen; } } else { return adjustBuf(start+1, len-start-1); // boundary is not at beginning of a line } if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') { state = STATE.END_PART; done = true; return adjustBuf(chunkLen, 0); } // Consider all the whitespace in boundary+whitespace+"\r\n" int lwsp = 0; for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) { ++lwsp; } // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n" if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') { state = STATE.END_PART; return adjustBuf(chunkLen, len-start-bl-lwsp-1); } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') { state = STATE.END_PART; return adjustBuf(chunkLen, len-start-bl-lwsp-2); } else if (start+bl+lwsp+1 < len) { return adjustBuf(chunkLen+1, len-chunkLen-1); // boundary string in a part data } else if (eof) { done = true; throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary."); } // Some more data needed to determine if it is indeed a proper boundary return adjustBuf(chunkLen, len-chunkLen); }
Returns a chunk from the original buffer. A new buffer is created with the remaining bytes.
Params:
  • chunkSize – create a chunk with these many bytes
  • remaining – bytes from the end of the buffer that need to be copied to the beginning of the new buffer
Returns:chunk
/** * Returns a chunk from the original buffer. A new buffer is * created with the remaining bytes. * * @param chunkSize create a chunk with these many bytes * @param remaining bytes from the end of the buffer that need to be copied to * the beginning of the new buffer * @return chunk */
private ByteBuffer adjustBuf(int chunkSize, int remaining) { assert buf != null; assert chunkSize >= 0; assert remaining >= 0; byte[] temp = buf; // create a new buf and adjust it without this chunk createBuf(remaining); System.arraycopy(temp, len-remaining, buf, 0, remaining); len = remaining; return ByteBuffer.wrap(temp, 0, chunkSize); } private void createBuf(int min) { buf = new byte[min < capacity ? capacity : min]; }
Skips the preamble to find the first attachment part
/** * Skips the preamble to find the first attachment part */
private void skipPreamble() { while(true) { if (!eof) { fillBuf(); } int start = match(buf, 0, len); // matches boundary if (start == -1) { // No boundary is found if (eof) { throw new MIMEParsingException("Missing start boundary"); } else { adjustBuf(len-bl+1, bl-1); continue; } } if (start > config.chunkSize) { adjustBuf(start, len-start); continue; } // Consider all the whitespace boundary+whitespace+"\r\n" int lwsp = 0; for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) { ++lwsp; } // Check for \n or \r\n if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) { if (buf[start+bl+lwsp] == '\n') { adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1); break; } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') { adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2); break; } } adjustBuf(start+1, len-start-1); } if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);} } private static byte[] getBytes(String s) { char [] chars= s.toCharArray(); int size = chars.length; byte[] bytes = new byte[size]; for (int i = 0; i < size;) { bytes[i] = (byte) chars[i++]; } return bytes; }
Boyer-Moore search method. Copied from java.util.regex.Pattern.java. Precalculates the arrays needed to generate the bad character shift and the good suffix shift. Only the last seven bits are used to see if chars match; this keeps the tables small and covers the heavily used ASCII range, but occasionally results in an aliased match for the bad character shift.
/** * Boyer-Moore search method. Copied from java.util.regex.Pattern.java * * Pre calculates arrays needed to generate the bad character * shift and the good suffix shift. Only the last seven bits * are used to see if chars match; This keeps the tables small * and covers the heavily used ASCII range, but occasionally * results in an aliased match for the bad character shift. */
private void compileBoundaryPattern() { int i, j; // Precalculate part of the bad character shift // It is a table for where in the pattern each // lower 7-bit value occurs for (i = 0; i < bndbytes.length; i++) { bcs[bndbytes[i]&0x7F] = i + 1; } // Precalculate the good suffix shift // i is the shift amount being considered NEXT: for (i = bndbytes.length; i > 0; i--) { // j is the beginning index of suffix being considered for (j = bndbytes.length - 1; j >= i; j--) { // Testing for good suffix if (bndbytes[j] == bndbytes[j-i]) { // src[j..len] is a good suffix gss[j-1] = i; } else { // No match. The array has already been // filled up with correct values before. continue NEXT; } } // This fills up the remaining of optoSft // any suffix can not have larger shift amount // then its sub-suffix. Why??? while (j > 0) { gss[--j] = i; } } // Set the guard value because of unicode compression gss[bndbytes.length -1] = 1; }
Finds the boundary in the given buffer using the Boyer-Moore algorithm. Copied from java.util.regex.Pattern.java
Params:
  • mybuf – buffer in which the boundary is searched
  • off – start index in mybuf
  • len – number of bytes in mybuf
Returns:-1 if there is no match or index where the match starts
/** * Finds the boundary in the given buffer using Boyer-Moore algo. * Copied from java.util.regex.Pattern.java * * @param mybuf boundary to be searched in this mybuf * @param off start index in mybuf * @param len number of bytes in mybuf * * @return -1 if there is no match or index where the match starts */
private int match(byte[] mybuf, int off, int len) { int last = len - bndbytes.length; // Loop over all possible match positions in text NEXT: while (off <= last) { // Loop over pattern from right to left for (int j = bndbytes.length - 1; j >= 0; j--) { byte ch = mybuf[off+j]; if (ch != bndbytes[j]) { // Shift search to the right by the maximum of the // bad character shift and the good suffix shift off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]); continue NEXT; } } // Entire pattern matched starting at off return off; } return -1; }
Fills the remaining space in buf up to its full capacity.
/** * Fills the remaining buf to the full capacity */
private void fillBuf() { if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);} assert !eof; while(len < buf.length) { int read; try { read = in.read(buf, len, buf.length-len); } catch(IOException ioe) { throw new MIMEParsingException(ioe); } if (read == -1) { eof = true; try { if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");} in.close(); } catch(IOException ioe) { throw new MIMEParsingException(ioe); } break; } else { len += read; } } if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);} } private void doubleBuf() { byte[] temp = new byte[2*len]; System.arraycopy(buf, 0, temp, 0, len); buf = temp; if (!eof) { fillBuf(); } } class LineInputStream { private int offset; /* * Read a line containing only ASCII characters from the input * stream. A line is terminated by a CR or NL or CR-NL sequence. * A common error is a CR-CR-NL sequence, which will also terminate * a line. * The line terminator is not returned as part of the returned * String. Returns null if no data is available. <p> * * This class is similar to the deprecated * <code>DataInputStream.readLine()</code> */ public String readLine() throws IOException { int hdrLen = 0; int lwsp = 0; while(offset+hdrLen < len) { if (buf[offset+hdrLen] == '\n') { lwsp = 1; break; } if (offset+hdrLen+1 == len) { doubleBuf(); } if (offset+hdrLen+1 >= len) { // No more data in the stream assert eof; return null; } if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') { lwsp = 2; break; } ++hdrLen; } if (hdrLen == 0) { adjustBuf(offset+lwsp, len-offset-lwsp); return null; } String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING); offset += hdrLen+lwsp; return hdr; } } }