package org.apache.poi.ss.extractor;
import static org.apache.poi.util.StringUtil.endsWithIgnoreCase;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.ClassIDPredefined;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Drawing;
import org.apache.poi.ss.usermodel.ObjectData;
import org.apache.poi.ss.usermodel.Picture;
import org.apache.poi.ss.usermodel.PictureData;
import org.apache.poi.ss.usermodel.Shape;
import org.apache.poi.ss.usermodel.ShapeContainer;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
@Beta
public class implements Iterable<EmbeddedExtractor> {
private static final POILogger = POILogFactory.getLogger(EmbeddedExtractor.class);
private static final int = 1_000_000;
private static final String = "binary/octet-stream";
private static final String = "application/pdf";
private static final String = "application/msword";
private static final String = "application/vnd.ms-excel";
@Override
public Iterator<EmbeddedExtractor> () {
EmbeddedExtractor[] ee = {
new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor()
};
return Arrays.asList(ee).iterator();
}
public EmbeddedData (DirectoryNode src) throws IOException {
for (EmbeddedExtractor ee : this) {
if (ee.canExtract(src)) {
return ee.extract(src);
}
}
return null;
}
public EmbeddedData (Picture src) throws IOException {
for (EmbeddedExtractor ee : this) {
if (ee.canExtract(src)) {
return ee.extract(src);
}
}
return null;
}
public List<EmbeddedData> (Sheet sheet) throws IOException {
Drawing<?> patriarch = sheet.getDrawingPatriarch();
if (null == patriarch){
return Collections.emptyList();
}
List<EmbeddedData> embeddings = new ArrayList<>();
extractAll(patriarch, embeddings);
return embeddings;
}
protected void (ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
for (Shape shape : parent) {
EmbeddedData data = null;
if (shape instanceof ObjectData) {
ObjectData od = (ObjectData)shape;
try {
if (od.hasDirectoryEntry()) {
data = extractOne((DirectoryNode)od.getDirectory());
} else {
data = new EmbeddedData(od.getFileName(), od.getObjectData(), od.getContentType());
}
} catch (Exception e) {
LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
}
} else if (shape instanceof Picture) {
data = extractOne((Picture)shape);
} else if (shape instanceof ShapeContainer) {
extractAll((ShapeContainer<?>)shape, embeddings);
}
if (data == null) {
continue;
}
data.setShape(shape);
String filename = data.getFilename();
String extension = (filename == null || filename.lastIndexOf('.') == -1) ? ".bin" : filename.substring(filename.lastIndexOf('.'));
if (filename == null || filename.isEmpty() || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
filename = shape.getShapeName();
if (filename != null) {
filename += extension;
}
}
if (filename == null || filename.isEmpty()) {
filename = "picture_" + embeddings.size() + extension;
}
filename = filename.trim();
data.setFilename(filename);
embeddings.add(data);
}
}
public boolean (DirectoryNode source) {
return false;
}
public boolean (Picture source) {
return false;
}
protected EmbeddedData (DirectoryNode dn) throws IOException {
assert(canExtract(dn));
ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
try (POIFSFileSystem dest = new POIFSFileSystem()) {
copyNodes(dn, dest.getRoot());
dest.writeFilesystem(bos);
}
return new EmbeddedData(dn.getName(), bos.toByteArray(), CONTENT_TYPE_BYTES);
}
protected EmbeddedData (Picture source) throws IOException {
return null;
}
public static class extends EmbeddedExtractor {
@Override
public boolean (DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return ClassIDPredefined.lookup(clsId) == ClassIDPredefined.OLE_V1_PACKAGE;
}
@Override
public EmbeddedData (DirectoryNode dn) throws IOException {
try {
Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn);
return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), CONTENT_TYPE_BYTES);
} catch (Ole10NativeException e) {
throw new IOException(e);
}
}
}
static class extends EmbeddedExtractor {
static ClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");
@Override
public boolean (DirectoryNode dn) {
ClassID clsId = dn.getStorageClsid();
return (PdfClassID.equals(clsId) || dn.hasEntry("CONTENTS"));
}
@Override
public EmbeddedData (DirectoryNode dn) throws IOException {
try(ByteArrayOutputStream bos = new ByteArrayOutputStream();
InputStream is = dn.createDocumentInputStream("CONTENTS")) {
IOUtils.copy(is, bos);
return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF);
}
}
@Override
public boolean (Picture source) {
PictureData pd = source.getPictureData();
return (pd != null && pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
}
@Override
protected EmbeddedData (Picture source) throws IOException {
PictureData pd = source.getPictureData();
if (pd == null || pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
return null;
}
byte[] pictureBytes = pd.getData();
int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
if (idxStart == -1) {
return null;
}
int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
if (idxEnd == -1) {
return null;
}
int pictureBytesLen = idxEnd-idxStart+6;
byte[] pdfBytes = IOUtils.safelyAllocate(pictureBytesLen, MAX_RECORD_LENGTH);
System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);
String filename = source.getShapeName().trim();
if (!endsWithIgnoreCase(filename, ".pdf")) {
filename += ".pdf";
}
return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF);
}
}
static class extends EmbeddedExtractor {
@Override
public boolean (DirectoryNode dn) {
return dn.hasEntry("package");
}
@Override
public EmbeddedData (DirectoryNode dn) throws IOException {
ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
String contentType = null;
String ext = null;
if (clsId != null) {
contentType = clsId.getContentType();
ext = clsId.getFileExtension();
}
if (contentType == null || ext == null) {
contentType = "application/zip";
ext = ".zip";
}
DocumentInputStream dis = dn.createDocumentInputStream("package");
byte[] data = IOUtils.toByteArray(dis);
dis.close();
return new EmbeddedData(dn.getName()+ext, data, contentType);
}
}
static class extends EmbeddedExtractor {
@Override
public boolean (DirectoryNode dn) {
return canExtractExcel(dn) || canExtractWord(dn);
}
protected boolean (DirectoryNode dn) {
ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
return (ClassIDPredefined.EXCEL_V7 == clsId
|| ClassIDPredefined.EXCEL_V8 == clsId
|| dn.hasEntry("Workbook") );
}
protected boolean (DirectoryNode dn) {
ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid());
return (ClassIDPredefined.WORD_V7 == clsId
|| ClassIDPredefined.WORD_V8 == clsId
|| dn.hasEntry("WordDocument"));
}
@Override
public EmbeddedData (DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
if (canExtractExcel(dn)) {
ed.setFilename(dn.getName() + ".xls");
ed.setContentType(CONTENT_TYPE_XLS);
} else if (canExtractWord(dn)) {
ed.setFilename(dn.getName() + ".doc");
ed.setContentType(CONTENT_TYPE_DOC);
}
return ed;
}
}
static class extends EmbeddedExtractor {
@Override
public boolean (DirectoryNode dn) {
return true;
}
@Override
public EmbeddedData (DirectoryNode dn) throws IOException {
EmbeddedData ed = super.extract(dn);
ed.setFilename(dn.getName() + ".ole");
return ed;
}
}
protected static void (DirectoryNode src, DirectoryNode dest) throws IOException {
for (Entry e : src) {
if (e instanceof DirectoryNode) {
DirectoryNode srcDir = (DirectoryNode)e;
DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
destDir.setStorageClsid(srcDir.getStorageClsid());
copyNodes(srcDir, destDir);
} else {
try (InputStream is = src.createDocumentInputStream(e)) {
dest.createDocument(e.getName(), is);
}
}
}
}
private static int (byte[] data, int offset, byte[] pattern) {
int[] failure = computeFailure(pattern);
int j = 0;
if (data.length == 0) {
return -1;
}
for (int i = offset; i < data.length; i++) {
while (j > 0 && pattern[j] != data[i]) {
j = failure[j - 1];
}
if (pattern[j] == data[i]) { j++; }
if (j == pattern.length) {
return i - pattern.length + 1;
}
}
return -1;
}
private static int[] (byte[] pattern) {
int[] failure = new int[pattern.length];
int j = 0;
for (int i = 1; i < pattern.length; i++) {
while (j > 0 && pattern[j] != pattern[i]) {
j = failure[j - 1];
}
if (pattern[j] == pattern[i]) {
j++;
}
failure[i] = j;
}
return failure;
}
}