Extract PDF images with JavaScript
To extract images from a PDF using pdf.js in JavaScript, follow these steps:
- Load the PDF: Initialize pdf.js and load the PDF document.
- Access Each Page: For each page, use page.getOperatorList() to access its operators.
- Extract Images: Scan the operator list for paintImageXObject commands; each one's first argument is the ID of an embedded image object.
- Render Image: Render the images onto a canvas for display or extraction.
Here’s an example setup using pdf.js (HTML):
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.10.377/pdf.min.js"></script>
<canvas id="imageCanvas"></canvas>
<script>
const url = 'path/to/your.pdf';

/**
 * Resolve the decoded image object that a paint operator refers to.
 *
 * The callback form of `get` is used because the synchronous form throws
 * when the image has not finished decoding, which can happen even after
 * `getOperatorList()` has resolved.
 *
 * @param {object} page - pdf.js page proxy returned by `pdf.getPage()`.
 * @param {string} imgName - Object id taken from the operator's first argument.
 * @returns {Promise<object|null>} The image object (null if pdf.js could not decode it).
 */
function loadImageObject(page, imgName) {
  // Ids starting with "g_" are images cached across pages; pdf.js keeps
  // those in commonObjs rather than in the per-page objs store.
  const store = imgName.startsWith('g_') ? page.commonObjs : page.objs;
  return new Promise((resolve) => {
    store.get(imgName, resolve);
  });
}

/**
 * Walk every page of the PDF and hand each embedded image to
 * `renderImageToCanvas`, in document order.
 *
 * @param {string} pdfUrl - Location of the PDF to load.
 * @returns {Promise<void>} Resolves once all pages have been processed.
 */
async function extractImages(pdfUrl) {
  const pdf = await pdfjsLib.getDocument(pdfUrl).promise;
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum);
    const { fnArray, argsArray } = await page.getOperatorList();
    for (const [i, fn] of fnArray.entries()) {
      if (fn !== pdfjsLib.OPS.paintImageXObject) {
        continue;
      }
      const img = await loadImageObject(page, argsArray[i][0]);
      // Skip images that failed to decode instead of crashing the renderer.
      if (img) {
        renderImageToCanvas(img);
      }
    }
  }
}

extractImages(url).catch((err) => {
  console.error('Failed to extract images from PDF:', err);
});
/**
 * Expand a decoded pdf.js image object into tightly packed RGBA bytes.
 *
 * @param {{width: number, height: number, kind: number, data: Uint8Array|Uint8ClampedArray}} img
 *   Image object as produced by pdf.js (`kind` follows pdf.js ImageKind).
 * @returns {Uint8ClampedArray} `width * height * 4` bytes in RGBA order.
 * @throws {Error} If `img.kind` is not one of the known pixel layouts.
 */
function toRgbaPixels(img) {
  // pdf.js ImageKind values.
  const GRAYSCALE_1BPP = 1;
  const RGB_24BPP = 2;
  const RGBA_32BPP = 3;

  const { width, height, data } = img;
  const rgba = new Uint8ClampedArray(width * height * 4);

  switch (img.kind) {
    case RGBA_32BPP:
      rgba.set(data.subarray(0, rgba.length));
      break;
    case RGB_24BPP:
      for (let src = 0, dest = 0; dest < rgba.length; src += 3, dest += 4) {
        rgba[dest] = data[src];
        rgba[dest + 1] = data[src + 1];
        rgba[dest + 2] = data[src + 2];
        rgba[dest + 3] = 255; // source has no alpha channel: fully opaque
      }
      break;
    case GRAYSCALE_1BPP: {
      // One bit per pixel, most significant bit first, rows padded to whole bytes.
      const rowBytes = (width + 7) >> 3;
      for (let y = 0; y < height; y++) {
        for (let x = 0; x < width; x++) {
          const bit = data[y * rowBytes + (x >> 3)] & (0x80 >> (x & 7));
          const dest = (y * width + x) * 4;
          rgba.fill(bit ? 255 : 0, dest, dest + 3);
          rgba[dest + 3] = 255;
        }
      }
      break;
    }
    default:
      throw new Error(`Unsupported pdf.js image kind: ${img.kind}`);
  }
  return rgba;
}

/**
 * Draw one extracted PDF image onto the `#imageCanvas` element, resizing the
 * canvas to the image's dimensions. Any previous canvas content is replaced.
 *
 * `putImageData` only accepts real `ImageData` instances, so raw pdf.js image
 * objects are converted to RGBA first.
 *
 * @param {object} img - An `ImageData`, or a pdf.js image object with
 *   `width`, `height` and either `bitmap` or `kind` + `data`.
 * @throws {TypeError} If `img` is null or undefined.
 * @throws {Error} If the pixel layout of `img` is not supported.
 */
function renderImageToCanvas(img) {
  if (img == null) {
    throw new TypeError('renderImageToCanvas: no image data to render');
  }

  const canvas = document.getElementById('imageCanvas');
  const ctx = canvas.getContext('2d');
  canvas.width = img.width;
  canvas.height = img.height;

  if (img instanceof ImageData) {
    ctx.putImageData(img, 0, 0);
    return;
  }
  if (img.bitmap) {
    // NOTE(review): newer pdf.js builds appear to deliver an ImageBitmap here
    // instead of raw bytes — confirm against the pdf.js version in use.
    ctx.drawImage(img.bitmap, 0, 0);
    return;
  }
  const pixels = toRgbaPixels(img);
  ctx.putImageData(new ImageData(pixels, img.width, img.height), 0, 0);
}
</script>
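This script locates each embedded image in the PDF and draws it onto the canvas with the ID imageCanvas. Because that single canvas is reused, each image replaces the previous one; to keep every image visible, create a separate canvas for each one.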