Skip to content

Commit

Permalink
🚀 Use probe-image-size to speed up fetching image dimensions
Browse files Browse the repository at this point in the history
no issue

Fetching images across the network to get their dimensions is really slow. In the previous implementation we were fetching the entire image before using it to determine the image dimensions. However, for most images we don't need all of the data! Image dimensions can usually be determined from the headers or other information that is stored near the beginning of an image file, so we should take advantage of that to massively speed up our dimension calculations and save a lot of network bandwidth.

- use `probe-image-size` to fetch only enough of an image that is needed to get the dimensions
- fall back to `image-size` for formats which `probe-image-size` doesn't support
- replace `got` with `request` so we're not including two different request libraries
- use `request-promise` wrapper library to simplify the code (uses `bluebird` under the hood, which is the same as the `async` library that we're using)

As an example of the speedup offered, here are my tests which were using this library inside Ghost. The test page had 10 reasonably sized external images and the reported time is the initial amperization before caching.

```
image-size only:
INFO amp.parse http://ghost.blog/2019/06/08/test/ 52278ms
INFO amp.parse http://ghost.blog/2019/06/08/test/ 52717ms
INFO amp.parse http://ghost.blog/2019/06/08/test/ 50582ms

average: 51,859ms

probe-image-size w/ image-size fallback:
INFO amp.parse http://ghost.blog/2019/06/08/test/ 11147ms
INFO amp.parse http://ghost.blog/2019/06/08/test/ 12297ms
INFO amp.parse http://ghost.blog/2019/06/08/test/ 11188ms

average: 11,544ms
```

Result: ~4.5x faster
  • Loading branch information
kevinansfield authored and aileen committed Jun 10, 2019
1 parent 99064b8 commit c9879b1
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 99 deletions.
115 changes: 75 additions & 40 deletions lib/amperize.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ var EventEmitter = require('events').EventEmitter,
uuid = require('uuid'),
async = require('async'),
url = require('url'),
got = require('got'),
request = require('request-promise'),
probeImageSize = require('probe-image-size'),
_ = require('lodash'),
sizeOf = require('image-size'),
validator = require('validator'),
Expand Down Expand Up @@ -105,6 +106,16 @@ Amperize.prototype.amperizer = function amperizer(id, error, dom) {
*/
Amperize.prototype.traverse = function traverse(data, html, done) {
var self = this;
var imageSizeCache = {};
var timeout = 3000;
var requestOptions = {
// We need the user-agent, otherwise some https request may fail (e. g. cloudfare)
headers: {
'User-Agent': 'Mozilla/5.0 Safari/537.36'
},
timeout: timeout,
encoding: null
};

async.reduce(data, html, function reduce(html, element, step) {
var children;
Expand Down Expand Up @@ -158,62 +169,86 @@ Amperize.prototype.traverse = function traverse(data, html, done) {
return enter();
}

// probe will fetch the minimal amount of data needed to determine
// the image dimensions so it's more performant than a full fetch
/**
 * Ask `probeImageSize` for the dimensions of the image at `url`, passing
 * the shared `requestOptions`, and remember the answer in `imageSizeCache`.
 *
 * @param {String} url image location, also used as the cache key
 * @return {Promise} resolves with the probe result; a rejection from
 *                   `probeImageSize` is passed through and nothing is cached
 */
function _probeImageSize(url) {
    // store the result under its url before handing it on unchanged
    function rememberDimensions(dimensions) {
        imageSizeCache[url] = dimensions;
        return dimensions;
    }

    return probeImageSize(url, requestOptions).then(rememberDimensions);
}

// fetch the full image before reading dimensions using image-size,
// it's slower but has better format support
/**
 * Request the image at `url`, hand the response to `sizeOf` and remember
 * the measured dimensions in `imageSizeCache`.
 *
 * @param {String} url image location, also used as the cache key
 * @return {Promise} resolves with the value returned by `sizeOf`; rejects
 *                   if the request fails or `sizeOf` throws
 */
function _fetchImageSize(url) {
    // merge into a fresh object so the shared requestOptions is never mutated;
    // `encoding: null` is forced here regardless of what the shared options say
    // (presumably so the body arrives as raw bytes - confirm against request docs)
    var fetchOptions = Object.assign({}, requestOptions, {encoding: null});

    // measure the response, store the result under its url, pass it on
    function measureAndRemember(body) {
        var dimensions = sizeOf(body);
        imageSizeCache[url] = dimensions;
        return dimensions;
    }

    return request(url, fetchOptions).then(measureAndRemember);
}

// select appropriate method to get image size
/**
 * Resolve the dimensions of the image at `url`, picking the cheapest
 * strategy that can handle it:
 *
 * 1. a previously cached result for this exact url
 * 2. a full download (`_fetchImageSize`) for formats we can't probe
 * 3. a partial probe (`_probeImageSize`) for everything else
 *
 * The extension is read from the url with any `?query` or `#fragment`
 * removed and is compared case-insensitively, so `favicon.ICO` and
 * `favicon.ico?v=2` are routed to the full download as well.
 *
 * @param {String} url image location, also used as the cache key
 * @return {Promise} resolves with the cached value or whatever the chosen
 *                   helper resolves with
 */
function _getImageSize(url) {
    // use cached image size if we've already seen this url
    if (imageSizeCache[url]) {
        return Promise.resolve(imageSizeCache[url]);
    }

    // the extension sits at the end of the path, so drop `?query` / `#hash` first
    var path = url.split(/[?#]/)[0];
    var [, extension = ''] = path.match(/\.([a-zA-Z]{3,4})$/) || [];

    // fetch full image for formats we can't probe
    if (['cur', 'icns', 'ico', 'dds'].includes(extension.toLowerCase())) {
        return _fetchImageSize(url);
    }

    // probe partial image for everything else
    return _probeImageSize(url);
}

/**
* Get the image sizes (width and heigth plus type of image)
* Get the image sizes (width and height plus type of image)
*
* https://github.com/image-size/image-size
* https://github.com/nodeca/probe-image-size
*
* @param {Object} element
* @return {Object} element incl. width and height
*/
function getImageSize(element) {
var imageObj = url.parse(element.attribs.src),
requestOptions,
timeout = 3000;
var imageObj = url.parse(element.attribs.src);

if (!validator.isURL(imageObj.href)) {
// revert this element, do not show
element.name = 'img';

return enter();
}

// We need the user-agent, otherwise some https request may fail (e. g. cloudfare)
requestOptions = {
headers: {
'User-Agent': 'Mozilla/5.0 Safari/537.36'
},
timeout: timeout,
retry: 0,
encoding: null
};

return got(
imageObj.href,
requestOptions
).then(function (response) {
try {
// Using the Buffer rather than an URL requires to use sizeOf synchronously.
// See https://github.com/image-size/image-size#asynchronous
var dimensions = sizeOf(response.body);

// CASE: `.ico` files might have multiple images and therefore multiple sizes.
// We return the largest size found (image-size default is the first size found)
if (dimensions.images) {
dimensions.width = _.maxBy(dimensions.images, function (w) {return w.width;}).width;
dimensions.height = _.maxBy(dimensions.images, function (h) {return h.height;}).height;
}

element.attribs.width = dimensions.width;
element.attribs.height = dimensions.height;

return getLayoutAttribute(element);
} catch (err) {
// revert this element, do not show
return _getImageSize(imageObj.href).then(function (result) {
if ((!result.width || !result.height) && !result.images) {
element.name = 'img';
return enter();
}
}).catch(function () {

// CASE: `.ico` files might have multiple images and therefore multiple sizes.
// We return the largest size found (image-size default is the first size found)
if (result.images) {
result.width = _.maxBy(result.images, function (w) {return w.width;}).width;
result.height = _.maxBy(result.images, function (h) {return h.height;}).height;
}

element.attribs.width = result.width;
element.attribs.height = result.height;

return getLayoutAttribute(element);
}).catch(function (err) {
// revert this element, do not show
element.name = 'img';
return enter();
Expand Down
12 changes: 7 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@
"dependencies": {
"async": "^2.1.4",
"emits": "^3.0.0",
"got": "^9.6.0",
"htmlparser2": "^3.9.2",
"image-size": "0.6.1",
"image-size": "^0.7.4",
"lodash": "^4.17.4",
"probe-image-size": "^4.0.0",
"request": "^2.83.0",
"request-promise": "^4.2.4",
"uuid": "^3.0.0",
"validator": "^9.1.1"
},
Expand All @@ -42,11 +44,11 @@
"cz-conventional-changelog": "2.1.0",
"istanbul": "^0.4.5",
"mocha": "^4.0.1",
"nock": "^9.0.2",
"rewire": "^2.5.2",
"semantic-release": "9.1.0",
"sinon": "1.17.7",
"sinon-chai": "^2.8.0",
"nock": "^9.0.2",
"rewire": "^2.5.2"
"sinon-chai": "^2.8.0"
},
"config": {
"commitizen": {
Expand Down
Loading

0 comments on commit c9879b1

Please sign in to comment.